# JUST RUN THIS

from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset stored under MyDrive can be read.
drive.mount('/content/gdrive')

# Load and clean the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/air_passengers.csv')
# Drop the leading '#' from the column name so it is easier to reference.
df.rename(columns={'#Passengers': 'Passengers'}, inplace=True)
# Parse the Month column into datetimes; the .dt accessors below and the
# date comparisons used for the train/test split depend on this.
df['Month'] = pd.to_datetime(df['Month'])

# Create our feature: months since start
# Month_Count is 0 for January 1949 and increases by 1 per calendar month.
# NOTE(review): 1949 is hard-coded — presumably the first year in the CSV; confirm.
df['Month_Count'] = (df['Month'].dt.year - 1949) * 12 + df['Month'].dt.month - 1

# Look at our data
print(f"\nWe have {len(df)} months of data")
# NOTE(review): a bare expression is only rendered when it is the last
# statement of a notebook cell — confirm this preview is actually shown.
df.head()


# Plot the raw series: passengers over time.
df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))


# JUST RUN THIS

# Create X and y for sklearn
# X needs to be 2D (that's why we use double brackets)
X = df[['Month_Count']]  # Features - notice the double brackets!
y = df['Passengers']     # Labels

print(f"X shape: {X.shape}")  # Should be (144, 1)
print(f"y shape: {y.shape}")  # Should be (144,)

# Split into training and testing sets
# Everything before 1958 is training; 1958-01-01 onward is testing.
# The split is chronological (no shuffling): the masks are built from the
# Month column, so the test set is the most recent part of the series.
train_mask = df['Month'] < '1958-01-01'
test_mask = df['Month'] >= '1958-01-01'

# Boolean-mask indexing keeps the original row labels from df.
X_train = X[train_mask]
y_train = y[train_mask]
X_test = X[test_mask]
y_test = y[test_mask]

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


# EDIT THIS

from sklearn.linear_model import LinearRegression

def train_model(X_train, y_train):
    """Fit a linear regression on the training data and return it.

    Args:
        X_train: 2D feature table, one row per training sample.
        y_train: Target values, one per row of ``X_train``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return LinearRegression().fit(X_train, y_train)

# Train the model using your function
model = train_model(X_train, y_train)

# Check what the model learned
# coef_[0] is the coefficient of the only feature column (Month_Count),
# i.e. the fitted change in passengers per additional month.
print(f"Slope: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Line equation: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")


# EDIT THIS

def make_predictions(model, X):
    """Return the predictions ``model`` produces for the rows of ``X``.

    Args:
        model: Any fitted object exposing a ``predict(X)`` method.
        X: Feature rows to predict for.

    Returns:
        Whatever ``model.predict(X)`` returns, unchanged.
    """
    return model.predict(X)

# Test your function
y_pred = make_predictions(model, X_test)

# Turn into a pandas series and set the index to y_test's for plotting purposes
# (sharing y_test's row labels lets the predictions line up with the
# matching rows when they are later assigned into df).
y_pred = pd.Series(y_pred, index=y_test.index)

# Show the first 5
# NOTE(review): `display` is not imported in this file — presumably the
# notebook built-in; confirm before running as a plain script.
display(y_pred.head())

# JUST RUN THIS
import matplotlib.pyplot as plt

# Add predictions to our dataframe
# Assigning a Series aligns on the index: y_pred only has the test rows'
# labels, so the remaining rows of the new column are NaN.
df['Predictions'] = y_pred

# Plot actual vs predicted
ax = df.plot(x='Month', y="Passengers", label='Actual', figsize=(10, 6))
# Draw on the same axes so both lines share one figure.
df.plot(x='Month', y='Predictions', ax=ax, color='red', label='Predicted')

# Add a line showing train/test split
plt.axvline(pd.to_datetime('1958-01-01'), color='green', linestyle='--', label='Train/Test Split')
plt.title('Linear Regression Predictions')
plt.legend()
plt.show()

# EDIT THIS

def calculate_test_mse(y_test, y_pred):
    """Return the mean squared error between true and predicted values.

    The inputs are combined with ``-`` directly, so their own arithmetic
    rules apply: pandas Series are matched by index label rather than by
    position, while NumPy arrays are compared element by element.

    Args:
        y_test: True target values (pandas Series or NumPy array).
        y_pred: Predicted values, compatible with ``y_test`` under ``-``.

    Returns:
        The mean of the squared differences.
    """
    squared_errors = (y_test - y_pred) ** 2
    return squared_errors.mean()

# Calculate MSE on test data only
# Select just the test rows of the Predictions column; the rows outside
# the mask are excluded from the error.
y_pred = df["Predictions"][test_mask]
test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")

# BONUS CODE HERE

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# This creates a model that can fit curves!
# The pipeline first expands the inputs into degree-2 polynomial features
# and then fits a linear regression on the expanded features.
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])

poly_model.fit(X_train, y_train)
# Reuse y_test's index so the predictions line up with the test rows of df.
y_pred = pd.Series(poly_model.predict(X_test), index=y_test.index)

# Plot the curved predictions
df['Poly_Predictions'] = y_pred
ax = df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))
# NOTE(review): the prediction line and the split marker below are both
# green, unlike the earlier plot where predictions were red — confirm intended.
df.plot(x='Month', y='Poly_Predictions', ax=ax, color='green', label='Polynomial')
plt.axvline(pd.to_datetime('1958-01-01'), color='green', linestyle='--', label='Train/Test Split')
plt.legend()
plt.show()

# Score the polynomial model on the same test rows as the linear model.
test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")

import numpy as np

def make_features(X):
    """Derive yearly and seasonal features from a ``Month_Count`` column.

    The input is copied, so the caller's dataframe is left untouched.

    Args:
        X: DataFrame with a ``Month_Count`` column holding the number of
            months elapsed since the start of the series.

    Returns:
        A new DataFrame in which ``Month_Count`` is replaced by the month
        of the year (0-11) and these columns are appended, in order:
        ``Year_Count`` (whole years elapsed), ``Month_sin`` / ``Month_cos``
        (the month of the year mapped onto a circle), and ``Year_sin`` /
        ``Year_cos`` (the seasonal terms multiplied by ``Year_Count``).
    """
    features = X.copy()

    # Split the running month counter into whole years and month-of-year.
    months_elapsed = features["Month_Count"]
    years_elapsed = months_elapsed // 12
    month_of_year = months_elapsed % 12

    # Map the month onto a circle so month 11 sits next to month 0.
    angle = 2 * np.pi * month_of_year / 12
    seasonal_sin = np.sin(angle)
    seasonal_cos = np.cos(angle)

    features["Year_Count"] = years_elapsed
    features["Month_Count"] = month_of_year
    features["Month_sin"] = seasonal_sin
    features["Month_cos"] = seasonal_cos

    # Interaction terms: let the size of the seasonal swing grow with the year.
    features["Year_sin"] = years_elapsed * seasonal_sin
    features["Year_cos"] = years_elapsed * seasonal_cos

    return features

# Build the engineered feature tables for both splits with the same function
# so the training and test columns match.
new_X_train = make_features(X_train)
new_X_test  = make_features(X_test)

# Same degree-2 polynomial pipeline as before, now fitted on the
# engineered features instead of the single Month_Count column.
poly_model = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('linear', LinearRegression())
])
poly_model.fit(new_X_train, y_train)

# Reuse y_test's index so the predictions line up with the test rows of df.
y_pred = pd.Series(poly_model.predict(new_X_test), index=y_test.index)

# Plot the curved predictions
df['Feature_Engineered_Predictions'] = y_pred
ax = df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))
# NOTE(review): this line reuses the 'Polynomial' label and the same green
# as the split marker below — confirm that is intended.
df.plot(x='Month', y='Feature_Engineered_Predictions', ax=ax, color='green', label='Polynomial')
plt.axvline(pd.to_datetime('1958-01-01'), color='green', linestyle='--', label='Train/Test Split')
plt.legend()
plt.show()

# Score the feature-engineered model on the same test rows.
test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")