# JUST RUN THIS

from google.colab import drive
import pandas as pd

# Mount Google Drive so the dataset stored under MyDrive can be read.
drive.mount('/content/gdrive')

# Load and clean the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/air_passengers.csv')
df.rename(columns={'#Passengers': 'Passengers'}, inplace=True)
df['Month'] = pd.to_datetime(df['Month'])

# Create our feature: months since start.
# The origin is the earliest month present in the data rather than a
# hard-coded January 1949, so the first row is always 0 even if the file
# starts at a different date. For a series starting in 1949-01 the values
# are unchanged.
first_month = df['Month'].min()
df['Month_Count'] = (
    (df['Month'].dt.year - first_month.year) * 12
    + (df['Month'].dt.month - first_month.month)
)

# Look at our data
print(df.head())
print(f"\nWe have {len(df)} months of data")


# JUST RUN THIS

# Build the inputs sklearn expects.
# Selecting with a list of column names (double brackets) keeps X as a
# 2D DataFrame; selecting a single name gives the 1D label Series.
y = df['Passengers']     # Labels
X = df[['Month_Count']]  # Features - notice the double brackets!

print(f"X shape: {X.shape}")  # Should be (144, 1)
print(f"y shape: {y.shape}")  # Should be (144,)

# Chronological split: rows dated before 1958 are used for training,
# rows from 1958 onwards are held out for testing.
train_mask = df['Month'] < '1958-01-01'
test_mask = df['Month'] >= '1958-01-01'

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


# EDIT THIS

from sklearn.linear_model import LinearRegression

def train_model(X_train, y_train):
    """Fit a linear regression model to the training data.

    Args:
        X_train: DataFrame of features (2D, one row per sample).
        y_train: Series of labels, one per row of ``X_train``.

    Returns:
        The fitted ``LinearRegression`` model, exposing ``coef_`` and
        ``intercept_``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

# Train the model using your function
# (train_model must return the fitted model; if it returns None, the
# attribute lookups below raise AttributeError)
model = train_model(X_train, y_train)

# Check what the model learned
# X has a single feature column, so coef_[0] is the slope of the line and
# intercept_ is its value at x = 0
print(f"Slope: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Line equation: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")


# EDIT THIS

def make_predictions(model, X_values):
    """Predict a label for every row of ``X_values``.

    Args:
        model: A trained model exposing ``predict(X)``.
        X_values: DataFrame of features.

    Returns:
        A Series of predictions. When ``X_values`` has an index (e.g. a
        DataFrame), the result carries the same index so it lines up with
        the original rows; otherwise a default integer index is used.
    """
    predictions = model.predict(X_values)
    # Reuse the input's index so boolean masks and column assignment built
    # from the source DataFrame align with the predictions.
    return pd.Series(predictions, index=getattr(X_values, "index", None))

# Test your function
# Predict for every row of X (training and test months alike)
predictions = make_predictions(model, X)
# Wrapping in pd.Series lets .head() work whether make_predictions returned
# an array or a Series
print(f"First 5 predictions: {pd.Series(predictions).head()}")


# JUST RUN THIS

import matplotlib.pyplot as plt

# Store the model output next to the observed values.
df['Predictions'] = predictions

# Draw the observed series, then overlay the predictions on the same axes.
ax = df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))
df.plot(x='Month', y='Predictions', ax=ax, color='red', label='Predicted')

# Mark the date where the training period ends and the test period begins.
ax.axvline(pd.to_datetime('1958-01-01'), color='green', linestyle='--', label='Train/Test Split')
ax.set_title('Linear Regression Predictions')
ax.legend()
plt.show()

# EDIT THIS

def calculate_test_mse(y_true, predictions):
    """Return the mean squared error between actual and predicted values.

    Values are compared by position, not by index label, so a Series and an
    array (or two Series with different indexes) are handled the same way.

    Args:
        y_true: Actual values (Series or other 1D sequence).
        predictions: Predicted values (Series or other 1D sequence) of the
            same length as ``y_true``.

    Returns:
        The MSE as a single float.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Convert to plain arrays first: subtracting two Series aligns on index
    # labels and would yield NaN wherever the labels do not match.
    actual = pd.Series(y_true).to_numpy(dtype=float)
    predicted = pd.Series(predictions).to_numpy(dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and predictions must have the same length, "
            f"got {len(actual)} and {len(predicted)}"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MSE of empty inputs")

    errors = actual - predicted
    return float((errors ** 2).mean())

# Calculate MSE on test data only
# test_mask is the boolean mask from the train/test split (months from 1958
# onwards), so this keeps only the predictions for the held-out rows
test_predictions = predictions[test_mask]
test_mse = calculate_test_mse(y_test, test_predictions)
print(f"Test MSE: {test_mse:.2f}")

# BONUS CODE HERE