"""Fit a straight-line trend to the monthly air-passenger data.

The script loads ``air_passengers.csv`` from a mounted Google Drive, builds a
single "months since start" feature, fits a linear regression on every month
before ``SPLIT_DATE``, plots actual vs. predicted passenger counts and prints
the mean squared error on the held-out months.

The three helper functions are importable on their own; the Drive mount, CSV
load and plotting only happen when the file is run as a script or notebook.
"""
import numpy as np
import pandas as pd

# First month of the test period; every earlier month is training data.
SPLIT_DATE = '1958-01-01'


# EDIT THIS
def train_model(X_train, y_train):
    """Create and fit a linear regression model.

    Input:  X_train is a DataFrame of features
            y_train is a Series of labels
    Output: Returns a trained model
    """
    # Imported here rather than at the top of the file so the other helpers
    # can be imported and tested without scikit-learn installed.
    from sklearn.linear_model import LinearRegression

    model = LinearRegression()
    model.fit(X_train, y_train)
    return model


# EDIT THIS
def make_predictions(model, X_values):
    """Run the model over X_values.

    Input:  model is a trained sklearn model
            X_values is a DataFrame of features
    Output: Returns a Series of predictions
    """
    predictions = model.predict(X_values)
    # Carry over the input's index (when it has one) so that boolean masks
    # and DataFrame column assignment line up with the original rows.
    return pd.Series(predictions, index=getattr(X_values, "index", None))


# EDIT THIS
def calculate_test_mse(y_true, predictions):
    """Compute the mean squared error between actual and predicted values.

    Input:  y_true is actual values (Series)
            predictions is predicted values (Series)
    Output: Returns the MSE (a single number)
    Raises: ValueError if the inputs are empty or differ in length
    """
    # Compare by position, not by index label: subtracting two Series with
    # different indexes would silently produce NaN.
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(predictions, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            f"y_true has {actual.size} values but predictions has {predicted.size}"
        )
    if actual.size == 0:
        raise ValueError("cannot compute MSE of empty input")

    errors = actual - predicted
    return float(np.mean(errors ** 2))


if __name__ == "__main__":
    # JUST RUN THIS
    import matplotlib.pyplot as plt
    from google.colab import drive

    drive.mount('/content/gdrive')

    # Load and clean the data
    df = pd.read_csv('/content/gdrive/MyDrive/datasets/air_passengers.csv')
    df.rename(columns={'#Passengers': 'Passengers'}, inplace=True)
    df['Month'] = pd.to_datetime(df['Month'])

    # Create our feature: months since start
    df['Month_Count'] = (df['Month'].dt.year - 1949) * 12 + df['Month'].dt.month - 1

    # Look at our data
    print(df.head())
    print(f"\nWe have {len(df)} months of data")

    # JUST RUN THIS
    # Create X and y for sklearn
    # X needs to be 2D (that's why we use double brackets)
    X = df[['Month_Count']]  # Features - notice the double brackets!
    y = df['Passengers']     # Labels

    print(f"X shape: {X.shape}")  # Should be (144, 1)
    print(f"y shape: {y.shape}")  # Should be (144,)

    # Split into training and testing sets
    # Everything before 1958 is training
    train_mask = df['Month'] < SPLIT_DATE
    test_mask = df['Month'] >= SPLIT_DATE

    X_train = X[train_mask]
    y_train = y[train_mask]
    X_test = X[test_mask]
    y_test = y[test_mask]

    print(f"\nTraining samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")

    # Train the model using your function
    model = train_model(X_train, y_train)

    # Check what the model learned
    print(f"Slope: {model.coef_[0]:.2f}")
    print(f"Intercept: {model.intercept_:.2f}")
    print(f"Line equation: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")

    # Test your function
    predictions = make_predictions(model, X)
    print(f"First 5 predictions: {pd.Series(predictions).head()}")

    # JUST RUN THIS
    # Add predictions to our dataframe
    df['Predictions'] = predictions

    # Plot actual vs predicted
    ax = df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))
    df.plot(x='Month', y='Predictions', ax=ax, color='red', label='Predicted')

    # Add a line showing train/test split
    plt.axvline(pd.to_datetime(SPLIT_DATE), color='green', linestyle='--',
                label='Train/Test Split')
    plt.title('Linear Regression Predictions')
    plt.legend()
    plt.show()

    # Calculate MSE on test data only
    test_predictions = predictions[test_mask]
    test_mse = calculate_test_mse(y_test, test_predictions)
    print(f"Test MSE: {test_mse:.2f}")

    # BONUS CODE HERE