# JUST RUN THIS
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Year that Month_Count is measured from (month 0 is January of this year).
START_YEAR = 1949
# Rows dated before this are training data; rows on or after it are test data.
SPLIT_DATE = pd.Timestamp('1958-01-01')

drive.mount('/content/gdrive')

# Load and clean the data
df = pd.read_csv('/content/gdrive/MyDrive/datasets/air_passengers.csv')
df.rename(columns={'#Passengers': 'Passengers'}, inplace=True)
df['Month'] = pd.to_datetime(df['Month'])

# Create our feature: months since start
df['Month_Count'] = (df['Month'].dt.year - START_YEAR) * 12 + df['Month'].dt.month - 1

# Look at our data
print(f"\nWe have {len(df)} months of data")
# A bare `df.head()` only renders when it is the last expression of a notebook
# cell, so display it explicitly.
display(df.head())
df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))


# JUST RUN THIS
# Create X and y for sklearn
# X needs to be 2D (that's why we use double brackets)
X = df[['Month_Count']]  # Features - notice the double brackets!
y = df['Passengers']  # Labels

print(f"X shape: {X.shape}")  # Should be (144, 1)
print(f"y shape: {y.shape}")  # Should be (144,)

# Split into training and testing sets
# Everything before SPLIT_DATE (1958) is training
train_mask = df['Month'] < SPLIT_DATE
test_mask = df['Month'] >= SPLIT_DATE

X_train = X[train_mask]
y_train = y[train_mask]
X_test = X[test_mask]
y_test = y[test_mask]

print(f"\nTraining samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


# EDIT THIS
def train_model(X_train, y_train):
    """Fit a LinearRegression on the training data and return the fitted model.

    Args:
        X_train: 2D feature table (one row per sample).
        y_train: Target values, one per row of ``X_train``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model


# Train the model using your function
model = train_model(X_train, y_train)

# Check what the model learned
print(f"Slope: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Line equation: y = {model.coef_[0]:.2f}x + {model.intercept_:.2f}")


# EDIT THIS
def make_predictions(model, X):
    """Return the model's predictions for the feature table ``X``.

    Args:
        model: A fitted estimator exposing ``predict``.
        X: 2D feature table with the same columns the model was trained on.

    Returns:
        Whatever ``model.predict(X)`` returns (one prediction per row).
    """
    return model.predict(X)


# Test your function
y_pred = make_predictions(model, X_test)
# Turn into a pandas series and set the index to y_test's for plotting purposes
y_pred = pd.Series(y_pred, index=y_test.index)
# Show the first 5
display(y_pred.head())


# JUST RUN THIS
def plot_predictions(df, column, label, color, title=None):
    """Plot actual passengers against one prediction column of ``df``.

    Draws the 'Passengers' series, overlays ``df[column]``, and marks
    SPLIT_DATE with a dashed green vertical line.

    Args:
        df: Dataframe with 'Month', 'Passengers' and ``column`` columns.
        column: Name of the prediction column to overlay.
        label: Legend label for the prediction line.
        color: Matplotlib colour for the prediction line.
        title: Optional plot title; omitted when ``None``.

    Returns:
        The matplotlib axes the lines were drawn on.
    """
    ax = df.plot(x='Month', y='Passengers', label='Actual', figsize=(10, 6))
    df.plot(x='Month', y=column, ax=ax, color=color, label=label)
    # Add a line showing train/test split
    plt.axvline(SPLIT_DATE, color='green', linestyle='--', label='Train/Test Split')
    if title is not None:
        plt.title(title)
    plt.legend()
    plt.show()
    return ax


# Add predictions to our dataframe (rows outside the test set become NaN
# because assignment aligns on the index)
df['Predictions'] = y_pred

# Plot actual vs predicted
ax = plot_predictions(
    df,
    'Predictions',
    label='Predicted',
    color='red',
    title='Linear Regression Predictions',
)


# EDIT THIS
def calculate_test_mse(y_test, y_pred):
    """Return the mean squared error between ``y_test`` and ``y_pred``.

    The computation is vectorised. When both arguments are pandas Series the
    subtraction aligns on index labels, so they should share the same index.

    Args:
        y_test: True target values.
        y_pred: Predicted values, one per entry of ``y_test``.

    Returns:
        The mean of the squared differences.
    """
    squared_errors = (y_test - y_pred) ** 2
    return squared_errors.mean()


# Calculate MSE on test data only
y_pred = df["Predictions"][test_mask]
test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")


# BONUS CODE HERE
def build_poly_model(degree=2):
    """Return an unfitted pipeline: polynomial feature expansion, then LinearRegression.

    Args:
        degree: Degree passed to ``PolynomialFeatures``.
    """
    return Pipeline([
        ('poly', PolynomialFeatures(degree=degree)),
        ('linear', LinearRegression()),
    ])


# This creates a model that can fit curves!
poly_model = build_poly_model()
poly_model.fit(X_train, y_train)
y_pred = pd.Series(poly_model.predict(X_test), index=y_test.index)

# Plot the curved predictions
df['Poly_Predictions'] = y_pred
ax = plot_predictions(df, 'Poly_Predictions', label='Polynomial', color='green')

test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")


def make_features(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with yearly and seasonal features added.

    Expects a 'Month_Count' column holding months since the start of the
    data. In the returned frame 'Month_Count' is replaced by the month of the
    year (0-11) and these columns are added: 'Year_Count', 'Month_sin',
    'Month_cos', 'Year_sin', 'Year_cos'.

    Args:
        X: Feature table containing 'Month_Count'. It is not modified.

    Returns:
        A new dataframe with the engineered columns.
    """
    # Copy the dataframe to avoid side-effects on the caller's data
    X = X.copy()

    # Year_Count is how many whole years have passed since month 0
    # (January of START_YEAR, i.e. 1949)
    X["Year_Count"] = X["Month_Count"] // 12

    # Instead of Month_Count being the number of months since the start,
    # we'll make it which month of the year it is, 0-11.
    X["Month_Count"] = X["Month_Count"] % 12

    # We'll calculate both a sin and cos of Month_Count so the yearly cycle
    # is encoded as a point on a circle
    angle = 2 * np.pi * X["Month_Count"] / 12
    X["Month_sin"] = np.sin(angle)
    X["Month_cos"] = np.cos(angle)

    # And an interaction feature of the Year with Month_sin/cos to
    # capture how the size of the curve gets larger with year.
    X["Year_sin"] = X["Year_Count"] * X["Month_sin"]
    X["Year_cos"] = X["Year_Count"] * X["Month_cos"]

    return X


new_X_train = make_features(X_train)
new_X_test = make_features(X_test)

poly_model = build_poly_model()
poly_model.fit(new_X_train, y_train)
y_pred = pd.Series(poly_model.predict(new_X_test), index=y_test.index)

# Plot the feature-engineered predictions
df['Feature_Engineered_Predictions'] = y_pred
ax = plot_predictions(
    df,
    'Feature_Engineered_Predictions',
    label='Polynomial',
    color='green',
)

test_mse = calculate_test_mse(y_test, y_pred)
print(f"Test MSE: {test_mse:.2f}")