#!/usr/bin/env python
# coding: utf-8
"""Train an XGBoost regressor on the Los Angeles 2022 Airbnb listings.

Pipeline (runs top to bottom as a script / notebook export):

1. Load ``LosAngeles_2022.csv`` and drop identifier / free-text columns.
2. Fill missing values, filter price outliers and log-transform ``price``
   and ``minimum_nights``.
3. Add indicator features and one-hot encode the categorical columns.
4. Split into train/test, fit a ``RobustScaler`` on the training part only.
5. Cross-validate and fit an ``XGBRegressor``; print a one-row results table.
6. Pickle the fitted scaler and model under ``./pickles/``.
"""

# In[ ]:

import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

warnings.filterwarnings("ignore", category=FutureWarning)


# In[ ]:

Combined_data = pd.read_csv('LosAngeles_2022.csv')
# Combined_data['last_review'] = pd.to_datetime(Combined_data['last_review'], infer_datetime_format=True)
Combined_data.drop(
    ['host_id', 'id', 'host_name', 'name', 'last_review', 'neighbourhood',
     'license', 'number_of_reviews_ltm'],
    axis=1,
    inplace=True,
)

# fill NAs
Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].fillna(0)
Combined_data['neighbourhood_group'] = Combined_data['neighbourhood_group'].fillna('unknown')

# remove outliers and log transformation
# Keep only listings with 3 < log1p(price) < 8.  The explicit .copy() makes the
# filtered frame independent, so the column assignments below are not chained
# assignments on a slice (avoids pandas' SettingWithCopyWarning).
log_price = np.log1p(Combined_data['price'])
Combined_data = Combined_data[(log_price > 3) & (log_price < 8)].copy()
Combined_data['price'] = np.log1p(Combined_data['price'])

# Values of reviews_per_month that are not below 17.5 are replaced by 0.
# NOTE(review): these rows are subsequently flagged as no_reviews == 1 below;
# confirm this is intended rather than clipping or dropping them.
Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].where(
    Combined_data['reviews_per_month'] < 17.5, 0
)
Combined_data['minimum_nights'] = np.log1p(Combined_data['minimum_nights'])

# segment numeric variable
Combined_data['all_year_avail'] = 1 * (Combined_data['availability_365'] > 353)
Combined_data['low_avail'] = 1 * (Combined_data['availability_365'] < 12)
Combined_data['no_reviews'] = 1 * (Combined_data['reviews_per_month'] == 0)

# Combined_data['room_type'] = Combined_data['room_type'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
# Combined_data['room_type'] = (Combined_data['room_type']).str.replace(' ', '_')
print(np.unique(Combined_data['room_type']))
print(Combined_data.shape)

# one hot encode categorical variables
categorical_features = Combined_data.select_dtypes(include=['object'])
print(categorical_features.columns)
print(categorical_features.shape)
categorical_features_one_hot = pd.get_dummies(categorical_features)

# select numerical variables
numerical_features = Combined_data.select_dtypes(exclude=['object'])
print(numerical_features.columns)
print(numerical_features.shape)

y = numerical_features.price
numerical_features = numerical_features.drop(['price'], axis=1)

# X is the plain array fed to the model; X_df carries the same columns with
# their names for inspection.
X = np.concatenate((numerical_features, categorical_features_one_hot), axis=1)  # no column names
X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1)  # with column names
print(X_df.shape)
print(X_df.columns)
# Processed_data = pd.concat([X_df, y], axis = 1)
# Processed_data.to_csv('Airbnb_LA_Processed.dat')


# In[ ]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Dimensions of the training feature matrix: {}'.format(X_train.shape))
print('Dimensions of the training target vector: {}'.format(y_train.shape))
print('Dimensions of the test feature matrix: {}'.format(X_test.shape))
print('Dimensions of the test target vector: {}'.format(y_test.shape))


# In[ ]:

# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the fit.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# In[ ]:

# Create the output directory up front; open(..., 'wb') does not create
# missing parent directories.
os.makedirs('./pickles', exist_ok=True)
with open('./pickles/scaler', 'wb') as file:
    pickle.dump(scaler, file)


# In[ ]:

# aim to report the error metric over 5-fold cross validation
n_folds = 5


def rmse_cv(model, X_train=X_train):
    """Return the per-fold cross-validation scores of *model*.

    Despite the name, the scores are *negative mean squared errors* (the
    ``'neg_mean_squared_error'`` scorer); callers negate them to obtain the
    MSE per fold.

    Args:
        model: An unfitted scikit-learn compatible estimator.
        X_train: Feature matrix to cross-validate on. Defaults to the scaled
            training matrix that exists when this function is defined.

    Returns:
        Array of ``n_folds`` negative-MSE scores, evaluated against the
        module-level ``y_train``.
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it would
    # return the plain integer n_folds, and cross_val_score would then use
    # unshuffled folds, ignoring shuffle/random_state.
    kf = KFold(n_folds, shuffle=True, random_state=2022)
    return cross_val_score(model, X_train, y_train,
                           scoring='neg_mean_squared_error', cv=kf)


# In[ ]:

# 'early_stopping' is not an XGBRegressor parameter, so it is not passed here.
# Real early stopping (early_stopping_rounds) requires an eval_set at fit
# time, which cross_val_score does not supply.
xbgreg_best = XGBRegressor(n_estimators=1000,
                           # learning_rate=0.1,
                           max_depth=9,
                           min_child_weight=5)
xbgreg_CV_best = -rmse_cv(xbgreg_best)  # positive MSE per fold
xbgreg_best.fit(X_train, y_train)
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)

# All "error" columns are mean squared errors on the log1p(price) scale.
xgb_best_results = pd.DataFrame({
    'algorithm': ['XGBRegressor'],
    'CV error': [xbgreg_CV_best.mean()],
    'CV std': [xbgreg_CV_best.std()],
    'training error': [mean_squared_error(y_train, y_train_xgbreg)],
    'test error': [mean_squared_error(y_test, y_test_xgbreg)],
    'training_r2_score': [r2_score(y_train, y_train_xgbreg)],
    'test_r2_score': [r2_score(y_test, y_test_xgbreg)],
})
# A bare expression only displays in a notebook; print so the table is also
# shown when this file runs as a script.
print(xgb_best_results)


# In[ ]:

with open('./pickles/model', 'wb') as file:
    pickle.dump(xbgreg_best, file)