#!/usr/bin/env python
# coding: utf-8

# In[ ]:


import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import RobustScaler
from xgboost import XGBRegressor

warnings.filterwarnings("ignore", category=FutureWarning)


# In[ ]:


# Load the raw listings export and discard columns that are not used as features.
Combined_data = pd.read_csv('LosAngeles_2022.csv')
# Combined_data['last_review'] = pd.to_datetime(Combined_data['last_review'], infer_datetime_format=True)
Combined_data = Combined_data.drop(
    columns=['host_id', 'id', 'host_name', 'name',
             'last_review', 'neighbourhood', 'license', 'number_of_reviews_ltm'],
)

# fill NAs
Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].fillna(0)
Combined_data['neighbourhood_group'] = Combined_data['neighbourhood_group'].fillna('unknown')

# remove outliers and log transformation
# Keep only rows whose log1p(price) lies strictly between 3 and 8. The explicit
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
log_price = np.log1p(Combined_data['price'])
Combined_data = Combined_data[(log_price > 3) & (log_price < 8)].copy()
Combined_data['price'] = np.log1p(Combined_data['price'])

# Values of 17.5 reviews per month or more are replaced with 0 (not dropped).
# NOTE(review): those rows are then also flagged by `no_reviews` below -
# confirm that this is intended.
Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].where(
    Combined_data['reviews_per_month'] < 17.5, 0
)
Combined_data['minimum_nights'] = np.log1p(Combined_data['minimum_nights'])

# segment numeric variable
# Each comparison yields booleans; multiplying by 1 turns them into 0/1 integers.
Combined_data['all_year_avail'] = 1 * (Combined_data['availability_365'] > 353)
Combined_data['low_avail'] = 1 * (Combined_data['availability_365'] < 12)
Combined_data['no_reviews'] = 1 * (Combined_data['reviews_per_month'] == 0)

# Combined_data['room_type'] = Combined_data['room_type'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
# Combined_data['room_type'] = (Combined_data['room_type']).str.replace(' ', '_')
print(np.unique(Combined_data['room_type']))

print(Combined_data.shape)

# Split the frame by dtype: object columns are treated as categorical and
# one-hot encoded, every other column is treated as numeric.
categorical_features = Combined_data.select_dtypes(include=['object'])
print(categorical_features.columns)
print(categorical_features.shape)
categorical_features_one_hot = pd.get_dummies(categorical_features)

numerical_features = Combined_data.select_dtypes(exclude=['object'])
print(numerical_features.columns)
print(numerical_features.shape)

# The price column is the regression target; take it out of the feature set.
y = numerical_features['price']
numerical_features = numerical_features.drop(columns=['price'])

# Build the feature matrix twice from the same two pieces: once as a labelled
# DataFrame and once as a plain array.
feature_blocks = [numerical_features, categorical_features_one_hot]
X_df = pd.concat(feature_blocks, axis=1)  # with column names
X = np.concatenate(feature_blocks, axis=1)  # no column names

print(X_df.shape)
print(X_df.columns)
# Processed_data = pd.concat([X_df, y], axis = 1)
# Processed_data.to_csv('Airbnb_LA_Processed.dat')


# In[ ]:


# Leftover notebook display cell: these bare expressions are evaluated and
# their values discarded, so they have no effect when the file runs as a script.
categorical_features_one_hot
numerical_features


# In[ ]:


# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each of the four pieces.
for message, split in (
    ('Dimensions of the training feature matrix: {}', X_train),
    ('Dimensions of the training target vector: {}', y_train),
    ('Dimensions of the test feature matrix: {}', X_test),
    ('Dimensions of the test target vector: {}', y_test),
):
    print(message.format(split.shape))


# In[ ]:


# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
scaler = RobustScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape


# In[ ]:


# Persist the fitted scaler. The output directory is created first so that
# opening the file does not fail with FileNotFoundError when ./pickles is missing.
os.makedirs('./pickles', exist_ok=True)
with open('./pickles/scaler', 'wb') as file:
    pickle.dump(scaler, file)


# In[ ]:


# Number of folds used for cross-validated error estimates.
n_folds = 5


def rmse_cv(model, X_train = X_train):
    """Cross-validate *model* on the training data with shuffled K-fold splits.

    Despite the name, the scores use scikit-learn's ``neg_mean_squared_error``
    scorer, so each entry is a *negated mean squared error* (not an RMSE);
    callers negate the result to obtain the MSE per fold.

    Args:
        model: Estimator to evaluate; passed straight to ``cross_val_score``.
        X_train: Feature matrix to cross-validate on. The default is the
            module-level ``X_train`` as it existed when this function was
            defined. The target is always the module-level ``y_train``.

    Returns:
        The array of per-fold scores returned by ``cross_val_score``
        (one entry per fold, ``n_folds`` in total).
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it would
    # return only the integer fold count, silently dropping the shuffle and
    # the fixed random_state configured here.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
    return cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=kf)


# In[ ]:


# Final XGBoost configuration; learning_rate is left at the library default.
# NOTE: the previous `early_stopping=5` argument was removed. It is not an
# XGBRegressor parameter (the real one is `early_stopping_rounds`, which also
# needs an `eval_set`), so it never enabled early stopping.
xbgreg_best = XGBRegressor(n_estimators=1000, max_depth=9, min_child_weight=5)

# rmse_cv returns negated MSE scores, so negating them yields the MSE per fold.
xbgreg_CV_best = -rmse_cv(xbgreg_best)

# Refit on the full training split and predict on both splits.
xbgreg_best.fit(X_train, y_train)
y_train_xgbreg = xbgreg_best.predict(X_train)
y_test_xgbreg = xbgreg_best.predict(X_test)

# One-row summary: cross-validation MSE (mean and std over folds), plus MSE
# and R^2 on the training and test splits.
xgb_best_results = pd.DataFrame({'algorithm': ['XGBRegressor'],
                                 'CV error': [xbgreg_CV_best.mean()],
                                 'CV std': [xbgreg_CV_best.std()],
                                 'training error': [mean_squared_error(y_train, y_train_xgbreg)],
                                 'test error': [mean_squared_error(y_test, y_test_xgbreg)],
                                 'training_r2_score': [r2_score(y_train, y_train_xgbreg)],
                                 'test_r2_score': [r2_score(y_test, y_test_xgbreg)]})
xgb_best_results


# In[ ]:


# Persist the trained model. The output directory is created first so that
# opening the file does not fail with FileNotFoundError when ./pickles is missing.
os.makedirs('./pickles', exist_ok=True)
with open('./pickles/model', 'wb') as file:
    pickle.dump(xbgreg_best, file)