# Load the raw listings file and immediately discard the columns that are
# dropped before any further processing. `last_review` is one of them, so the
# date parsing below stays disabled:
# Combined_data['last_review'] = pd.to_datetime(Combined_data['last_review'], infer_datetime_format=True)
Combined_data = pd.read_csv('LosAngeles_2022.csv').drop(
    columns=[
        'host_id', 'id', 'host_name', 'name',
        'last_review', 'neighbourhood', 'license', 'number_of_reviews_ltm',
    ]
)
# fill NAs: missing review rates become 0, missing neighbourhood groups get
# the placeholder label 'unknown'
Combined_data = Combined_data.assign(
    reviews_per_month=Combined_data['reviews_per_month'].fillna(0),
    neighbourhood_group=Combined_data['neighbourhood_group'].fillna('unknown'),
)
# remove outliers and log transformation
# Put price on the log1p scale once, then keep only rows strictly inside the
# (3, 8) band on that scale. Rows whose price is NaN fail both comparisons and
# are dropped as well. The explicit .copy() makes the filtered frame an
# independent object, so the column assignments below can never be treated as
# writes to a slice of the unfiltered frame.
Combined_data['price'] = np.log1p(Combined_data['price'])
Combined_data = Combined_data[
    (Combined_data['price'] > 3) & (Combined_data['price'] < 8)
].copy()
# Review rates of 17.5 or more (and any missing rates) are replaced by 0; the
# rows themselves are kept.
# NOTE(review): these zeroed rows are later indistinguishable from listings
# with no reviews (the `no_reviews` flag tests reviews_per_month == 0) --
# confirm this is intended rather than dropping or capping the outliers.
Combined_data['reviews_per_month'] = Combined_data['reviews_per_month'].where(
    Combined_data['reviews_per_month'] < 17.5, 0
)
Combined_data['minimum_nights'] = np.log1p(Combined_data['minimum_nights'])
# segment numeric variable: derive 0/1 indicator columns from fixed thresholds
# on availability_365 and reviews_per_month
Combined_data = Combined_data.assign(
    all_year_avail=Combined_data['availability_365'].gt(353).astype(int),
    low_avail=Combined_data['availability_365'].lt(12).astype(int),
    no_reviews=Combined_data['reviews_per_month'].eq(0).astype(int),
)
# Clean-up of the room_type labels is currently disabled:
# Combined_data['room_type'] = Combined_data['room_type'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x))
# Combined_data['room_type'] = (Combined_data['room_type']).str.replace(' ', '_')
# Report the distinct room types and the frame size after filtering.
print(np.unique(Combined_data['room_type']))
print(Combined_data.shape)
# one hot encode categorical variables
# Every object-dtype column is treated as categorical; the column names and
# shape are printed before the columns are expanded into dummy columns.
categorical_features = Combined_data.select_dtypes(include='object')
print(categorical_features.columns)
print(categorical_features.shape)
categorical_features_one_hot = pd.get_dummies(data=categorical_features)
# select numerical variables
# Everything that is not object-dtype counts as numerical.
numerical_features = Combined_data.select_dtypes(exclude='object')
print(numerical_features.columns)
print(numerical_features.shape)
# Split off the price column as y and keep the remaining numeric columns.
y = numerical_features['price']
numerical_features = numerical_features.drop(columns='price')
# Build the feature matrix twice: as a labelled DataFrame and as a bare
# ndarray (numeric columns first, one-hot columns after).
X_df = pd.concat([numerical_features, categorical_features_one_hot], axis=1)  # with column names
X = np.concatenate(
    [numerical_features.to_numpy(), categorical_features_one_hot.to_numpy()],
    axis=1,
)  # no column names
print(X_df.shape)
print(X_df.columns)
# Processed_data = pd.concat([X_df, y], axis = 1)
# Processed_data.to_csv('Airbnb_LA_Processed.dat')