#!/usr/bin/env python
# coding: utf-8

# Exploratory analysis of a listings dataset ('data.csv'): inspect the raw
# table, drop unused columns, remove sparse neighbourhoods and price outliers,
# then plot price and availability by neighbourhood and room type.

# In[1]:


import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


# ## Data Exploration

# In[2]:


df = pd.read_csv('data.csv')


# In[3]:


df.head()


# In[4]:


df.describe()


# In[5]:


df.sample(10)


# In[6]:


df.shape


# In[7]:


df.info()


# In[8]:


df.columns


# ## Dropping not needed columns

# In[9]:


## id, name, host_id, host_name, neighbourhood_group, license columns are not needed
df = df.drop(
    columns=['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'license']
)


# In[10]:


df.columns


# In[11]:


df.nunique()


# In[12]:


df.isnull().sum()


# ## Removing outliers

# In[13]:


px.histogram(df, x='neighbourhood')


# In[14]:


df['neighbourhood'].value_counts()


# In[15]:


pr1 = df.groupby('neighbourhood', as_index=False)['price'].mean()
px.bar(pr1, x='neighbourhood', y='price')


# In[16]:


## keeping only neighbourhoods with more than 200 units; this also removes
## 'Taling Chan', which is a pricing outlier
df = df.groupby('neighbourhood').filter(lambda x: len(x) > 200)


# In[17]:


px.histogram(df, x='neighbourhood')


# In[18]:


pr1 = df.groupby('neighbourhood', as_index=False)['price'].mean()
px.bar(pr1, x='neighbourhood', y='price')


# In[19]:


fig = px.violin(df, x="neighbourhood", y="price")
fig.show()


# In[20]:


fig = px.box(df, x="neighbourhood", y="price")
fig.show()


# In[21]:


df[["neighbourhood", "price"]].groupby("neighbourhood").describe()


# In[22]:


## removing price outliers per group
def is_outlier(s, n_std=3):
    """Flag values lying more than ``n_std`` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values of a single group.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``s``; True where the value falls outside
        ``[mean - n_std * std, mean + n_std * std]``.
    """
    center = s.mean()
    spread = s.std() * n_std
    return ~s.between(center - spread, center + spread)


def drop_group_outliers(frame, group_col, value_col='price'):
    """Return ``frame`` without rows whose ``value_col`` is an outlier in its group.

    Outliers are judged by ``is_outlier`` separately within each ``group_col``
    group.
    """
    # transform() returns a mask indexed like ``frame``. apply() prepends the
    # group key to the index on pandas >= 2.0, which breaks boolean indexing.
    flagged = frame.groupby(group_col)[value_col].transform(is_outlier)
    # Guard against versions that coerce the UDF result back to a numeric dtype.
    return frame[~flagged.astype(bool)]


# In[23]:


df = drop_group_outliers(df, 'neighbourhood')


# In[24]:


fig = px.box(df, x="neighbourhood", y="price")
fig.show()


# In[25]:


fig = px.violin(df, x="neighbourhood", y="price")
fig.show()


# In[26]:


px.bar(
    data_frame=df.groupby('room_type')['price'].mean().reset_index(),
    x="room_type",
    y="price"
)


# In[27]:


fig = px.violin(df, x="room_type", y="price")
fig.show()


# In[28]:


## applying outliers removal to room type prices
## (group by room_type here; grouping by neighbourhood would just repeat In[23])
df = drop_group_outliers(df, 'room_type')


# In[29]:


px.bar(
    data_frame=df.groupby('room_type')['price'].mean().reset_index(),
    x="room_type",
    y="price"
)


# In[30]:


fig = px.violin(df, x="room_type", y="price")
fig.show()


# In[31]:


pr = df.groupby(['neighbourhood', 'room_type'], as_index=False)['price'].mean()
fig = px.bar(pr, x="neighbourhood", y='price', color="room_type", barmode="group")
fig.show()


# In[32]:


pr2 = df.groupby(['neighbourhood'], as_index=False)['availability_365'].mean()
fig = px.bar(pr2, x="neighbourhood", y='availability_365')
fig.show()


# In[ ]: