#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt


# ## Data Exploration

# In[2]:


# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("data.csv")


# In[3]:


# First five rows, to eyeball the columns and typical values.
df.head(n=5)


# In[4]:


# Summary statistics of the columns.
df.describe()


# In[5]:


# Ten randomly drawn rows, as a complement to head().
df.sample(n=10)


# In[6]:


# (number of rows, number of columns)
df.shape


# In[7]:


# Column dtypes and non-null counts.
df.info()


# In[8]:


# Full list of column labels, used to decide which ones to drop.
df.columns


# ## Dropping unneeded columns

# In[9]:


## id, name, host_id, host_name, neighbourhood_group and license are not
## needed for this analysis, so they are removed up front
df = df.drop(
    columns=[
        "id",
        "name",
        "host_id",
        "host_name",
        "neighbourhood_group",
        "license",
    ]
)


# In[10]:


# Columns that remain after the drop.
df.columns


# In[11]:


# Number of distinct values per column.
df.nunique()


# In[12]:


# Number of missing values per column.
df.isna().sum()


# ## Removing outliers

# In[13]:


# Row count per neighbourhood, as a histogram.
px.histogram(data_frame=df, x="neighbourhood")


# In[14]:


# The same counts as a table.
df["neighbourhood"].value_counts()


# In[15]:


# Mean price per neighbourhood, flattened back to a two-column frame for plotting.
pr1 = df.groupby("neighbourhood")["price"].mean().reset_index()
px.bar(data_frame=pr1, x="neighbourhood", y="price")


# In[16]:


## keep only neighbourhoods with more than 200 units; this also removes
## 'Taling Chan', which is a pricing outlier
# Map every row to the size of its neighbourhood and keep the rows whose
# neighbourhood is large enough (rows with a missing neighbourhood are dropped).
df = df[df["neighbourhood"].map(df["neighbourhood"].value_counts()) > 200]


# In[17]:


# Row count per neighbourhood after the size filter.
px.histogram(data_frame=df, x="neighbourhood")


# In[18]:


# Mean price per neighbourhood after the size filter.
pr1 = df.groupby("neighbourhood")["price"].mean().reset_index()
px.bar(data_frame=pr1, x="neighbourhood", y="price")


# In[19]:


# Price distribution per neighbourhood (violin plot).
fig = px.violin(data_frame=df, x="neighbourhood", y="price")
fig.show()


# In[20]:


# Price distribution per neighbourhood (box plot).
fig = px.box(data_frame=df, x="neighbourhood", y="price")
fig.show()


# In[21]:


# Descriptive statistics of price within each neighbourhood.
df.groupby("neighbourhood")[["price"]].describe()


# In[22]:


## removing price outliers per neighbourhood
def is_outlier(s, n_std=3):
    """Flag values lying more than ``n_std`` standard deviations from the mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values of one group (e.g. the prices of one neighbourhood).
    n_std : float, default 3
        Half-width of the accepted band, in sample standard deviations.

    Returns
    -------
    pandas.Series
        Boolean mask indexed like ``s``; ``True`` marks an outlier. Missing
        values are always flagged, since they cannot lie inside the band.
    """
    if s.count() < 2:
        # With fewer than two valid observations std() is NaN, which would
        # make the band undefined and flag every row. Only flag the missing
        # values in that case.
        return s.isna()
    center = s.mean()
    margin = s.std() * n_std
    # between() is inclusive on both ends; NaN is never "between", so it is flagged.
    return ~s.between(center - margin, center + margin)


# In[23]:


# Drop rows whose price is an outlier within its own neighbourhood.
# transform() returns a mask indexed exactly like df, so it can be used
# directly for boolean indexing. groupby.apply() prepends the group key to the
# index on pandas >= 2.0, which makes the mask unalignable with df.
df = df[~df.groupby('neighbourhood')['price'].transform(is_outlier)]


# In[24]:


# Price distribution per neighbourhood after outlier removal (box plot).
fig = px.box(data_frame=df, x="neighbourhood", y="price")
fig.show()


# In[25]:


# Price distribution per neighbourhood after outlier removal (violin plot).
fig = px.violin(data_frame=df, x="neighbourhood", y="price")
fig.show()


# In[26]:


# Mean price per room type.
px.bar(
    data_frame=df.groupby("room_type", as_index=False)["price"].mean(),
    x="room_type",
    y="price",
)


# In[27]:


# Price distribution per room type (violin plot).
fig = px.violin(data_frame=df, x="room_type", y="price")
fig.show()


# In[28]:


## applying outlier removal to room type prices
# Group by room_type here: grouping by neighbourhood again would only repeat
# the earlier per-neighbourhood pass instead of filtering per room type.
# transform() keeps the mask aligned with df's index for boolean indexing.
df = df[~df.groupby('room_type')['price'].transform(is_outlier)]


# In[29]:


# Mean price per room type after the second outlier pass.
px.bar(
    data_frame=df.groupby("room_type", as_index=False)["price"].mean(),
    x="room_type",
    y="price",
)


# In[30]:


# Price distribution per room type (violin plot).
fig = px.violin(data_frame=df, x="room_type", y="price")
fig.show()


# In[31]:


# Mean price for every (neighbourhood, room type) pair, drawn as grouped bars.
pr = df.groupby(["neighbourhood", "room_type"])["price"].mean().reset_index()
fig = px.bar(
    data_frame=pr,
    x="neighbourhood",
    y="price",
    color="room_type",
    barmode="group",
)
fig.show()


# In[32]:


# Mean of the availability_365 column per neighbourhood.
pr2 = df.groupby("neighbourhood")["availability_365"].mean().reset_index()
fig = px.bar(data_frame=pr2, x="neighbourhood", y="availability_365")
fig.show()


# In[ ]: