#!/usr/bin/env python
# coding: utf-8

# # Outlier Detection (Part1)
# > IQR, Standard Deviation, Z-score and Modified Z-score

# In[ ]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sb
from matplotlib.cbook import boxplot_stats
from matplotlib import cbook
from matplotlib.pyplot import figure


# In[129]:


df_calendar = pd.read_csv('calendar.csv')
df_reviews = pd.read_csv('reviews.csv')
df_listings = pd.read_csv('listings.csv')


# In[44]:


df_normal = pd.read_excel('Normal-data.xlsx')


# In[45]:


# Fill the missing review ratings with the mean of the column.
df_listings["review_scores_rating"] = df_listings["review_scores_rating"].fillna(
    df_listings["review_scores_rating"].mean()
)


# In[46]:


def remove_sign(x, sign):
    """Strip a currency sign and thousands separators from a string value.

    Args:
        x: The cell value. Strings are converted to ``float`` after removing
            every occurrence of ``sign`` and of ``','``; any other value is
            passed through untouched.
        sign: The substring to remove (e.g. ``'$'``).

    Returns:
        The parsed ``float`` for string input, otherwise ``x`` unchanged.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    if isinstance(x, str):
        x = float(x.replace(sign, '').replace(',', ''))
    return x


def _iqr_fences(values, multiplier=1.5):
    """Return ``(q1, q3, iqr, lower, upper)`` for the IQR outlier rule.

    ``lower``/``upper`` are ``q1 - multiplier * iqr`` and
    ``q3 + multiplier * iqr``; values outside them are treated as outliers.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - multiplier * iqr, q3 + multiplier * iqr


# In[47]:


df_listings.price = df_listings.price.apply(remove_sign, sign='$')


# In[48]:


# Boxplot of price across property type
sb.boxplot(y='price', x='property_type', data=df_listings)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[49]:


df_listings_apt = df_listings[df_listings['property_type'] == 'Apartment']


# In[50]:


df_listings_apt.price


# In[189]:


df = df_listings_apt
Q1, Q3, IQR, lower_fence, upper_fence = _iqr_fences(df.price, 1.5)
# Keep every row that is NOT outside the fences (the classic 1.5 * IQR rule).
df_final = df[~((df.price < lower_fence) | (df.price > upper_fence))]


# In[192]:


# Boxplot of price for apt after IQR implementation
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', x='property_type', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[193]:


stat = boxplot_stats(df_final.price)
stat


# In[54]:


len(stat[0]['fliers'])


# In[55]:


# help(cbook.boxplot_stats)


# # IQR multiplier = 1.7 (3 sigma)

# In[56]:


df = df_listings_apt
# df = df_normal
Q1, Q3, IQR, lower_fence, upper_fence = _iqr_fences(df.price, 1.7)
# Same rule with wider fences (multiplier 1.7 instead of 1.5).
df_final = df[~((df.price < lower_fence) | (df.price > upper_fence))]


# In[57]:


# Boxplot of price for apt after IQR implementation
sb.boxplot(y='price', x='property_type', data=df_final)
# sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[58]:


stat = boxplot_stats(df_final.price)
stat


# In[59]:


len(stat[0]['fliers'])


# # 2 sigma and 3 sigma

# In[199]:


df = df_listings_apt
# df = df_normal
# Compute the spread and the centre once; both thresholds derive from them.
price_sd = df.price.std()
price_mean = np.mean(df.price)
two_sd = price_sd * 2
three_sd = price_sd * 3
# Keep rows whose price lies within mean +/- 2 standard deviations.
df_final = df[~((df.price < (price_mean - two_sd)) | (df.price > (price_mean + two_sd)))]
# df_final = df[~((df.price < (price_mean - three_sd)) | (df.price > (price_mean + three_sd)))]


# In[200]:


price_mean + three_sd


# In[201]:


# sb.boxplot(y='price', x='property_type',data=df_final)
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[202]:


stat = boxplot_stats(df_final.price)
stat


# In[203]:


len(stat[0]['fliers'])


# # Z-score

# In[204]:


df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price


# In[208]:


m = np.mean(df_price.price)
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m) / s
df_price


# In[209]:


df = df_price
# Rows more than 3 standard deviations away from the mean are outliers.
df_outlier = df[abs(df['Z-score']) > 3]
df_outlier


# In[210]:


df_outlier.shape


# In[212]:


df = df.drop(index=df_outlier.index)


# In[213]:


figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[214]:


# Describe the Z-score-filtered frame that was just plotted. The previous
# version inspected df_final, i.e. the stale result of the 2-sigma section.
stat = boxplot_stats(df.price)
stat


# In[215]:


len(stat[0]['fliers'])


# # check Z-score for outliers

# In[117]:


figure(figsize=(15, 8), dpi=80)
plt.hist(df_price['Z-score'], bins=50)


# In[118]:


df_price['Z-score'].mean()


# # Modified Z-score

# In[227]:


df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price


# In[228]:


m = np.mean(df_price.price)
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m) / s
df_price


# In[229]:


# The modified Z-score is centred on the median instead of the mean.
m = np.median(df_price.price)
df_price['AD (Absolute Deviation)'] = abs(df_price.price - m)
df_price


# In[230]:


# MAD must be the *median* absolute deviation: the 0.6745 constant applied to
# it in the modified Z-score formula is defined for the median, not the mean.
# NOTE: MAD is 0 when more than half of the prices equal the median, which
# would make the modified Z-score undefined.
MAD = np.median(df_price['AD (Absolute Deviation)'])
MAD


# In[231]:
# Modified Z-score = 0.6745 * |x - median| / MAD, with MAD computed in an
# earlier cell.
# NOTE(review): 0.6745 is the constant conventionally paired with the
# *median* absolute deviation - confirm MAD is computed that way.
df_price['Modified Z-score'] = (0.6745 * df_price['AD (Absolute Deviation)']) / MAD
df_price


# In[232]:


# The score is built from absolute deviations, so it is never negative and a
# one-sided threshold is enough.
df_outlier = df_price[df_price['Modified Z-score'] >= 3.5]
df_outlier


# In[233]:


df_outlier.shape


# In[234]:


df = df.drop(index=df_outlier.index)


# In[235]:


figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price', data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')


# In[236]:


# Describe the frame that was just filtered and plotted. The previous version
# inspected df_final, which is left over from an earlier section.
stat = boxplot_stats(df.price)
stat


# In[237]:


len(stat[0]['fliers'])


# In[ ]: