IQR, Standard Deviation, Z-score and Modified Z-score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sb
from matplotlib.cbook import boxplot_stats
from matplotlib import cbook
from matplotlib.pyplot import figure
# Load the three CSV tables (in the same order as before) and the Excel workbook.
df_calendar, df_reviews, df_listings = (
    pd.read_csv(csv_path)
    for csv_path in ('calendar.csv', 'reviews.csv', 'listings.csv')
)
df_normal = pd.read_excel('Normal-data.xlsx')

# Fill missing review ratings with the mean of the observed ratings.
df_listings["review_scores_rating"] = df_listings["review_scores_rating"].pipe(
    lambda ratings: ratings.fillna(ratings.mean())
)
"""Function to remove $ sign"""
def remove_sign(x, sign):
    """Strip a marker such as ``'$'`` from *x* and return the value as a float.

    Args:
        x: The cell value. Strings are cleaned and converted; any other
            value (for example a float, NaN or None) is returned unchanged.
        sign: Substring to remove from the string before conversion.

    Returns:
        A ``float`` when *x* is a string, otherwise *x* as received.

    Raises:
        ValueError: If the cleaned string is not a valid float literal.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    if isinstance(x, str):
        # Commas are removed too so thousands-separated values parse.
        x = float(x.replace(sign, '').replace(',', ''))
    return x
# Convert the price column to floats by stripping the '$' marker.
df_listings['price'] = df_listings['price'].apply(remove_sign, sign='$')

# Boxplot of price across property type.
sb.boxplot(data=df_listings, x='property_type', y='price')
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Restrict the analysis to listings whose property type is 'Apartment'.
is_apartment = df_listings['property_type'].eq('Apartment')
df_listings_apt = df_listings.loc[is_apartment]
del is_apartment  # keep the namespace identical to the one-line version
df_listings_apt.price
1 65.0 2 65.0 6 100.0 9 229.0 13 150.0 ... 3580 69.0 3581 150.0 3582 198.0 3583 65.0 3584 65.0 Name: price, Length: 2612, dtype: float64
df = df_listings_apt

# Quartiles and interquartile range of the apartment prices.
Q1, Q3 = (df.price.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Keep every row that is neither below Q1 - 1.5*IQR nor above Q3 + 1.5*IQR.
df_final = df[~(df.price.lt(Q1 - 1.5 * IQR) | df.price.gt(Q3 + 1.5 * IQR))]

# Boxplot of price for apartments after the IQR filter.
figure(figsize=(6, 8), dpi=80)
sb.boxplot(data=df_final, x='property_type', y='price')
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Boxplot summary statistics (mean, quartiles, whiskers, fliers) of the
# prices that survived the 1.5*IQR filter.
stat = boxplot_stats(df_final.price)
stat
[{'mean': 166.03392504930966, 'iqr': 125.0, 'cilo': 146.10218983747075, 'cihi': 153.89781016252925, 'whishi': 402.0, 'whislo': 10.0, 'fliers': array([417.]), 'q1': 95.0, 'med': 150.0, 'q3': 220.0}]
len(stat[0]['fliers'])
1
# help(cbook.boxplot_stats)
# Repeat the IQR filter on the apartment listings with wider fences
# (1.7 * IQR instead of 1.5 * IQR).
df = df_listings_apt
# df = df_normal
Q1 = df.price.quantile(0.25)
Q3 = df.price.quantile(0.75)
IQR = Q3-Q1
# Keep rows whose price is neither below Q1 - 1.7*IQR nor above Q3 + 1.7*IQR.
df_final = df[~((df.price<(Q1-1.7*IQR)) | (df.price>(Q3+1.7*IQR)))]
"""Boxplot of price for apt after IQR implementation"""
sb.boxplot(y='price', x='property_type',data=df_final)
# sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Boxplot summary statistics of the prices that survived the 1.7*IQR filter.
stat = boxplot_stats(df_final.price)
stat
[{'mean': 167.74911799294395, 'iqr': 129.5, 'cilo': 148.97455227210247, 'cihi': 157.02544772789753, 'whishi': 417.0, 'whislo': 10.0, 'fliers': array([429., 450., 450., 450., 449., 425., 450., 450., 450., 425., 450., 450., 425., 425., 425., 429.]), 'q1': 95.5, 'med': 153.0, 'q3': 225.0}]
len(stat[0]['fliers'])
16
df = df_listings_apt
# df = df_normal

# Widths of the two- and three-standard-deviation bands around the mean price.
two_sd = 2 * df.price.std()
three_sd = 3 * df.price.std()

# Keep rows whose price lies within two standard deviations of the mean.
df_final = df[
    ~(
        df.price.lt(df.price.mean() - two_sd)
        | df.price.gt(df.price.mean() + two_sd)
    )
]
# df_final = df[~((df.price<(np.mean(df.price) - three_sd)) | (df.price>(np.mean(df.price) + three_sd)))]

# Upper three-standard-deviation cut-off, displayed for reference.
df.price.mean() + three_sd
644.9362244380891
# sb.boxplot(y='price', x='property_type',data=df_final)
# Single boxplot of the prices kept by the standard-deviation filter.
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df_final)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Boxplot summary statistics of the prices kept by the standard-deviation filter.
stat = boxplot_stats(df_final.price)
stat
[{'mean': 168.21174168297455, 'iqr': 129.0, 'cilo': 149.9932346400486, 'cihi': 158.0067653599514, 'whishi': 417.0, 'whislo': 10.0, 'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451., 450., 425., 450., 450., 425., 425., 425., 429., 459.]), 'q1': 96.0, 'med': 154.0, 'q3': 225.0}]
len(stat[0]['fliers'])
20
# Build a one-column frame holding only the apartment prices; the score
# columns for the Z-score method are added to this frame.
df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price
price | |
---|---|
1 | 65.0 |
2 | 65.0 |
6 | 100.0 |
9 | 229.0 |
13 | 150.0 |
... | ... |
3580 | 69.0 |
3581 | 150.0 |
3582 | 198.0 |
3583 | 65.0 |
3584 | 65.0 |
2612 rows × 1 columns
# Standardise each price: Z = (price - mean) / standard deviation.
m = np.mean(df_price.price)
# NOTE(review): np.std defaults to ddof=0 (population std), whereas the
# Series.std() used in the standard-deviation filter uses ddof=1 - confirm
# the difference is intended.
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m)/s
df_price
price | Z-score | |
---|---|---|
1 | 65.0 | -0.758079 |
2 | 65.0 | -0.758079 |
6 | 100.0 | -0.531238 |
9 | 229.0 | 0.304830 |
13 | 150.0 | -0.207181 |
... | ... | ... |
3580 | 69.0 | -0.732154 |
3581 | 150.0 | -0.207181 |
3582 | 198.0 | 0.103914 |
3583 | 65.0 | -0.758079 |
3584 | 65.0 | -0.758079 |
2612 rows × 2 columns
df = df_price
# Rows whose Z-score magnitude exceeds 3 are treated as outliers.
df_outlier = df.loc[df['Z-score'].abs() > 3]
df_outlier
price | Z-score | |
---|---|---|
391 | 725.0 | 3.519480 |
408 | 872.0 | 4.472209 |
793 | 1400.0 | 7.894255 |
889 | 650.0 | 3.033394 |
1085 | 800.0 | 4.005566 |
1234 | 650.0 | 3.033394 |
1262 | 1250.0 | 6.922083 |
1280 | 695.0 | 3.325045 |
1399 | 975.0 | 5.139767 |
1764 | 1000.0 | 5.301796 |
1854 | 769.0 | 3.804650 |
1896 | 1235.0 | 6.824866 |
1932 | 849.0 | 4.323142 |
1950 | 1345.0 | 7.537792 |
2204 | 1372.0 | 7.712783 |
2285 | 4000.0 | 24.745242 |
2394 | 750.0 | 3.681508 |
2448 | 1275.0 | 7.084112 |
2485 | 999.0 | 5.295314 |
3055 | 3000.0 | 18.264093 |
3096 | 1250.0 | 6.922083 |
df_outlier.shape
(21, 2)
# Remove the flagged outlier rows (matched by index label) and plot the
# remaining prices.
df = df.drop(index = df_outlier.index)
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Summarise the frame that was just filtered by Z-score (df). The previous
# code passed df_final, which still held the result of the earlier
# standard-deviation filter, so the reported stats never reflected this step.
stat = boxplot_stats(df.price)
stat
[{'mean': 168.21174168297455, 'iqr': 129.0, 'cilo': 149.9932346400486, 'cihi': 158.0067653599514, 'whishi': 417.0, 'whislo': 10.0, 'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451., 450., 425., 450., 450., 425., 425., 425., 429., 459.]), 'q1': 96.0, 'med': 154.0, 'q3': 225.0}]
len(stat[0]['fliers'])
20
# Histogram of the Z-scores (50 bins) for all apartment prices in df_price.
figure(figsize=(15, 8), dpi=80)
plt.hist(df_price['Z-score'], bins=50)
(array([579., 846., 637., 359., 113., 21., 23., 13., 4., 3., 2., 0., 3., 0., 0., 4., 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]), array([-1.11454172, -0.59734605, -0.08015037, 0.4370453 , 0.95424098, 1.47143665, 1.98863233, 2.50582801, 3.02302368, 3.54021936, 4.05741503, 4.57461071, 5.09180639, 5.60900206, 6.12619774, 6.64339341, 7.16058909, 7.67778476, 8.19498044, 8.71217612, 9.22937179, 9.74656747, 10.26376314, 10.78095882, 11.2981545 , 11.81535017, 12.33254585, 12.84974152, 13.3669372 , 13.88413287, 14.40132855, 14.91852423, 15.4357199 , 15.95291558, 16.47011125, 16.98730693, 17.50450261, 18.02169828, 18.53889396, 19.05608963, 19.57328531, 20.09048098, 20.60767666, 21.12487234, 21.64206801, 22.15926369, 22.67645936, 23.19365504, 23.71085072, 24.22804639, 24.74524207]), <BarContainer object of 50 artists>)
df_price['Z-score'].mean()
-8.373427865664466e-17
# Start again from the unfiltered apartment listings and rebuild the
# one-column price frame for the modified Z-score method.
df = df_listings_apt
df_price = pd.DataFrame(df.price)
df_price
price | |
---|---|
1 | 65.0 |
2 | 65.0 |
6 | 100.0 |
9 | 229.0 |
13 | 150.0 |
... | ... |
3580 | 69.0 |
3581 | 150.0 |
3582 | 198.0 |
3583 | 65.0 |
3584 | 65.0 |
2612 rows × 1 columns
# Recompute the classic Z-score on the fresh frame so it can be shown
# next to the modified Z-score.
m = np.mean(df_price.price)
s = np.std(df_price.price)
df_price['Z-score'] = (df_price.price - m)/s
df_price
price | Z-score | |
---|---|---|
1 | 65.0 | -0.758079 |
2 | 65.0 | -0.758079 |
6 | 100.0 | -0.531238 |
9 | 229.0 | 0.304830 |
13 | 150.0 | -0.207181 |
... | ... | ... |
3580 | 69.0 | -0.732154 |
3581 | 150.0 | -0.207181 |
3582 | 198.0 | 0.103914 |
3583 | 65.0 | -0.758079 |
3584 | 65.0 | -0.758079 |
2612 rows × 2 columns
# Absolute distance of each price from the median price.
m = np.median(df_price.price)
df_price['AD (Absolute Deviation)'] = (df_price.price - m).abs()
df_price
price | Z-score | AD (Absolute Deviation) | |
---|---|---|---|
1 | 65.0 | -0.758079 | 94.0 |
2 | 65.0 | -0.758079 | 94.0 |
6 | 100.0 | -0.531238 | 59.0 |
9 | 229.0 | 0.304830 | 70.0 |
13 | 150.0 | -0.207181 | 9.0 |
... | ... | ... | ... |
3580 | 69.0 | -0.732154 | 90.0 |
3581 | 150.0 | -0.207181 | 9.0 |
3582 | 198.0 | 0.103914 | 39.0 |
3583 | 65.0 | -0.758079 | 94.0 |
3584 | 65.0 | -0.758079 | 94.0 |
2612 rows × 3 columns
# MAD must be the *median* of the absolute deviations: the 0.6745 constant
# used for the modified Z-score is defined for the median absolute
# deviation, not the mean absolute deviation that was computed before.
MAD = df_price['AD (Absolute Deviation)'].median()
MAD
84.66117917304747
# Modified Z-score: 0.6745 * absolute deviation / MAD. Because the absolute
# deviation is used, the score is non-negative.
df_price['Modified Z-score'] = (0.6745*df_price['AD (Absolute Deviation)'])/MAD
df_price
price | Z-score | AD (Absolute Deviation) | Modified Z-score | |
---|---|---|---|---|
1 | 65.0 | -0.758079 | 94.0 | 0.748903 |
2 | 65.0 | -0.758079 | 94.0 | 0.748903 |
6 | 100.0 | -0.531238 | 59.0 | 0.470056 |
9 | 229.0 | 0.304830 | 70.0 | 0.557694 |
13 | 150.0 | -0.207181 | 9.0 | 0.071703 |
... | ... | ... | ... | ... |
3580 | 69.0 | -0.732154 | 90.0 | 0.717035 |
3581 | 150.0 | -0.207181 | 9.0 | 0.071703 |
3582 | 198.0 | 0.103914 | 39.0 | 0.310715 |
3583 | 65.0 | -0.758079 | 94.0 | 0.748903 |
3584 | 65.0 | -0.758079 | 94.0 | 0.748903 |
2612 rows × 4 columns
# Rows with a modified Z-score of 3.5 or more are treated as outliers.
df_outlier = df_price.loc[df_price['Modified Z-score'].ge(3.5)]
df_outlier
price | Z-score | AD (Absolute Deviation) | Modified Z-score | |
---|---|---|---|---|
391 | 725.0 | 3.519480 | 566.0 | 4.509351 |
408 | 872.0 | 4.472209 | 713.0 | 5.680508 |
727 | 600.0 | 2.709336 | 441.0 | 3.513470 |
793 | 1400.0 | 7.894255 | 1241.0 | 9.887111 |
889 | 650.0 | 3.033394 | 491.0 | 3.911822 |
894 | 625.0 | 2.871365 | 466.0 | 3.712646 |
982 | 600.0 | 2.709336 | 441.0 | 3.513470 |
1085 | 800.0 | 4.005566 | 641.0 | 5.106880 |
1234 | 650.0 | 3.033394 | 491.0 | 3.911822 |
1262 | 1250.0 | 6.922083 | 1091.0 | 8.692054 |
1280 | 695.0 | 3.325045 | 536.0 | 4.270340 |
1399 | 975.0 | 5.139767 | 816.0 | 6.501114 |
1402 | 600.0 | 2.709336 | 441.0 | 3.513470 |
1516 | 600.0 | 2.709336 | 441.0 | 3.513470 |
1764 | 1000.0 | 5.301796 | 841.0 | 6.700291 |
1854 | 769.0 | 3.804650 | 610.0 | 4.859902 |
1896 | 1235.0 | 6.824866 | 1076.0 | 8.572548 |
1932 | 849.0 | 4.323142 | 690.0 | 5.497266 |
1950 | 1345.0 | 7.537792 | 1186.0 | 9.448923 |
1982 | 600.0 | 2.709336 | 441.0 | 3.513470 |
2192 | 600.0 | 2.709336 | 441.0 | 3.513470 |
2204 | 1372.0 | 7.712783 | 1213.0 | 9.664034 |
2285 | 4000.0 | 24.745242 | 3841.0 | 30.601446 |
2394 | 750.0 | 3.681508 | 591.0 | 4.708528 |
2405 | 603.0 | 2.728780 | 444.0 | 3.537371 |
2432 | 603.0 | 2.728780 | 444.0 | 3.537371 |
2448 | 1275.0 | 7.084112 | 1116.0 | 8.891230 |
2485 | 999.0 | 5.295314 | 840.0 | 6.692324 |
3055 | 3000.0 | 18.264093 | 2841.0 | 22.634394 |
3062 | 603.0 | 2.728780 | 444.0 | 3.537371 |
3064 | 600.0 | 2.709336 | 441.0 | 3.513470 |
3096 | 1250.0 | 6.922083 | 1091.0 | 8.692054 |
3102 | 603.0 | 2.728780 | 444.0 | 3.537371 |
df_outlier.shape
(33, 4)
# Remove the flagged outlier rows (matched by index label) from df and
# plot the remaining prices.
df = df.drop(index = df_outlier.index)
figure(figsize=(6, 8), dpi=80)
sb.boxplot(y='price',data=df)
plt.xticks(rotation=90)
plt.ylabel('Price ($)')
Text(0, 0.5, 'Price ($)')
# Summarise the frame that was just filtered by modified Z-score (df). The
# previous code passed df_final, which still held the result of the earlier
# standard-deviation filter, so the reported stats never reflected this step.
stat = boxplot_stats(df.price)
stat
[{'mean': 168.21174168297455, 'iqr': 129.0, 'cilo': 149.9932346400486, 'cihi': 158.0067653599514, 'whishi': 417.0, 'whislo': 10.0, 'fliers': array([429., 450., 450., 450., 449., 425., 479., 450., 464., 450., 451., 450., 425., 450., 450., 425., 425., 425., 429., 459.]), 'q1': 96.0, 'med': 154.0, 'q3': 225.0}]
len(stat[0]['fliers'])
20