import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
## load data.csv into a DataFrame and preview its first rows
df = pd.read_csv('data.csv')
df.head()
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 27934 | Condo in Samsen Nai · ★4.85 · 1 bedroom · 1 be... | 120437 | Nuttee | NaN | Ratchathewi | 13.75983 | 100.54134 | Entire home/apt | 1903.0 | 3 | 64 | 2020-01-06 | 0.45 | 2 | 329 | 0 | NaN |
1 | 27979 | Rental unit in Bangkok · 1 bedroom · 2 beds · ... | 120541 | Emy | NaN | Bang Na | 13.66818 | 100.61674 | Private room | 1316.0 | 1 | 0 | NaN | NaN | 2 | 0 | 0 | NaN |
2 | 28745 | Rental unit in Bangkok · 1 bedroom · 1 bed · 1... | 123784 | Familyroom | NaN | Bang Kapi | 13.75232 | 100.62402 | Private room | 800.0 | 60 | 0 | NaN | NaN | 1 | 0 | 0 | NaN |
3 | 820395 | Guesthouse in Bangkok · ★4.77 · 1 bedroom · 1 ... | 822284 | Rae And Charlie | NaN | Bang Sue | 13.83026 | 100.52082 | Entire home/apt | 700.0 | 7 | 13 | 2018-02-14 | 0.10 | 8 | 349 | 0 | NaN |
4 | 35780 | Rental unit in Bangkok · ★4.75 · 1 bedroom · 1... | 153730 | Sirilak | NaN | Din Daeng | 13.78823 | 100.57256 | Private room | 1286.0 | 14 | 4 | 2023-04-03 | 0.04 | 1 | 327 | 2 | NaN |
## summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
id | host_id | neighbourhood_group | latitude | longitude | price | minimum_nights | number_of_reviews | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2.210400e+04 | 2.210400e+04 | 0.0 | 22104.000000 | 22104.000000 | 21582.000000 | 22104.000000 | 22104.000000 | 14162.000000 | 22104.000000 | 22104.000000 | 22104.000000 | 0.0 |
mean | 4.311415e+17 | 2.070399e+08 | NaN | 13.744426 | 100.562033 | 3104.512093 | 13.874186 | 16.850389 | 0.973434 | 23.785016 | 213.574557 | 5.638617 | NaN |
std | 4.450022e+17 | 1.737502e+08 | NaN | 0.041207 | 0.049244 | 17681.953565 | 45.670650 | 43.090693 | 1.434982 | 41.836205 | 135.820058 | 14.936326 | NaN |
min | 2.793400e+04 | 2.144700e+04 | NaN | 13.527300 | 100.329550 | 22.000000 | 1.000000 | 0.000000 | 0.010000 | 1.000000 | 0.000000 | 0.000000 | NaN |
25% | 2.924233e+07 | 4.853662e+07 | NaN | 13.719570 | 100.532250 | 999.000000 | 1.000000 | 0.000000 | 0.180000 | 2.000000 | 86.000000 | 0.000000 | NaN |
50% | 5.381190e+07 | 1.652139e+08 | NaN | 13.738820 | 100.563800 | 1506.000000 | 1.000000 | 2.000000 | 0.540000 | 6.000000 | 255.000000 | 1.000000 | NaN |
75% | 8.897166e+17 | 3.336703e+08 | NaN | 13.758170 | 100.585010 | 2632.000000 | 10.000000 | 13.000000 | 1.250000 | 26.000000 | 350.000000 | 5.000000 | NaN |
max | 1.053584e+18 | 5.522846e+08 | NaN | 13.953540 | 100.923710 | 1000000.000000 | 1115.000000 | 1809.000000 | 62.140000 | 230.000000 | 365.000000 | 796.000000 | NaN |
## look at 10 randomly drawn rows; no random_state is passed, so the rows differ between runs
df.sample(10)
id | name | host_id | host_name | neighbourhood_group | neighbourhood | latitude | longitude | room_type | price | minimum_nights | number_of_reviews | last_review | reviews_per_month | calculated_host_listings_count | availability_365 | number_of_reviews_ltm | license | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
14703 | 822827148383064365 | Rental unit in Khet Watthana · ★4.86 · 1 bedro... | 345168212 | Digital Nomad Work | NaN | Vadhana | 13.737577 | 100.564350 | Entire home/apt | 2905.0 | 21 | 7 | 2023-11-30 | 0.74 | 2 | 338 | 7 | NaN |
2954 | 18011312 | Bed and breakfast in Bangkok · ★4.27 · 2 bedro... | 123827255 | Zee Thai | NaN | Phra Nakhon | 13.764350 | 100.502360 | Private room | 543.0 | 1 | 22 | 2023-12-11 | 0.27 | 20 | 354 | 6 | NaN |
9426 | 41870897 | Serviced apartment in Khet Din Daeng · 1 bedro... | 330608179 | Mr.Pranon | NaN | Din Daeng | 13.764620 | 100.563930 | Private room | 1200.0 | 1 | 0 | NaN | NaN | 1 | 88 | 0 | NaN |
13293 | 765042873480043520 | Rental unit in Khet Watthana · ★4.78 · 1 bedro... | 4138170 | Kevin | NaN | Vadhana | 13.722910 | 100.604680 | Entire home/apt | 1774.0 | 1 | 41 | 2023-12-09 | 3.10 | 12 | 360 | 37 | NaN |
17398 | 921472489614446222 | Rental unit in Khet Phra Khanong · ★4.62 · 1 b... | 52161947 | Noons | NaN | Phra Khanong | 13.689844 | 100.613614 | Entire home/apt | 1749.0 | 1 | 20 | 2023-12-15 | 3.31 | 131 | 0 | 20 | NaN |
19056 | 964116073630095296 | Rental unit in Watthana · 1 bedroom · 1 bed · ... | 274026012 | 潜 | NaN | Vadhana | 13.743108 | 100.559124 | Entire home/apt | 1643.0 | 21 | 0 | NaN | NaN | 15 | 148 | 0 | NaN |
7731 | 36512868 | Hostel in Don Mueang · ★4.50 · 1 bedroom · 3 beds | 176795569 | Suporn | NaN | Don Mueang | 13.942735 | 100.612659 | Hotel room | 774.0 | 1 | 4 | 2019-12-31 | 0.08 | 8 | 287 | 0 | NaN |
6091 | 31211112 | Home in Bangkok · ★5.0 · 3 bedrooms · 8 beds ·... | 24777734 | Paul | NaN | Saphan Sung | 13.743820 | 100.673230 | Entire home/apt | 6543.0 | 2 | 8 | 2023-12-11 | 0.14 | 4 | 364 | 5 | NaN |
13272 | 762630175838071003 | Rental unit in Khet Bang Kapi · ★5.0 · 1 bedro... | 43563241 | Raxtham | NaN | Bang Kapi | 13.767860 | 100.638780 | Private room | 390.0 | 2 | 3 | 2023-08-10 | 0.26 | 8 | 344 | 3 | NaN |
10913 | 53370694 | Rental unit in Khet Watthana · ★4.17 · 1 bedro... | 240343545 | Joseph | NaN | Vadhana | 13.740160 | 100.566990 | Entire home/apt | 3750.0 | 28 | 6 | 2023-08-06 | 0.24 | 38 | 146 | 1 | NaN |
## (number of rows, number of columns)
df.shape
(22104, 18)
## dtype and non-null count of every column
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22104 entries, 0 to 22103 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 22104 non-null int64 1 name 22104 non-null object 2 host_id 22104 non-null int64 3 host_name 22104 non-null object 4 neighbourhood_group 0 non-null float64 5 neighbourhood 22104 non-null object 6 latitude 22104 non-null float64 7 longitude 22104 non-null float64 8 room_type 22104 non-null object 9 price 21582 non-null float64 10 minimum_nights 22104 non-null int64 11 number_of_reviews 22104 non-null int64 12 last_review 14162 non-null object 13 reviews_per_month 14162 non-null float64 14 calculated_host_listings_count 22104 non-null int64 15 availability_365 22104 non-null int64 16 number_of_reviews_ltm 22104 non-null int64 17 license 0 non-null float64 dtypes: float64(6), int64(7), object(5) memory usage: 3.0+ MB
## list all column labels
df.columns
Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm', 'license'], dtype='object')
## the id, name, host_id, host_name, neighbourhood_group and license columns
## are not needed for this analysis, so they are dropped in place
unused_columns = ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'license']
df.drop(columns=unused_columns, inplace=True)
df.columns
Index(['neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'number_of_reviews_ltm'], dtype='object')
## number of distinct values in each column
df.nunique()
neighbourhood 50 latitude 15580 longitude 16103 room_type 4 price 3938 minimum_nights 97 number_of_reviews 333 last_review 1667 reviews_per_month 633 calculated_host_listings_count 73 availability_365 366 number_of_reviews_ltm 122 dtype: int64
## number of missing values in each column
df.isnull().sum()
neighbourhood 0 latitude 0 longitude 0 room_type 0 price 522 minimum_nights 0 number_of_reviews 0 last_review 7942 reviews_per_month 7942 calculated_host_listings_count 0 availability_365 0 number_of_reviews_ltm 0 dtype: int64
## how the rows are spread over neighbourhoods: as a chart, then as exact counts
px.histogram(df, x='neighbourhood')
df['neighbourhood'].value_counts()
Vadhana 3341 Khlong Toei 3295 Huai Khwang 1899 Ratchathewi 1550 Sathon 1007 Bang Rak 972 Phra Nakhon 825 Phra Khanong 824 Chatu Chak 710 Din Daeng 589 Parthum Wan 566 Bang Na 538 Suanluang 535 Khlong San 498 Phaya Thai 438 Bang Kapi 414 Bang Phlat 383 Bang Sue 305 Yan na wa 263 Lat Krabang 247 Don Mueang 219 Thon buri 217 Pra Wet 184 Bangkok Noi 184 Samphanthawong 177 Phasi Charoen 163 Bang Kho laen 159 Pom Prap Sattru Phai 158 Wang Thong Lang 154 Lak Si 147 Bang Khen 121 Dusit 116 Bangkok Yai 114 Chom Thong 101 Bang Khae 87 Khan Na Yao 66 Lat Phrao 64 Bueng Kum 64 Saphan Sung 62 Min Buri 61 Rat Burana 61 Sai Mai 49 Taling Chan 45 Khlong Sam Wa 33 Thung khru 26 Bang Khun thain 23 Nong Chok 19 Thawi Watthana 16 Nong Khaem 8 Bang Bon 7 Name: neighbourhood, dtype: int64
## average price of every neighbourhood, computed on the unfiltered data
pr1 = df.groupby('neighbourhood')['price'].mean().reset_index()
px.bar(data_frame=pr1, x='neighbourhood', y='price')
## keep only neighbourhoods that have more than 200 rows; this also removes
## 'Taling Chan', which is a pricing outlier
rows_per_neighbourhood = df.groupby('neighbourhood')['neighbourhood'].transform('size')
df = df[rows_per_neighbourhood > 200]
## row counts and average price per neighbourhood on the filtered data
px.histogram(data_frame=df, x='neighbourhood')
pr1 = df.groupby('neighbourhood')['price'].mean().reset_index()
px.bar(data_frame=pr1, x='neighbourhood', y='price')
## price distribution per neighbourhood: violin plot, then box plot
fig = px.violin(data_frame=df, x="neighbourhood", y="price")
fig.show()
fig = px.box(data_frame=df, x="neighbourhood", y="price")
fig.show()
## summary statistics of price for each neighbourhood
df.groupby("neighbourhood")[["price"]].describe()
price | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
neighbourhood | ||||||||
Bang Kapi | 406.0 | 2394.110837 | 6225.740661 | 320.0 | 750.00 | 1179.0 | 1749.50 | 102857.0 |
Bang Na | 532.0 | 2243.793233 | 5978.382635 | 350.0 | 790.00 | 1028.5 | 1543.00 | 90000.0 |
Bang Phlat | 368.0 | 1388.774457 | 2452.067802 | 347.0 | 583.00 | 891.0 | 1223.75 | 29393.0 |
Bang Rak | 934.0 | 3226.020343 | 10160.155746 | 310.0 | 1212.25 | 1969.5 | 3569.25 | 300000.0 |
Bang Sue | 299.0 | 1859.698997 | 6503.838597 | 348.0 | 700.00 | 905.0 | 1299.50 | 100000.0 |
Chatu Chak | 693.0 | 2324.432900 | 6482.725291 | 315.0 | 800.00 | 1200.0 | 1899.00 | 100000.0 |
Din Daeng | 574.0 | 2925.024390 | 14450.833860 | 277.0 | 850.00 | 1232.5 | 2000.00 | 300000.0 |
Don Mueang | 217.0 | 1239.281106 | 1329.261571 | 346.0 | 580.00 | 800.0 | 1310.00 | 10181.0 |
Huai Khwang | 1848.0 | 4840.334957 | 46905.197577 | 350.0 | 1060.25 | 1557.0 | 2600.00 | 1000000.0 |
Khlong San | 486.0 | 2838.382716 | 8114.317901 | 280.0 | 990.00 | 1421.5 | 2346.25 | 160000.0 |
Khlong Toei | 3224.0 | 2932.180211 | 5841.815554 | 290.0 | 1217.00 | 1700.0 | 2681.75 | 160000.0 |
Lat Krabang | 240.0 | 1491.170833 | 1217.511441 | 331.0 | 812.00 | 1200.0 | 1582.50 | 8956.0 |
Parthum Wan | 557.0 | 4838.791741 | 15924.924222 | 331.0 | 1504.00 | 2500.0 | 4260.00 | 300000.0 |
Phaya Thai | 427.0 | 2067.330211 | 2082.917201 | 320.0 | 950.50 | 1454.0 | 2464.50 | 20000.0 |
Phra Khanong | 806.0 | 1703.207196 | 3004.609572 | 305.0 | 830.00 | 1150.0 | 1880.00 | 71609.0 |
Phra Nakhon | 814.0 | 1919.684275 | 2059.442213 | 250.0 | 800.00 | 1355.5 | 2398.75 | 33350.0 |
Ratchathewi | 1513.0 | 3560.150694 | 16721.269507 | 329.0 | 1300.00 | 1860.0 | 2757.00 | 300000.0 |
Sathon | 984.0 | 2495.058943 | 4460.016034 | 318.0 | 1010.00 | 1479.5 | 2389.50 | 75000.0 |
Suanluang | 519.0 | 2149.795761 | 3441.550500 | 332.0 | 850.00 | 1094.0 | 1811.00 | 37000.0 |
Thon buri | 210.0 | 1815.409524 | 4225.085954 | 350.0 | 850.00 | 1183.5 | 1800.00 | 60000.0 |
Vadhana | 3265.0 | 4248.409801 | 15357.271301 | 300.0 | 1386.00 | 2128.0 | 4000.00 | 692732.0 |
Yan na wa | 258.0 | 3318.271318 | 7117.129000 | 400.0 | 1090.00 | 1489.5 | 3410.50 | 75000.0 |
## removing price outliers per neighbourhood
def is_outlier(s, n_std=3):
    """Flag the values of a numeric Series that lie far from its mean.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to test (e.g. the prices of one group).
    n_std : float, default 3
        Half-width of the accepted band, in sample standard deviations.

    Returns
    -------
    pandas.Series
        Boolean Series with the same index as ``s``; ``True`` marks values
        outside ``mean ± n_std * std`` (both bounds are inclusive).
        Missing values are flagged ``True`` as well, because they do not
        fall inside any band.
    """
    center = s.mean()
    spread = s.std()
    if pd.isna(spread):
        # Fewer than two non-null values: there is no spread to measure
        # against, so nothing but the missing entries can be flagged.
        return s.isna()
    margin = spread * n_std
    return ~s.between(center - margin, center + margin)
## flag price outliers within each neighbourhood and keep the remaining rows.
## transform() returns a mask that carries df's own index; with apply() the
## group key can be prepended to the index (the default on pandas >= 2.0),
## and such a mask cannot be used to index df.
neighbourhood_outliers = df.groupby('neighbourhood')['price'].transform(is_outlier).astype(bool)
df = df[~neighbourhood_outliers]
## price distribution per neighbourhood: box plot, then violin plot
fig = px.box(data_frame=df, x="neighbourhood", y="price")
fig.show()
fig = px.violin(data_frame=df, x="neighbourhood", y="price")
fig.show()
## average price and price distribution per room type
px.bar(df.groupby('room_type', as_index=False)['price'].mean(), x="room_type", y="price")
fig = px.violin(data_frame=df, x="room_type", y="price")
fig.show()
## applying outlier removal to room type prices: prices are compared against
## the mean and standard deviation of their own room type. transform() keeps
## the mask aligned with df's index so it can be used directly as an indexer.
room_type_outliers = df.groupby('room_type')['price'].transform(is_outlier).astype(bool)
df = df[~room_type_outliers]
## average price and price distribution per room type
px.bar(df.groupby('room_type', as_index=False)['price'].mean(), x="room_type", y="price")
fig = px.violin(data_frame=df, x="room_type", y="price")
fig.show()
## average price for every neighbourhood / room type combination
pr = df.groupby(['neighbourhood', 'room_type'])['price'].mean().reset_index()
fig = px.bar(data_frame=pr, x="neighbourhood", y='price', color="room_type", barmode="group")
fig.show()
## average availability_365 per neighbourhood
pr2 = df.groupby('neighbourhood')['availability_365'].mean().reset_index()
fig = px.bar(data_frame=pr2, x="neighbourhood", y='availability_365')
fig.show()