import pyodbc
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from folium import plugins
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap
from sqlalchemy import create_engine
from collections import Counter
from wordcloud import WordCloud
import re
pd.set_option("display.max_rows",100)
pd.set_option("display.max_columns",100)
plt.style.use('ggplot')
db_engine = create_engine('mssql+pyodbc://sa:8biq9V6NJxpUP4kSQVqi@system/AirBnb?driver=ODBC+Driver+17+for+SQL+Server')
'''
4 quarter periods and last_scraped values for each one
Sep 9 2022:
2022-09-09
2022-09-10
Dec 6 2022:
2022-12-06
2022-12-07
2022-12-25
Mar 7 2023:
2023-03-07
2023-03-08
2023-03-09
Jun 6 2023:
2023-06-06
2023-06-07
2023-06-08
'''
listing = pd.read_sql_table('listing_la', db_engine)
review = pd.read_sql_table('review_la', db_engine)
listing.head()
id | listing_url | scrape_id | last_scraped | source | name | description | neighborhood_overview | picture_url | host_id | host_url | host_name | host_since | host_location | host_about | host_response_time | host_response_rate | host_acceptance_rate | host_is_superhost | host_thumbnail_url | host_picture_url | host_neighbourhood | host_listings_count | host_total_listings_count | host_verifications | host_has_profile_pic | host_identity_verified | neighbourhood | neighbourhood_cleansed | neighbourhood_group_cleansed | latitude | longitude | property_type | room_type | accommodates | bathrooms | bathrooms_text | bedrooms | beds | amenities | price | minimum_nights | maximum_nights | minimum_minimum_nights | maximum_minimum_nights | minimum_maximum_nights | maximum_maximum_nights | minimum_nights_avg_ntm | maximum_nights_avg_ntm | calendar_updated | has_availability | availability_30 | availability_60 | availability_90 | availability_365 | calendar_last_scraped | number_of_reviews | number_of_reviews_ltm | number_of_reviews_l30d | first_review | last_review | review_scores_rating | review_scores_accuracy | review_scores_cleanliness | review_scores_checkin | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 675719763048199652 | https://www.airbnb.com/rooms/675719763048199652 | 20221206172243 | 2022-12-07 | city scrape | Walking distance to UCLA, private room shared ... | Forget your worries in this spacious and seren... | None | https://a0.muscache.com/pictures/miso/Hosting-... | 4780152 | https://www.airbnb.com/users/show/4780152 | Moon | 2013-01-20 | Los Angeles, CA | I am from LA\r\nI love studying. I care about ... | within an hour | 100% | 47% | t | https://a0.muscache.com/im/users/4780152/profi... | https://a0.muscache.com/im/users/4780152/profi... | Brentwood | 68.0 | 86.0 | ['email', 'phone'] | t | t | None | West Los Angeles | City of Los Angeles | 34.04863 | -118.43262 | Private room in condo | Private room | 1 | None | 1 private bath | 1.0 | 1.0 | ["Wifi", "Hot tub", "Long term stays allowed",... | 67.0 | 30 | 365 | 30.0 | 30.0 | 365.0 | 365.0 | 30.0 | 365.0 | None | t | 30 | 60 | 90 | 365 | 2022-12-07 | 1 | 1 | 0 | 2022-09-16 | 2022-09-16 | 4.0 | 3.0 | 2.0 | 5.0 | 5.0 | 5.0 | 4.0 | None | f | 26 | 4 | 22 | 0 | 0.36 |
1 | 678493211017259570 | https://www.airbnb.com/rooms/678493211017259570 | 20221206172243 | 2022-12-07 | city scrape | Film Executive Luxury Suite, pool, tiki bar, ETC | a really nice vibe atmosphere like being at at... | None | https://a0.muscache.com/pictures/miso/Hosting-... | 119676461 | https://www.airbnb.com/users/show/119676461 | David | 2017-03-07 | Los Angeles, CA | Cindy and David are a retired theatre producin... | within an hour | 100% | 97% | t | https://a0.muscache.com/im/pictures/user/9abd0... | https://a0.muscache.com/im/pictures/user/9abd0... | North Hollywood | 5.0 | 7.0 | ['email', 'phone'] | t | f | None | North Hollywood | City of Los Angeles | 34.17599 | -118.36910 | Entire guest suite | Entire home/apt | 2 | None | 1 bath | NaN | 1.0 | ["Wifi", "Long term stays allowed", "TV", "Fir... | 120.0 | 30 | 90 | 30.0 | 30.0 | 90.0 | 90.0 | 30.0 | 90.0 | None | t | 16 | 46 | 76 | 101 | 2022-12-07 | 1 | 1 | 0 | 2022-08-31 | 2022-08-31 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | None | f | 5 | 5 | 0 | 0 | 0.30 |
2 | 677263836495587840 | https://www.airbnb.com/rooms/677263836495587840 | 20221206172243 | 2022-12-07 | city scrape | Genting house with swimming pool in Runyon Canyon | This listing is in a prime location in the Hol... | The best location in Los Angeles.Near Runyon C... | https://a0.muscache.com/pictures/22fe5f73-b340... | 418509413 | https://www.airbnb.com/users/show/418509413 | Nina | 2021-08-15 | None | I'm from Los Angeles with more than 10 years o... | within an hour | 100% | 100% | f | https://a0.muscache.com/im/pictures/user/4cc11... | https://a0.muscache.com/im/pictures/user/4cc11... | Central LA | 2.0 | 3.0 | ['phone'] | t | t | Los Angeles, California, United States | Hollywood Hills West | City of Los Angeles | 34.11680 | -118.35407 | Entire villa | Entire home/apt | 9 | None | 3.5 baths | 4.0 | 5.0 | ["Shower gel", "Dishes and silverware", "TV", ... | 1241.0 | 1 | 365 | 1.0 | 1.0 | 1125.0 | 1125.0 | 1.0 | 1125.0 | None | t | 21 | 38 | 63 | 320 | 2022-12-07 | 3 | 3 | 0 | 2022-09-20 | 2022-10-30 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | HSR22-001906 | f | 2 | 2 | 0 | 0 | 1.14 |
3 | 15299255 | https://www.airbnb.com/rooms/15299255 | 20230307175359 | 2023-03-08 | previous scrape | Nice Hollywood home | Lovely home with gym and pool | None | https://a0.muscache.com/pictures/15f16ac8-c3f6... | 97000730 | https://www.airbnb.com/users/show/97000730 | Richard | 2016-09-27 | None | None | N/A | N/A | N/A | f | https://a0.muscache.com/im/pictures/user/6fc54... | https://a0.muscache.com/im/pictures/user/6fc54... | None | 1.0 | 3.0 | ['email', 'phone'] | t | f | None | North Hollywood | City of Los Angeles | 34.20079 | -118.37875 | Private room in rental unit | Private room | 1 | None | 1 bath | 1.0 | 1.0 | [] | 100.0 | 30 | 1125 | 30.0 | 30.0 | 1125.0 | 1125.0 | 30.0 | 1125.0 | None | f | 0 | 0 | 0 | 0 | 2023-03-08 | 0 | 0 | 0 | NaT | NaT | NaN | NaN | NaN | NaN | NaN | NaN | NaN | None | f | 1 | 0 | 1 | 0 | None |
4 | 678522593527800929 | https://www.airbnb.com/rooms/678522593527800929 | 20221206172243 | 2022-12-07 | city scrape | Stylish newly remodeled 2bd 2ba home in West LA | Stylish single home with mid-century/eclectic ... | None | https://a0.muscache.com/pictures/miso/Hosting-... | 163620715 | https://www.airbnb.com/users/show/163620715 | Sakona | 2017-12-21 | Los Angeles, CA | We are Waka & Sakona, we are designers in the ... | within a few hours | 100% | 100% | f | https://a0.muscache.com/im/pictures/user/b29db... | https://a0.muscache.com/im/pictures/user/b29db... | West Los Angeles | 1.0 | 1.0 | ['email', 'phone'] | t | f | None | Rancho Park | City of Los Angeles | 34.03645 | -118.43144 | Entire home | Entire home/apt | 4 | None | 2 baths | 2.0 | 2.0 | ["Private entrance", "Shower gel", "Dishes and... | 220.0 | 30 | 120 | 30.0 | 30.0 | 120.0 | 120.0 | 30.0 | 120.0 | None | t | 0 | 3 | 33 | 123 | 2022-12-07 | 3 | 3 | 1 | 2022-08-17 | 2022-11-07 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | 5.0 | None | f | 1 | 1 | 0 | 0 | 0.80 |
Всего, более 173 тысяч объявлений, описанных 75 атрибутами.
# displays the dimenstions of the listings dataset
listing.shape
(173168, 75)
# displays types for listings data
listing.dtypes
id int64 listing_url object scrape_id int64 last_scraped datetime64[ns] source object name object description object neighborhood_overview object picture_url object host_id int64 host_url object host_name object host_since datetime64[ns] host_location object host_about object host_response_time object host_response_rate object host_acceptance_rate object host_is_superhost object host_thumbnail_url object host_picture_url object host_neighbourhood object host_listings_count float64 host_total_listings_count float64 host_verifications object host_has_profile_pic object host_identity_verified object neighbourhood object neighbourhood_cleansed object neighbourhood_group_cleansed object latitude float64 longitude float64 property_type object room_type object accommodates int64 bathrooms object bathrooms_text object bedrooms float64 beds float64 amenities object price float64 minimum_nights int64 maximum_nights int64 minimum_minimum_nights float64 maximum_minimum_nights float64 minimum_maximum_nights float64 maximum_maximum_nights float64 minimum_nights_avg_ntm float64 maximum_nights_avg_ntm float64 calendar_updated object has_availability object availability_30 int64 availability_60 int64 availability_90 int64 availability_365 int64 calendar_last_scraped datetime64[ns] number_of_reviews int64 number_of_reviews_ltm int64 number_of_reviews_l30d int64 first_review datetime64[ns] last_review datetime64[ns] review_scores_rating float64 review_scores_accuracy float64 review_scores_cleanliness float64 review_scores_checkin float64 review_scores_communication float64 review_scores_location float64 review_scores_value float64 license object instant_bookable object calculated_host_listings_count int64 calculated_host_listings_count_entire_homes int64 calculated_host_listings_count_private_rooms int64 calculated_host_listings_count_shared_rooms int64 reviews_per_month object dtype: object
# checks missing values in all columns
listing.isna().sum().sort_values(ascending=False)
bathrooms 173168 calendar_updated 173168 license 129381 host_about 72505 neighbourhood 71729 neighborhood_overview 71729 review_scores_value 46580 review_scores_location 46572 review_scores_checkin 46563 review_scores_accuracy 46541 review_scores_communication 46539 review_scores_cleanliness 46538 review_scores_rating 45524 last_review 45524 first_review 45524 reviews_per_month 45524 host_neighbourhood 36323 host_location 34142 bedrooms 25734 host_is_superhost 12229 neighbourhood_group_cleansed 7889 description 3107 beds 2693 bathrooms_text 181 host_thumbnail_url 19 host_response_rate 19 host_name 19 host_since 19 host_response_time 19 maximum_nights_avg_ntm 19 host_picture_url 19 maximum_maximum_nights 19 minimum_maximum_nights 19 maximum_minimum_nights 19 minimum_minimum_nights 19 minimum_nights_avg_ntm 19 host_acceptance_rate 19 host_total_listings_count 19 host_listings_count 19 host_has_profile_pic 19 host_identity_verified 19 name 3 host_verifications 0 host_url 0 host_id 0 picture_url 0 accommodates 0 number_of_reviews_ltm 0 source 0 last_scraped 0 scrape_id 0 instant_bookable 0 calculated_host_listings_count 0 calculated_host_listings_count_entire_homes 0 calculated_host_listings_count_private_rooms 0 calculated_host_listings_count_shared_rooms 0 number_of_reviews_l30d 0 calendar_last_scraped 0 number_of_reviews 0 longitude 0 room_type 0 amenities 0 price 0 minimum_nights 0 maximum_nights 0 property_type 0 latitude 0 listing_url 0 neighbourhood_cleansed 0 has_availability 0 availability_30 0 availability_60 0 availability_90 0 availability_365 0 id 0 dtype: int64
# removes select columns
listing_mod = listing.drop(columns=["scrape_id", "source", "neighborhood_overview", "picture_url", "host_location", "host_about", \
"host_thumbnail_url", "host_picture_url", "host_neighbourhood", "host_verifications", "host_has_profile_pic", \
"host_identity_verified", "bathrooms", "minimum_minimum_nights", "maximum_minimum_nights", "minimum_maximum_nights", \
"maximum_maximum_nights", "minimum_nights_avg_ntm", "maximum_nights_avg_ntm", "calendar_updated", "calendar_last_scraped", \
"number_of_reviews_ltm", "number_of_reviews_l30d", "review_scores_accuracy", "review_scores_cleanliness", \
"review_scores_checkin", "review_scores_communication", "review_scores_value", "instant_bookable", "calculated_host_listings_count", \
"calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms", \
"calculated_host_listings_count_shared_rooms", "reviews_per_month"])
listing_mod.dtypes
id int64 listing_url object last_scraped datetime64[ns] name object description object host_id int64 host_url object host_name object host_since datetime64[ns] host_response_time object host_response_rate object host_acceptance_rate object host_is_superhost object host_listings_count float64 host_total_listings_count float64 neighbourhood object neighbourhood_cleansed object neighbourhood_group_cleansed object latitude float64 longitude float64 property_type object room_type object accommodates int64 bathrooms_text object bedrooms float64 beds float64 amenities object price float64 minimum_nights int64 maximum_nights int64 has_availability object availability_30 int64 availability_60 int64 availability_90 int64 availability_365 int64 number_of_reviews int64 first_review datetime64[ns] last_review datetime64[ns] review_scores_rating float64 review_scores_location float64 license object dtype: object
Четыре столбца, указанные ниже, имели тип данных, состоящий из чисел с плавающей запятой, и были преобразованы в целые числа.
# displays and fills NA values for some columns before type conversion
temp = listing_mod[['id','host_listings_count','host_total_listings_count','bedrooms','beds']]
temp[temp.isna().any(axis=1)]
listing_mod.fillna({'host_listings_count':0,'host_total_listings_count':0,'bedrooms':0,'beds':0}, inplace = True)
# converts columns type from float to integer
listing_mod = listing_mod.astype({'host_listings_count':'int64','host_total_listings_count':'int64','bedrooms':'int64','beds':'int64'})
listing_mod.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 173168 entries, 0 to 173167 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 173168 non-null int64 1 listing_url 173168 non-null object 2 last_scraped 173168 non-null datetime64[ns] 3 name 173165 non-null object 4 description 170061 non-null object 5 host_id 173168 non-null int64 6 host_url 173168 non-null object 7 host_name 173149 non-null object 8 host_since 173149 non-null datetime64[ns] 9 host_response_time 173149 non-null object 10 host_response_rate 173149 non-null object 11 host_acceptance_rate 173149 non-null object 12 host_is_superhost 160939 non-null object 13 host_listings_count 173168 non-null int64 14 host_total_listings_count 173168 non-null int64 15 neighbourhood 101439 non-null object 16 neighbourhood_cleansed 173168 non-null object 17 neighbourhood_group_cleansed 165279 non-null object 18 latitude 173168 non-null float64 19 longitude 173168 non-null float64 20 property_type 173168 non-null object 21 room_type 173168 non-null object 22 accommodates 173168 non-null int64 23 bathrooms_text 172987 non-null object 24 bedrooms 173168 non-null int64 25 beds 173168 non-null int64 26 amenities 173168 non-null object 27 price 173168 non-null float64 28 minimum_nights 173168 non-null int64 29 maximum_nights 173168 non-null int64 30 has_availability 173168 non-null object 31 availability_30 173168 non-null int64 32 availability_60 173168 non-null int64 33 availability_90 173168 non-null int64 34 availability_365 173168 non-null int64 35 number_of_reviews 173168 non-null int64 36 first_review 127644 non-null datetime64[ns] 37 last_review 127644 non-null datetime64[ns] 38 review_scores_rating 127644 non-null float64 39 review_scores_location 126596 non-null float64 40 license 43787 non-null object dtypes: datetime64[ns](4), float64(5), int64(14), object(18) memory usage: 54.2+ MB
# explores the unique values in these columns
print('Уникальные значения в "Neighbourhood": ')
print(listing_mod.neighbourhood_group_cleansed.unique())
print('Уникальные значения в "Room type": ')
print(listing_mod.room_type.unique())
print('Уникальные значения в "Host response time": ')
print(listing_mod.host_response_time.unique())
# converts types to a category
listing_mod = listing_mod.astype({'neighbourhood_group_cleansed':'category','room_type':'category', \
'host_response_time':'category'})
listing_mod.dtypes
Уникальные значения в "Neighbourhood": ['City of Los Angeles' 'Unincorporated Areas' 'Other Cities' None] Уникальные значения в "Room type": ['Private room' 'Entire home/apt' 'Shared room' 'Hotel room'] Уникальные значения в "Host response time": ['within an hour' 'N/A' 'within a few hours' 'a few days or more' 'within a day' None]
id int64 listing_url object last_scraped datetime64[ns] name object description object host_id int64 host_url object host_name object host_since datetime64[ns] host_response_time category host_response_rate object host_acceptance_rate object host_is_superhost object host_listings_count int64 host_total_listings_count int64 neighbourhood object neighbourhood_cleansed object neighbourhood_group_cleansed category latitude float64 longitude float64 property_type object room_type category accommodates int64 bathrooms_text object bedrooms int64 beds int64 amenities object price float64 minimum_nights int64 maximum_nights int64 has_availability object availability_30 int64 availability_60 int64 availability_90 int64 availability_365 int64 number_of_reviews int64 first_review datetime64[ns] last_review datetime64[ns] review_scores_rating float64 review_scores_location float64 license object dtype: object
# checks duplicate records for all columns
print('Количество дубликатов: ' + str(listing_mod.duplicated().sum()))
# checks duplicates on a specific subset of columns
listing_mod.duplicated(subset = ['id', 'last_scraped', 'host_id']).sum()
Количество дубликатов: 0
0
listing_corr = listing_mod[['bedrooms','beds','price','number_of_reviews', 'minimum_nights']].dropna().corr()
sns.heatmap(listing_corr, annot=True)
plt.show()
# identifies and removes outliers in the price column
plt.figure(figsize=(12,6))
plt.title('Распределение цены')
sns.boxplot(data=listing_mod, x='price')
plt.show()
listing_mod.price.describe()
count 173168.000000 mean 283.135664 std 961.390765 min 0.000000 25% 89.000000 50% 148.000000 75% 250.000000 max 99999.000000 Name: price, dtype: float64
Q1 = listing_mod.price.quantile(0.25)
Q3 = listing_mod.price.quantile(0.75)
IQR = Q3 - Q1
lower_limit = Q1 - IQR * 1.5
upper_limit = Q3 + IQR * 12
print(f'Нижний предел: {lower_limit}\nВерхний предел: {upper_limit}')
# since the lower limit is negative, let's check if there is any prices below zero
print(f'Записи с ценами ниже нуля: {len(listing_mod[listing_mod.price < 0].index)}')
# we pick big factor of 12 to give room for listings with reasonably high prices considering the location and the property size
listing_mod = listing_mod[~(listing_mod.price > upper_limit)]
print(f'Обновлённое количество строк: {len(listing_mod.index)}')
Нижний предел: -152.5 Верхний предел: 2182.0 Записи с ценами ниже нуля: 0 Обновлённое количество строк: 171061
У нас есть несколько дат, когда была собрана информация, и одна дата, когда были сохранены большинство записей.
Например, период, датированный 6 июня 2023 года, также содержит некоторые записи, датированные двумя следующими днями.
Июнь 6 2023:
2023-06-06
2023-06-07
2023-06-08
# shows unique dates of scraped data for all 4 quarters
listing_mod.groupby('last_scraped').size()
# substitutes dates with the latest value in each quarter for cleaner grouping
listing_mod.loc[listing_mod.last_scraped == '2022-09-09', 'last_scraped'] = '2022-09-10'
listing_mod.loc[listing_mod.last_scraped == '2022-12-06', 'last_scraped'] = '2022-12-25'
listing_mod.loc[listing_mod.last_scraped == '2022-12-07', 'last_scraped'] = '2022-12-25'
listing_mod.loc[listing_mod.last_scraped == '2023-03-07', 'last_scraped'] = '2023-03-09'
listing_mod.loc[listing_mod.last_scraped == '2023-03-08', 'last_scraped'] = '2023-03-09'
listing_mod.loc[listing_mod.last_scraped == '2023-06-06', 'last_scraped'] = '2023-06-08'
listing_mod.loc[listing_mod.last_scraped == '2023-06-07', 'last_scraped'] = '2023-06-08'
# displays 4 unique dates: one per quarter
listing_mod.groupby('last_scraped').size()
last_scraped 2022-09-10 45264 2022-12-25 39946 2023-03-09 41922 2023-06-08 43929 dtype: int64
В этом разделе исследуются 25 собственников с самой высокой прогнозируемой ежемесячной выручкой.
# identifies top 25 hosts by their projected revenue (average for all quarters)
listing_mod['projected_revenue_30d'] = listing_mod['price'] * (30 - listing_mod['availability_30'])
top_hosts_rev = listing_mod.groupby('host_id')['projected_revenue_30d'].mean().sort_values(ascending=False).head(25)
top_hosts_rev = listing_mod.loc[listing_mod['host_id'].isin(top_hosts_rev.index), \
['host_id', 'host_name', 'host_since', 'neighbourhood_cleansed', 'property_type', 'projected_revenue_30d', 'license']]
# shows how many hosts are licensed
host_lic_dist = top_hosts_rev[['host_id','license']]
host_lic_dist = host_lic_dist.fillna({'license': 'None'})
host_lic_dist = host_lic_dist.drop_duplicates()
plt.figure(figsize=(4,2))
host_lic_dist.license.value_counts().plot(kind='bar')
plt.title("Распределение Лицензий Среди Лучших Собственников")
plt.xlabel('Лицензия')
plt.ylabel('Количество')
plt.xticks(rotation=75)
plt.show()
# lists information about top 25 hosts by projected revenue(30 days)
top_hosts_rev = top_hosts_rev.groupby(['host_id', 'host_name', 'host_since', 'neighbourhood_cleansed', 'property_type']) \
.agg(average_proj_revenue=('projected_revenue_30d','mean')).sort_values('average_proj_revenue', ascending=False).reset_index()
top_hosts_rev
host_id | host_name | host_since | neighbourhood_cleansed | property_type | average_proj_revenue | |
---|---|---|---|---|---|---|
0 | 127909016 | Jordan | 2017-04-27 | Beverly Crest | Entire home | 63000.0 |
1 | 165879616 | Jaida | 2018-01-04 | Beverly Grove | Entire home | 60000.0 |
2 | 262960927 | Kay | 2019-05-20 | Westwood | Private room in condo | 60000.0 |
3 | 1779977 | Dennis | 2012-02-20 | Malibu | Entire home | 60000.0 |
4 | 344798399 | Moneyteam | 2020-04-27 | Hawthorne | Entire home | 60000.0 |
5 | 4620768 | Jess | 2013-01-07 | Westwood | Entire condo | 60000.0 |
6 | 165879616 | Jaida | 2018-01-04 | Beverly Crest | Entire home | 60000.0 |
7 | 21603394 | Kevin | 2014-09-21 | Redondo Beach | Entire rental unit | 60000.0 |
8 | 25276597 | Jonathan | 2014-12-27 | Beverly Grove | Entire serviced apartment | 60000.0 |
9 | 257369700 | Billy | 2019-04-22 | Rancho Palos Verdes | Entire home | 60000.0 |
10 | 49148904 | Donna | 2015-11-16 | Inglewood | Entire home | 60000.0 |
11 | 73617291 | Amir | 2016-05-23 | Malibu | Entire villa | 60000.0 |
12 | 302834378 | Alaa | 2019-10-17 | Culver City | Private room in rental unit | 59970.0 |
13 | 38857249 | Chantal | 2015-07-18 | West Hollywood | Entire home | 59850.0 |
14 | 155261 | Lindsay | 2010-06-29 | Windsor Square | Entire home | 57420.0 |
15 | 267453940 | Kaylen | 2019-06-09 | Malibu | Entire home | 57000.0 |
16 | 11522437 | Leslie | 2014-01-21 | Hollywood Hills West | Entire home | 57000.0 |
17 | 160766240 | Bob | 2017-11-30 | Sherman Oaks | Entire home | 56970.0 |
18 | 385575830 | Onefinestay | 2021-01-22 | Hollywood Hills | Entire rental unit | 56700.0 |
19 | 41239505 | Shah | 2015-08-12 | Venice | Entire rental unit | 56700.0 |
20 | 186993728 | Joe | 2018-04-29 | Laguna Niguel | Entire home | 53850.0 |
21 | 1410898 | Jeff | 2011-11-15 | Fairfax | Entire home | 53200.0 |
22 | 2620843 | Sandra | 2012-06-12 | Venice | Entire home | 52595.5 |
23 | 93729818 | Ori | 2016-09-05 | Encino | Entire rental unit | 51000.0 |
24 | 37594030 | John | 2015-07-05 | Bel-Air | Entire home | 50940.0 |
25 | 174309124 | Jose | 2018-02-20 | Studio City | Entire rental unit | 50850.0 |
# displays the distribution of neighbourhoods for the top hosts
loc_dist = top_hosts_rev.neighbourhood_cleansed.value_counts()
plt.figure(figsize=(5, 4))
sns.countplot(y='neighbourhood_cleansed', data=top_hosts_rev, order=loc_dist.index, orient='h',edgecolor=None)
plt.title("Распределение по Районам (Топ Собственники)")
plt.xlabel("Количество")
plt.ylabel("Районы")
plt.show()
# displays the distribution of property types for the top hosts
prop_dist = top_hosts_rev['property_type'].value_counts()
#plt.style.use('ggplot')
plt.pie(prop_dist, labels=prop_dist.index, autopct="%.1f%%")
plt.title("Распределение по Виду Собственности(Топ Собственники)")
plt.show()
Датасет содержит четыре отрезка времени, покрывающих информацию об объявлениях за период примерно с июня 2022 до 6 июня 2023
price_mean = listing_mod.groupby('last_scraped').agg({'price':'mean'})
plt.figure(figsize=(5, 3))
sns.lineplot(x='last_scraped', y='price', data=price_mean, label='Цена')
plt.title("Средняя Цена за Период Времени")
plt.xlabel("Дата")
plt.ylabel("Цена(USD)")
# rotates x-axis labels for readability
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Это показывает нам тренд, при котором собственники снижают цены ближе к концу года, чтобы увеличить спрос в этом последнем квартале 2022 года.
# identifies top 25 listings with the largest number of reviews
# shows all listings across all dates when the data was collected
review_group = review.groupby('listing_id')['listing_id'].count().sort_values(ascending=False).head(25)
listing_25 = listing_mod.loc[listing_mod.id.isin(review_group.index),['id', 'listing_url', 'name', 'host_id', \
'host_name', 'last_scraped', 'neighbourhood_cleansed', 'property_type', \
'number_of_reviews']].sort_values('number_of_reviews', ascending=False)
print(f'\nРаспределение по виду собственности:\n\n{listing_25.property_type.value_counts()}')
listing_25.head(15)
Распределение по виду собственности: property_type Entire guesthouse 21 Private room in home 14 Room in boutique hotel 13 Entire guest suite 13 Entire home 11 Entire rental unit 7 Entire cottage 4 Private room in cottage 1 Name: count, dtype: int64
id | listing_url | name | host_id | host_name | last_scraped | neighbourhood_cleansed | property_type | number_of_reviews | |
---|---|---|---|---|---|---|---|---|---|
99920 | 42409434 | https://www.airbnb.com/rooms/42409434 | The Burlington Hotel | 229716119 | Simone | 2023-03-09 | Westlake | Entire rental unit | 2084 |
77420 | 42409434 | https://www.airbnb.com/rooms/42409434 | The Burlington Hotel | 229716119 | Simone | 2022-12-25 | Westlake | Entire rental unit | 1924 |
18622 | 42409434 | https://www.airbnb.com/rooms/42409434 | The Burlington Hotel | 229716119 | Simone | 2022-09-10 | Westlake | Entire rental unit | 1702 |
38704 | 25186230 | https://www.airbnb.com/rooms/25186230 | King Kitchen Studio -Disneyland/Knott's Berry ... | 174393062 | Hotel Pepper Tree | 2022-09-10 | Anaheim | Room in boutique hotel | 1175 |
155210 | 578174478591578178 | https://www.airbnb.com/rooms/578174478591578178 | Boutique hotel in Inglewood · ★4.28 · 1 bedroo... | 253105222 | Sonder (Los Angeles) | 2023-06-08 | Inglewood | Room in boutique hotel | 1137 |
129714 | 1990543 | https://www.airbnb.com/rooms/1990543 | Guesthouse in Santa Monica · ★4.65 · 1 bedroom... | 10265095 | Janet | 2023-06-08 | Santa Monica | Entire guesthouse | 1094 |
101601 | 1990543 | https://www.airbnb.com/rooms/1990543 | Private 1906 Bungalow | 10265095 | Janet | 2023-03-09 | Santa Monica | Entire guesthouse | 1068 |
61032 | 1990543 | https://www.airbnb.com/rooms/1990543 | Private 1906 Bungalow | 10265095 | Janet | 2022-12-25 | Santa Monica | Entire guesthouse | 1048 |
40193 | 3561523 | https://www.airbnb.com/rooms/3561523 | Private Entrance Queen Bed Newport Mesa | 3346091 | Vicki | 2022-09-10 | Costa Mesa | Private room in cottage | 1043 |
153568 | 53827863 | https://www.airbnb.com/rooms/53827863 | Boutique hotel in Inglewood · ★4.34 · 1 bedroo... | 253105222 | Sonder (Los Angeles) | 2023-06-08 | Inglewood | Room in boutique hotel | 1039 |
1624 | 1990543 | https://www.airbnb.com/rooms/1990543 | Private 1906 Bungalow | 10265095 | Janet | 2022-09-10 | Santa Monica | Entire guesthouse | 1027 |
145451 | 42515360 | https://www.airbnb.com/rooms/42515360 | Boutique hotel in Los Angeles · ★4.76 · 1 bedr... | 295744253 | Hoxton | 2023-06-08 | Downtown | Room in boutique hotel | 1024 |
94201 | 42515360 | https://www.airbnb.com/rooms/42515360 | A Lush Room at The Hoxton, Downtown LA | 295744253 | Hoxton | 2023-03-09 | Downtown | Room in boutique hotel | 1020 |
131550 | 7293369 | https://www.airbnb.com/rooms/7293369 | Home in Malibu · ★4.95 · 1 bedroom · 1 bed · 1... | 37437994 | Shelly & Paul | 2023-06-08 | Malibu | Private room in home | 1019 |
77551 | 42515360 | https://www.airbnb.com/rooms/42515360 | A Lush Room at The Hoxton, Downtown LA | 295744253 | Hoxton | 2022-12-25 | Downtown | Room in boutique hotel | 1012 |
unique_coordinates = listing_mod.drop_duplicates(subset=['latitude', 'longitude'])
locations = list(zip(unique_coordinates.latitude, unique_coordinates.longitude))
init_long = -118.36
init_lat = 34.06
map_la_listings = folium.Map(location=[init_lat, init_long], zoom_start=9)
HeatMap(locations,min_opacity=0.25).add_to(map_la_listings)
FastMarkerCluster(data=locations, radius=5).add_to(map_la_listings)
map_la_listings
neig_dist = listing_mod.neighbourhood_cleansed.value_counts().head(50)
plt.figure(figsize=(12,10))
plt.title("50 Лучших Районов по Количеству Объявлений", fontsize=16)
plt.xlabel('Количество Объявлений', fontsize=12)
reversed_palette = sns.color_palette("inferno", n_colors=len(neig_dist))[::-1]
sns.countplot(y='neighbourhood_cleansed', data=listing_mod, order=neig_dist.index, palette=reversed_palette)
plt.ylabel('Районы')
plt.xlabel('Количество Объявлений')
# adds listings count beside each neighbourhood
for index, value in enumerate(neig_dist):
plt.text(value + 20, index, str(value), fontsize=8, va='center', weight='bold')
plt.show()
neighbourhoods = listing_mod.groupby('neighbourhood_cleansed', as_index=False)
top_loc = pd.DataFrame({'Neighbourhood': pd.Series(dtype='str'),
'Mean Projected Revenue(30d)':pd.Series(dtype='float'),
'Mean Price': pd.Series(dtype='float'),
'Property Type': pd.Series(dtype='str')})
# calculates most used property type for a neighbourhood and adds the row to the temporary table
for neighb, data in neighbourhoods:
projected_revenue = data['projected_revenue_30d'].mean()
price = data['price'].mean()
property_type = data['property_type'].mode()[0]
top_loc.loc[len(top_loc)] = [neighb, projected_revenue, price, property_type]
# Mean Projected Revenue(30d) value is relative to all listings in the neighbourhood and not the whole neighbourhood itself
top_loc = top_loc.sort_values('Mean Projected Revenue(30d)', ascending=False).head(25)
top_loc
Neighbourhood | Mean Projected Revenue(30d) | Mean Price | Property Type | |
---|---|---|---|---|
84 | Emerald Bay | 18650.000000 | 1450.000000 | Entire home |
6 | Aliso and Wood Regional Park | 12825.000000 | 513.000000 | Entire home |
160 | Malibu | 10636.365679 | 806.946456 | Entire home |
22 | Bel-Air | 9742.377049 | 818.715847 | Entire home |
280 | Villa Park | 9384.000000 | 408.000000 | Entire home |
212 | Rolling Hills | 8730.850000 | 323.800000 | Entire guesthouse |
263 | Unincorporated Santa Monica Mountains | 8563.676681 | 592.638054 | Entire home |
296 | Westlake Village | 8239.736842 | 410.877193 | Entire home |
177 | Newport Beach | 8201.215130 | 497.951143 | Entire home |
207 | Rancho Palos Verdes | 8108.256065 | 396.110512 | Private room in home |
193 | Palos Verdes Estates | 7711.961538 | 470.179487 | Entire home |
26 | Beverly Crest | 7545.798913 | 743.193841 | Entire home |
117 | Hollywood Hills West | 7459.039374 | 652.057042 | Entire home |
189 | Pacific Palisades | 7375.629008 | 494.201527 | Entire home |
303 | Windsor Square | 6767.688525 | 289.688525 | Entire rental unit |
162 | Manhattan Beach | 6502.803381 | 410.327402 | Entire home |
213 | Rolling Hills Estates | 6426.760870 | 479.173913 | Entire home |
222 | San Marino | 6313.316667 | 248.550000 | Private room in home |
17 | Avalon | 6307.815951 | 434.279755 | Entire condo |
128 | La Crescenta-Montrose | 6165.328671 | 246.230769 | Entire home |
137 | Laguna Beach | 5722.688705 | 436.595041 | Entire home |
29 | Beverlywood | 5672.872340 | 287.936170 | Entire rental unit |
136 | Ladera Ranch | 5642.933333 | 233.533333 | Entire home |
257 | Topanga | 5590.427403 | 335.654397 | Entire home |
60 | Cypress | 5529.857143 | 270.142857 | Entire home |
loc_rooms_price = listing_mod[listing_mod.neighbourhood_cleansed.isin(top_loc.Neighbourhood)]
loc_rooms_price = loc_rooms_price.groupby(['neighbourhood_cleansed', 'bedrooms']).price.mean().unstack()
loc_rooms_price.sort_values('neighbourhood_cleansed', ascending=False)
plt.figure(figsize=(10,8))
sns.heatmap(loc_rooms_price, annot=True, fmt=".0f")
plt.ylabel('Районы')
plt.xlabel('Количество спален')
plt.show()
Самыми дорогими микрорайонами в целом являются Bel-Air, Malibu, Beverly Crest, Pacific Palisades и Unincorporated Santa Monica Mountains. Средние цены этих микрорайонов колеблются от 400 до более 1500 долларов за ночь.
Мы также замечаем тенденцию среди лучших микрорайонов. Например, Bel-Air показывает значительный скачок цен с 2-х до 3-х спальных комнат, в среднем 879 долларов за трёхспальные комнаты в сравнении с 303 долларами за две спальни.
Newport Beach имеет наибольшую доступность среди более крупных объектов 4-8 спальных комнат среди лучших районов.
Мы видим много пересечений. Это подтверждает, что новые хозяева должны обратить внимание на эти наиболее упомянутые удобства, чтобы соответствовать высочайшим ожиданиям клиентов.
amenities = listing_mod.amenities.to_list()
amenities_top_loc = listing_mod[listing_mod.neighbourhood_cleansed.isin(top_loc.Neighbourhood)].amenities.to_list()
import matplotlib.gridspec as gridspec
# extracts individual words from amenities column
words = []
for item in amenities:
words_lst = item.strip('][').split(', ')
words.extend(words_lst)
amenities_counter = Counter(words)
# gets the top 25 most common words
top_words = amenities_counter.most_common(25)
top_words_df = pd.DataFrame(top_words)
top_words_df.rename(columns={0:'Words', 1:'Count'}, inplace=True)
# extracts individual words from amenities column
words_loc = []
for item in amenities_top_loc:
words_lst_loc = item.strip('][').split(', ')
words_loc.extend(words_lst_loc)
amenities_counter_loc = Counter(words_loc)
# gets the top 25 most common words for neighbourhoods
top_words_loc = amenities_counter_loc.most_common(25)
top_words_loc_df = pd.DataFrame(top_words_loc)
top_words_loc_df.rename(columns={0:'Words', 1:'Count'}, inplace=True)
# plots
gs = gridspec.GridSpec(1, 2, width_ratios=[1.5,2])
fig = plt.figure(figsize=(25,14),dpi=300)
ax = plt.subplot(gs[0,0])
reversed_palette = sns.color_palette("inferno", n_colors=25)[::-1]
ax = sns.barplot(x='Count', y='Words', data=top_words_df, palette=reversed_palette, alpha=0.9)
ax.set_title('Топ 25 Наиболее Упоминаемых Удобств (Все Объявления)\n', fontsize=24)
ax.set_xlabel('Количество слов', fontsize=16)
ax.set_ylabel('Слова', fontsize=16)
ax.tick_params(axis='x', labelsize=16)
ax.tick_params(axis='y', labelsize=16)
ax1=plt.subplot(gs[0,1])
ax1 = sns.barplot(x='Count', y='Words', data=top_words_loc_df, palette=reversed_palette, alpha=0.9)
ax1.set_title('Топ 25 Наиболее Упоминаемых Удобств (Лучшие Районы)\n', fontsize=24)
ax1.tick_params(axis='x', labelsize=16)
ax1.tick_params(axis='y', labelsize=16)
plt.xlabel('Количество слов', fontsize=16)
plt.ylabel('Слова', fontsize=16)
fig.tight_layout()
plt.show()
from transformers import AutoTokenizer, AutoConfig
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import warnings
warnings.filterwarnings('ignore')
#pd.set_option('display.max_colwidth', None)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# max sequence length
MAX_LEN = 512
# uses the pretrained model to get the scores for each review
def get_sentiment(review):
review = review[:MAX_LEN]
encoded_input = tokenizer(review, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# gets the highest score and fills a sentiment label
max_score = max(scores)
sentiment = 'Neutral'
if scores[0] == max_score:
sentiment = 'Negative'
elif scores[2] == max_score:
sentiment = 'Positive'
return np.array([scores[0], scores[1], scores[2]]), sentiment
# applies sentiment scores and labels to a copy of the dataframe
def apply_sentiment(df):
df_copy = df.copy()
# creates sentiment columns first
df_copy['score_neg'] = None
df_copy['score_neu'] = None
df_copy['score_pos'] = None
df_copy['sentiment'] = None
for idx, row in df.iterrows():
scores, sentiment = get_sentiment(row.comments)
df_copy.at[idx, 'score_neg'] = scores[0]
df_copy.at[idx, 'score_neu'] = scores[1]
df_copy.at[idx, 'score_pos'] = scores[2]
df_copy.at[idx, 'sentiment'] = sentiment
return df_copy
# obtains the top 25 hosts listings ids for mathcing with reviews records
listings_ids = listing_mod.loc[listing_mod['host_id'].isin(top_hosts_rev.host_id), 'id']
review_top_hosts = review[review.listing_id.isin(listings_ids)]
review_sentiment = apply_sentiment(review_top_hosts)
review_sentiment.head()
model.safetensors: 0%| | 0.00/501M [00:00<?, ?B/s]
listing_id | id | date | reviewer_id | reviewer_name | comments | score_neg | score_neu | score_pos | sentiment | |
---|---|---|---|---|---|---|---|---|---|---|
118832 | 16635769 | 196016735 | 2017-09-21 | 1808452 | Jeffrey | Amir's place is a huge house located right on ... | 0.00231 | 0.022884 | 0.974806 | Positive |
118833 | 16635769 | 324266676 | 2018-09-16 | 28777882 | Lara | This place is palatial! The Views!!!! The ow... | 0.00393 | 0.007768 | 0.988302 | Positive |
118834 | 16635769 | 652899389 | 2020-08-17 | 35660075 | Alexandra | Awesome for big groups. Host was very responsi... | 0.007756 | 0.043151 | 0.949093 | Positive |
119181 | 19573901 | 202036996 | 2017-10-09 | 43428003 | Candice | This location cannot be beat. Shah was a grea... | 0.004962 | 0.008434 | 0.986603 | Positive |
119182 | 19573901 | 204559179 | 2017-10-18 | 9117752 | Yehia | This is the best place you can stay at ever in... | 0.003481 | 0.007355 | 0.989164 | Positive |
# displays the distribution of property types for the top hosts
sent_dist = review_sentiment['sentiment'].value_counts()
plt.figure(figsize=(5,3))
plt.pie(sent_dist, labels=sent_dist.index, autopct="%.1f%%")
plt.title("Распределение Тональности Отзывов (Лучшие собственники)")
plt.show()
В успешных объявлениях мы видим слова такие как "место", "дом", "потрясающе", "любим", "красиво", "семья", "кухня", "гардеробная", "отличный собственник", "бассейн", "местоположение", "вид", "океан", "просторно", "Малибу" и так далее.
Эти неформальные слова и фразы указывают на удовлетворенность клиентов условиями проживания, видами и местоположением, выражая позитивное настроение и эмоции.
Некоторые настоящие имена хозяев также есть в облаке, указывающие на популярность определённых управляющих, а также внимание и отношение клиентов.
# fills missing values
print(review.isna().sum())
review.comments = review.comments.fillna('')
print('\n --- Отзывы после вызова fillna: \n')
print(review.isna().sum())
print('\n')
gs = gridspec.GridSpec(1, 2, width_ratios=[1,1])
fig = plt.figure(figsize=(14,10), dpi=250)
ax_r_hosts = plt.subplot(gs[0,0])
ax_r_all = plt.subplot(gs[0,1])
review_words_hosts = " ".join(rev for rev in review_top_hosts.comments)
wordcloud_hosts = WordCloud(width=700, height=400, max_words=350, min_word_length=3, ranks_only=True).generate(review_words_hosts)
ax_r_hosts.set_axis_off()
ax_r_hosts.set_title('Облако Слов (Отзывы лучших собственников)')
ax_r_hosts.imshow(wordcloud_hosts, interpolation='bilinear')
# generates a wordcloud plot
review_words = " ".join(rev for rev in review.comments)
wordcloud = WordCloud(width=700, height=400, max_words=350, min_word_length=3, ranks_only=True).generate(review_words)
ax_r_all.set_axis_off()
ax_r_all.set_title('Облако Слов (Все отзывы)')
ax_r_all.imshow(wordcloud, interpolation='bilinear')
fig.tight_layout()
plt.show()
listing_id 0 id 0 date 0 reviewer_id 0 reviewer_name 0 comments 2 dtype: int64 --- Отзывы после вызова fillna: listing_id 0 id 0 date 0 reviewer_id 0 reviewer_name 0 comments 0 dtype: int64
Цели, поставленные для этого проекта, были достигнуты, и этот раздел содержит ключевые выводы, открытия и фактические числа. Объявления AirBnb и отзывы для региона Лос-Анджелеса, Калифорния, были получены из публичного веб-ресурса Inside AirBnb, который регулярно публикует квартальные данные в виде снимков объявлений, доступных в определенной точке времени.
Оценивая лучшие районы на основе средней прогнозируемой месячной выручки, можно отметить, что прибрежные районы, такие как, Emerald Bay, Newport Beach, и Palos Verdes, занимают высокие позиции по генерации выручки.
Анализ показал, что эти прибыльные районы поддерживают баланс между спросом и доходами, предлагая цены, часто находящиеся в диапазоне от 400 до 800 долларов. Доминирование целых домов в этих районах подчеркивает фокус на групповом и семейном проживании.
В заключение, анализ раскрыл данные о наиболее высокооплачиваемых собственниках, оптимальных видов недвижимости, ценах, районах, удобствах и отношении гостей.