from IPython.display import display_markdown
# Render the project README inline. The context manager closes the file
# handle as soon as its contents have been read.
with open("README.md", encoding="utf-8") as readme:
    display_markdown(readme.read(), raw=True)
This dataset is based on the 2016-07-07 San Diego extract from Inside Airbnb. The original set includes Airbnb property listings with their characteristics and price, and it is extended here with additional variables based on each property's location.
Source
: Inside Airbnb (URL: http://insideairbnb.com)
Processing
: download of the original files and addition of new variables is documented in regression_cleaning.ipynb
regression_db.geojson
%matplotlib inline
import requests
import pandas as pd
import geopandas as gpd
#import googlemaps
from scipy.spatial.distance import cdist
# Root of the Inside Airbnb snapshot (San Diego, 2016-07-07) that every
# file below is fetched from.
base_url = ('http://data.insideairbnb.com/united-states/'
            'ca/san-diego/2016-07-07/')


def download(url, path, timeout=60):
    """Fetch `url` and write the response body to `path`.

    Raises `requests.HTTPError` on a non-2xx response, so an error page is
    never silently saved in place of the data file. `timeout` (seconds)
    keeps the request from hanging indefinitely.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    with open(path, 'wb') as fo:
        fo.write(r.content)


# (remote sub-folder, file name); each file is saved under its own name in
# the working directory.
for folder, fname in [('data', 'listings.csv.gz'),
                      ('data', 'calendar.csv.gz'),
                      ('visualisations', 'neighbourhoods.geojson')]:
    url = base_url + folder + '/' + fname
    download(url, fname)
# `pd.np` was deprecated in pandas 1.0 and later removed, so numpy is
# imported directly for the log transform.
import numpy as np


def _parse_price(raw):
    """Convert a price label to float, dropping ',' and leading/trailing '$'."""
    return float(str(raw).replace(',', '').strip('$'))


lst = pd.read_csv('listings.csv.gz')
# Numeric price and its natural logarithm.
lst['priceN'] = lst['price'].apply(_parse_price)
lst['l_price'] = np.log(lst['priceN'])
GeoDataFrame
from shapely.geometry import Point

# One point per listing, built in (x=longitude, y=latitude) order.
xys = lst[['longitude', 'latitude']].apply(
    lambda row: Point(row['longitude'], row['latitude']), axis=1)
# NOTE(review): the "+init=epsg:4326" syntax is deprecated by newer pyproj
# releases; it is left untouched here to stay compatible with the
# environment this notebook was written for.
gdb = gpd.GeoDataFrame(lst.assign(geometry=xys),
                       crs="+init=epsg:4326")
def _split_amenities(raw):
    """Split an amenities string like '{TV,Pool}' into its list of items."""
    return raw.strip('{').strip('}').split(',')


# Parse every listing once, then derive both outputs from the parsed lists.
# This works for any index, not only a 0..n-1 one, and avoids row-by-row
# `.loc` writes.
amenity_lists = gdb['amenities'].apply(_split_amenities)
# Flat list of every amenity mention, used to inspect the distinct labels.
ams = [item for pcs in amenity_lists for item in pcs]
# 1 when the listing mentions 'Pool', 0 otherwise.
gdb['pool'] = amenity_lists.apply(lambda pcs: int('Pool' in pcs))
set(ams)
{'', '"24-Hour Check-in"', '"Air Conditioning"', '"Buzzer/Wireless Intercom"', '"Cable TV"', '"Carbon Monoxide Detector"', '"Elevator in Building"', '"Family/Kid Friendly"', '"Fire Extinguisher"', '"First Aid Kit"', '"Free Parking on Premises"', '"Hair Dryer"', '"Hot Tub"', '"Indoor Fireplace"', '"Laptop Friendly Workspace"', '"Lock on Bedroom Door"', '"Other pet(s)"', '"Pets Allowed"', '"Pets live on this property"', '"Safety Card"', '"Smoke Detector"', '"Smoking Allowed"', '"Suitable for Events"', '"Wheelchair Accessible"', '"Wireless Internet"', '"translation missing: en.hosting_amenity_49"', '"translation missing: en.hosting_amenity_50"', 'Breakfast', 'Cat(s)', 'Dog(s)', 'Doorman', 'Dryer', 'Essentials', 'Gym', 'Hangers', 'Heating', 'Internet', 'Iron', 'Kitchen', 'Pool', 'Shampoo', 'TV', 'Washer'}
We can use geopy to find out the location of Balboa Park:
from geopy.geocoders import Nominatim
# Nominatim requires callers to identify themselves: current geopy releases
# refuse to build the geocoder without an explicit user agent.
geolocator = Nominatim(user_agent="airbnb_regression_cleaning")
Just type the name into the locator:
bp = geolocator.geocode("Balboa Park, San Diego, US")
bp
Location(Balboa Park, North Park, San Diego, San Diego County, California, 92102, United States of America, (32.73135675, -117.146526555704, 0.0))
b_ll = bp.longitude, bp.latitude
b_ll
(-117.146526555704, 32.73135675)
Then calculate distance to the park from each house:
# USA Contiguous Albers Equal Area (m.)
# http://epsg.io/102003
tgt_crs = ("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 "
           "+lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m +no_defs")
# Reproject the park location from the listings' CRS into the metric CRS.
b_pt = gpd.GeoSeries([Point(b_ll)], crs=gdb.crs).to_crs(tgt_crs)[0]
b_xy = (b_pt.x, b_pt.y)
# Calculate distance in Km.: planar distance in the projected CRS is in
# metres (+units=m), hence the division by 1000. `GeoSeries.distance`
# handles all listings in one call instead of one `cdist` call per row.
gdb['d2balboa'] = gdb['geometry'].to_crs(tgt_crs).distance(b_pt) / 1000
gdb.plot(column='d2balboa', scheme='quantiles', k=9,
cmap='viridis_r', s=1)
<matplotlib.axes._subplots.AxesSubplot at 0x7f2dae1f1ef0>
# The import near the top of the notebook is commented out, so bring the
# client library into scope here, where it is first used.
import googlemaps

# The API key is the first line of the key file; the context manager
# closes the handle once it has been read.
with open('../google_maps_key') as key_file:
    key = key_file.readline().strip('\n')
gmaps = googlemaps.Client(key=key)
# Google takes lat/lon instead of lon/lat
gmaps.elevation([b_ll[::-1]])
[{'elevation': 89.00752258300781, 'location': {'lat': 32.73136, 'lng': -117.14653}, 'resolution': 4.771975994110107}]
pts = gdb['geometry'].apply(lambda pt: (pt.y, pt.x))
%time ele = gmaps.elevation(pts.head().tolist())
ele
CPU times: user 4 ms, sys: 0 ns, total: 4 ms Wall time: 309 ms
[{'elevation': 95.95661163330078, 'location': {'lat': 32.58188, 'lng': -117.00194}, 'resolution': 4.771975994110107}, {'elevation': 171.3590698242188, 'location': {'lat': 32.65303, 'lng': -116.97681}, 'resolution': 4.771975994110107}, {'elevation': 177.8932495117188, 'location': {'lat': 32.66148, 'lng': -116.97181}, 'resolution': 4.771975994110107}, {'elevation': 184.3952026367188, 'location': {'lat': 32.63869, 'lng': -116.95125}, 'resolution': 4.771975994110107}, {'elevation': 185.0911254882812, 'location': {'lat': 32.63894000000001, 'lng': -116.93889}, 'resolution': 4.771975994110107}]
def extract_ele(x):
    """Return the 'elevation' entry of a single elevation result."""
    return x['elevation']


# One elevation value per result, in the order the results were returned.
eleS = pd.Series([extract_ele(rec) for rec in ele])
eleS
0 95.956612 1 171.359070 2 177.893250 3 184.395203 4 185.091125 dtype: float64
NOTE: There still seem to be some mistakes, but `neighbourhood_cleansed` works much better than `neighbourhood`.
# Neighbourhoods treated as coastal. Names are compared by exact string
# match, so they must be spelled exactly as in the column they are tested
# against ('Del Mar Heights' was previously misspelled and never matched).
coastal_neighborhoods = ['Wooded Area', 'Ocean Beach', 'Pacific Beach',
                         'La Jolla', 'Torrey Pines', 'Del Mar Heights',
                         'Mission Bay']


def coastal(neigh):
    """Return 1 if `neigh` is listed in `coastal_neighborhoods`, else 0."""
    return 1 if neigh in coastal_neighborhoods else 0
# Flag each listing: 1 when its `neighbourhood_cleansed` value is in the
# coastal list, 0 otherwise.
gdb['coastal_neig'] = gdb['neighbourhood_cleansed'].apply(coastal)
# Plot the flag as a categorical variable with a legend, to check the
# result visually.
gdb.plot(column='coastal_neig', s=1,
categorical=True, legend=True);
We keep only observations in neighborhoods with more than 25 Airbnb listings, so that neighborhood fixed effects (FE) make sense.
# Count listings per neighbourhood, mark those with more than 25, and
# attach that flag to every listing through its neighbourhood name.
nei_col = 'neighbourhood_cleansed'
listings_per_nei = gdb.groupby(nei_col).size()
lrg_nei = listings_per_nei.gt(25)
gdb['lrg_nei'] = gdb[nei_col].map(lrg_nei)
# Columns of `gdb` carried over into the final table (before dummy
# variables are joined on).
xs = ['accommodates', 'bathrooms', 'bedrooms',
'beds', 'neighbourhood_cleansed', 'pool',
'd2balboa', 'coastal_neig', 'lrg_nei',
'priceN', 'l_price',
'geometry', 'id']
# One indicator column per room type, prefixed with 'rt' and with spaces in
# the column names replaced by underscores.
rt = pd.get_dummies(gdb['room_type'], prefix='rt').rename(columns=lambda x: x.replace(' ', '_'))
def simplify(p):
    """Keep the four main property types; collapse anything else to 'Other'."""
    bigs = ('House', 'Apartment', 'Condominium', 'Townhouse')
    return p if p in bigs else 'Other'
# Collapse `property_type` into the groups returned by `simplify`.
gdb['property_group'] = gdb['property_type'].apply(simplify)
# One indicator column per property group, prefixed with 'pg'.
pg = pd.get_dummies(gdb['property_group'], prefix='pg')
gdb[['lrg_nei']].info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6608 entries, 0 to 6607 Data columns (total 1 columns): lrg_nei 6608 non-null bool dtypes: bool(1) memory usage: 6.5 KB
! rm 'regression_db.geojson'
# Assemble the regression table: selected columns plus both sets of dummy
# variables, restricted to listings flagged as being in a large
# neighbourhood, with the helper flag and incomplete rows dropped.
in_lrg_nei = gdb['lrg_nei'].eq(True)
with_dummies = gdb[xs].join(pg).join(rt)
final = with_dummies.rename(columns={'priceN': 'price'})
final = final.loc[in_lrg_nei, :]
final = final.drop(['lrg_nei'], axis=1).dropna()
<class 'geopandas.geodataframe.GeoDataFrame'> Int64Index: 6110 entries, 19 to 6607 Data columns (total 20 columns): accommodates 6110 non-null int64 bathrooms 6110 non-null float64 bedrooms 6110 non-null float64 beds 6110 non-null float64 neighbourhood_cleansed 6110 non-null object pool 6110 non-null int64 d2balboa 6110 non-null float64 coastal_neig 6110 non-null int64 price 6110 non-null float64 l_price 6110 non-null float64 geometry 6110 non-null object id 6110 non-null int64 pg_Apartment 6110 non-null uint8 pg_Condominium 6110 non-null uint8 pg_House 6110 non-null uint8 pg_Other 6110 non-null uint8 pg_Townhouse 6110 non-null uint8 rt_Entire_home/apt 6110 non-null uint8 rt_Private_room 6110 non-null uint8 rt_Shared_room 6110 non-null uint8 dtypes: float64(6), int64(4), object(2), uint8(8) memory usage: 668.3+ KB
# Shorter, consistently spelled column names for the published table.
final = final.rename(columns={'neighbourhood_cleansed': 'neighborhood',
                              'coastal_neig': 'coastal',
                              'l_price': 'log_price'})
import os

# Remove any previous output before writing. Plain Python replaces the
# `!rm` shell escape, which only works under IPython and prints an error
# when the file is already absent.
# NOTE(review): presumably the removal is needed because the GeoJSON driver
# may refuse to overwrite an existing file; confirm.
out_path = 'regression_db.geojson'
if os.path.exists(out_path):
    os.remove(out_path)
final.to_file(out_path, driver='GeoJSON')
final.info()
rm: cannot remove 'airbnb/regression_db.geojson': No such file or directory <class 'geopandas.geodataframe.GeoDataFrame'> RangeIndex: 6110 entries, 0 to 6109 Data columns (total 20 columns): accommodates 6110 non-null int64 bathrooms 6110 non-null float64 bedrooms 6110 non-null float64 beds 6110 non-null float64 neighborhood 6110 non-null object pool 6110 non-null int64 d2balboa 6110 non-null float64 coastal 6110 non-null int64 price 6110 non-null float64 log_price 6110 non-null float64 id 6110 non-null int64 pg_Apartment 6110 non-null int64 pg_Condominium 6110 non-null int64 pg_House 6110 non-null int64 pg_Other 6110 non-null int64 pg_Townhouse 6110 non-null int64 rt_Entire_home/apt 6110 non-null int64 rt_Private_room 6110 non-null int64 rt_Shared_room 6110 non-null int64 geometry 6110 non-null object dtypes: float64(6), int64(12), object(2) memory usage: 954.8+ KB