#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import warnings

from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px  # interactive plots
import chart_studio

# Chart Studio credentials (an online account is needed to obtain an API key).
# The environment variables CHART_STUDIO_USERNAME / CHART_STUDIO_API_KEY take
# precedence over the values below.
# NOTE(review): the fallback API key is committed in source control; it should
# be rotated and the literal removed once the environment variables are set up.
username = os.environ.get('CHART_STUDIO_USERNAME', 'janice.gbedemah')
api_key = os.environ.get('CHART_STUDIO_API_KEY', '7fuMphA2ExbPtpp9j3Bv')
chart_studio.tools.set_credentials_file(username=username, api_key=api_key)

# Imported after the credentials call, in the same order as the original notebook.
import chart_studio.plotly as py
import chart_studio.tools as tls


# In[2]:

LISTING_URL = ('https://apartmentlove.com/apartments-for-rent/'
               '{province}/canada/apartment/page{page}')
LISTING_COLUMNS = ['address', 'bedroom', 'bathroom', 'price']
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled server hangs the script


def _parse_listing(card):
    """Extract the raw text fields of one ``property-details`` element.

    Args:
        card: the BeautifulSoup tag wrapping a single listing.

    Returns:
        A dict with one entry per name in ``LISTING_COLUMNS``. A field whose
        element is missing (or whose text is empty) is ``None``, so every
        listing always contributes exactly one value per column.
    """
    def text_of(locate):
        # bs4 returns None for a missing element, so the chained attribute
        # access inside ``locate`` raises AttributeError in that case.
        try:
            node = locate()
        except AttributeError:
            return None
        if node is None:
            return None
        # ``next_sibling`` yields a NavigableString (a ``str`` subclass);
        # converting it to a plain ``str`` avoids storing bs4 objects.
        raw = str(node) if isinstance(node, str) else node.get_text()
        return raw.replace("\n", " ") or None

    return {
        'address': text_of(
            lambda: card.find('div', class_='property-details-heading').h3.a),
        'bedroom': text_of(lambda: card.find('ul', class_='d-flex').li),
        # The bathroom count is the first node following the bath icon.
        'bathroom': text_of(
            lambda: card.find('i', class_='bath-icon').next_sibling),
        'price': text_of(lambda: card.find('span', class_='price-text')),
    }


def scrape_province(province, last_page):
    """Scrape the listing result pages ``1..last_page`` of one province.

    Args:
        province: URL slug of the province, e.g. ``'alberta'``.
        last_page: number of the last result page to request (inclusive).

    Returns:
        A dict mapping each name in ``LISTING_COLUMNS`` to a list of raw
        strings (or ``None``); all lists have the same length.

    Raises:
        requests.RequestException: on connection failures or timeouts.
    """
    data = {column: [] for column in LISTING_COLUMNS}
    for page_number in range(1, last_page + 1):
        url = LISTING_URL.format(province=province, page=page_number)
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page contains no listings; report it instead of
            # silently parsing it.
            warnings.warn('Skipping {}: HTTP {}'.format(url, response.status_code))
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        # "property-details" is the parent element of every listing.
        for card in soup.find_all('div', class_='property-details'):
            listing = _parse_listing(card)
            for column in LISTING_COLUMNS:
                data[column].append(listing[column])
    return data


# In[3]:

## SCRAPING LISTINGS IN ALBERTA (result pages 1 to 53)
alberta_data = scrape_province('alberta', 53)
alberta = pd.DataFrame(alberta_data)
alberta.head(5)


# In[5]:

## SCRAPING LISTINGS IN ONTARIO (result pages 1 to 8)
ontario_data = scrape_province('ontario', 8)
ontario = pd.DataFrame(ontario_data)
ontario.head(5)


# In[6]:

ontario.shape


# In[7]:

## SCRAPING LISTINGS IN BRITISH COLUMBIA (result pages 1 to 6)
BC_data = scrape_province('british-columbia', 6)
BC = pd.DataFrame(BC_data)
BC.head(5)


# In[8]:

BC.shape


# In[9]:

# ignore_index gives every row a unique label instead of repeating 0..n
# once per province.
all_listings = pd.concat([alberta, ontario, BC], axis=0, ignore_index=True)
all_listings


# ## DATA INSPECTION

# In[10]:

all_listings.shape


# In[11]:

all_listings.isna().sum()


# In[12]:

all_listings.describe()
# ## DATA CLEANING/WRANGLING

# In[14]:

# Split "street, city, province" from the right so that commas inside the
# street part stay in the address. reindex guarantees three columns even when
# no address contains two commas; astype(object) keeps the .str accessor
# usable on a column that is entirely missing.
address_parts = (
    all_listings['address']
    .str.rsplit(',', n=2, expand=True)
    .reindex(columns=range(3))
    .astype(object)
    .apply(lambda part: part.str.strip())  # drop the space left after each comma
)
all_listings[['address', 'city', 'province']] = address_parts
all_listings.head(5)


# In[15]:

## DATA MANIPULATION
# Strip the unit words so only the count remains; a studio counts as 0 bedrooms.
all_listings['bedroom'] = (
    all_listings['bedroom']
    .str.replace('Beds', '', regex=False)
    .str.replace('Bed', '', regex=False)
    .str.replace('Studio', '0', regex=False)
)
all_listings['bathroom'] = (
    all_listings['bathroom']
    .str.replace('Baths', '', regex=False)
    .str.replace('Bath', '', regex=False)
)
all_listings.head(5)


# In[16]:

## CHANGE THE DATA TYPES
def _to_number(text):
    """Convert a Series of numeric strings to numbers.

    Surrounding whitespace is removed first. Values that still cannot be
    parsed become NaN instead of aborting the whole conversion; they remain
    visible through ``isna().sum()``.
    """
    return pd.to_numeric(text.str.strip(), errors='coerce')


all_listings['bedroom'] = _to_number(all_listings['bedroom'])
all_listings['bathroom'] = _to_number(all_listings['bathroom'])

# Literal (non-regex) replacements: "$" needs no escaping this way.
price_text = (
    all_listings['price']
    .str.replace('CAD / month', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
all_listings['price'] = _to_number(price_text).astype(float)


# In[17]:

all_listings.bathroom.unique()


# In[18]:

all_listings.bedroom.unique()


# In[19]:

all_listings.dtypes


# In[20]:

## SAVING DATA TO A CSV FILE
all_listings.to_csv('apartment_listings.csv')


# In[69]:

## Analysis is performed for only 0-3 bedroom apartments
# .copy() makes the subset an independent frame rather than a view of all_listings.
apartments = all_listings[all_listings['bedroom'].isin([0, 1, 2, 3])].copy()


# In[70]:

apartments


# In[71]:

## Price distribution for the dataset
plt.figure(figsize=(8, 5))
plt.hist(apartments['price'], bins=range(500, 5000, 200), color='purple')
plt.xlabel('Rent Prices/month')
plt.ylabel('Count of listings')
plt.title('Price Distribution')
plt.show()


# In[72]:

## Histogram of prices for each province
apartments['price'].hist(by=apartments['province'], color='green',
                         bins=range(500, 4000, 200), figsize=(10, 8))
plt.show()


# In[73]:

## Boxplot for each province
apartments.boxplot(column='price', by='province', color="green",
                   figsize=(8, 8), showmeans=True)
plt.show()
## The outliers in the boxplot can be attributed to the natural variations in
## the provinces. That is, the cities in each province have their own unique
## characteristics which may cause the disparity in prices.


# In[76]:

## A pivot table for the average rent prices for 0-3 bedroom apartments in each province
pivot1 = pd.pivot_table(apartments, index=["province"], columns='bedroom',
                        values='price', aggfunc='mean')
pivot1


# In[77]:

table1 = apartments.groupby(['province', 'bedroom'], as_index=False)['price'].mean()
# color_discrete_map keys must equal the values of the colour column, so the
# bedroom counts are rendered as the strings "0.0", "1.0", ... used below.
table1['bedroom'] = table1['bedroom'].astype(float).astype(str)
# table1 holds one row per (province, bedroom); the histogram's default
# aggregation of y therefore plots each group's mean price unchanged.
fig = px.histogram(
    data_frame=table1,
    x='province',
    y='price',
    color='bedroom',
    barmode="group",
    text_auto=True,
    color_discrete_map={
        "0.0": "gold",
        "1.0": "#cd7f32",  # bronze; "bronze" itself is not a CSS colour name
        "2.0": "blue",
        "3.0": "goldenrod",
    },
)
fig.update_layout(
    title='AVERAGE RENT PRICES FOR 0-3 BEDROOM APARTMENTS',
    xaxis_title='PROVINCE',
    yaxis_title='AVERAGE RENT PRICE',
    legend_title='BEDROOM SIZE',
    paper_bgcolor="rgba(0,0,0,0)",  ## Makes the area around the plot transparent
    plot_bgcolor="rgba(0,0,0,0)",  ## Makes the plotting area transparent
    font_size=13,
)
fig.show()


# In[51]:

## This saves the plot to your Chart Studio account, where it can be retrieved online
py.plot(fig, filename="housing affordability study", auto_open=True)