#!/usr/bin/env python
# coding: utf-8

# # Analyzing the districts of Madrid to open a cafeteria

# ## Description

# Madrid is the capital of Spain and its largest city. Located in the center of the Iberian Peninsula, it is the seat of the national government and the courts, and the official residence of the King and Queen of Spain.
# Economically, Madrid is home to the headquarters of many national and international companies. Culturally, it offers world-famous museums such as the Prado Museum and the Reina Sofia Museum. Madrid's neighborhoods are full of history and distinctive character.
#
# Madrid stands out for the countless bars, restaurants and cafes in its streets: every walk through the city reveals new places, and the list of spots to visit keeps growing.
# This project analyzes the characteristics of the districts of Madrid, combining district-level COVID data with the city's open datasets, to determine the best candidate locations for a new cafeteria-style restaurant.

# ## Data description
#
# The data is collected from official Madrid sources, all of them with real, current data:
# * COVID dataset distributed by districts: https://datos.comunidad.madrid/catalogo/dataset/covid19_tia_muni_y_distritos/resource/877fa8f5-cd6c-4e44-9df5-0fb60944a841
# * Neighborhood and district information: https://datos.gob.es/en/catalogo/a13002908-covid-19-tia-por-municipios-y-distritos-de-madrid1
# * Prominent Madrid data: https://datos.madrid.es/portal/site/egob/menuitem.ca8ea3bd9f53b2811ff64a46a8a409a0/?vgnextoid=6db862c549810510VgnVCM1000008a4a900aRCRD&vgnextchannel=6db862c549810510VgnVCM1000008a4a900aRCRD&vgnextfmt=default
# * Districts in figures: https://www.madrid.es/portales/munimadrid/es/Inicio/El-Ayuntamiento/Estadistica/Distritos-en-cifras/Distritos-en-cifras-Informacion-de-Distritos-/?vgnextfmt=default&vgnextoid=74b33ece5284c310VgnVCM1000000b205a0aRCRD&vgnextchannel=27002d05cb71b310VgnVCM1000000b205a0aRCRD#

# ## Venue data

# Venue data is extracted from the Foursquare API, which covers a large number of venues across the different districts.

# **Import libraries**

# In[434]:

import random
from IPython.display import Image
from IPython.core.display import HTML
import geocoder
from geopy.geocoders import Nominatim
import folium
from folium import plugins
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from bs4 import BeautifulSoup
import requests
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# In[4]:

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# ### Data analysis and wrangling

# The CSV file is taken from the links given above.

# In[290]:

madrid_data = pd.read_csv('Barrios.csv', sep=";", encoding='latin-1')
madrid_data.head()

# In[291]:

madrid_data.drop(['OBJECTID_1', 'Shape_Leng', 'Shape_Area'], axis=1, inplace=True)
madrid_data.head()

# In[292]:

madrid_data.drop(['COD_DIS', 'COD_DIS_TX', 'BARRIO_MAY', 'COD_DISBAR', 'COD_BAR', 'NUM_BAR', 'BARRIO_MT', 'COD_DISB'], axis=1, inplace=True)
madrid_data.head()
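# Before geocoding, a quick structural check (a sketch, not part of the original analysis): confirm the remaining columns and look for missing values.

# In[ ]:

# overview of the remaining columns and their dtypes
madrid_data.info()
# count of missing values per column
madrid_data.isnull().sum()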
# Madrid latitude and longitude:

# In[293]:

city = 'Madrid, MAD'
geolocator = Nominatim(user_agent="madrid")
location = geolocator.geocode(city)
latitude = location.latitude
longitude = location.longitude
print("Madrid latitude: ", latitude, "\n", "Madrid longitude: ", longitude)

# In[294]:

# another way to use Nominatim: passing coordinates to reverse geocoding returns the place name
geolocator = Nominatim(user_agent="url")
location = geolocator.reverse("40.4948384 , -3.5740806206811313")
print(location.address)

# **Madrid map with districts and neighborhoods:**

# In[295]:

# create a map of Madrid using the latitude and longitude values
madrid_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to the map
for lat, lng, district, neighborhood in zip(madrid_data['DISTRICT_LATITUDE'], madrid_data['DISTRICT_LONGITUDE'], madrid_data['DISTRICT'], madrid_data['NEIGHBORHOOD']):
    label = '{}, {}'.format(neighborhood, district)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(madrid_map)

madrid_map

# **Madrid COVID data**

# In[15]:

madrid_covid = pd.read_csv('COVID_madrid.csv', sep=";", encoding='latin-1')
madrid_covid.head()

# In[16]:

madrid_covid.drop(['tasa_incidencia_acumulada_activos_ultimos_14dias', 'tasa_incidencia_acumulada_ultimos_14dias', 'codigo_geometria', 'casos_confirmados_activos_ultimos_14dias'], axis=1, inplace=True)

# In[17]:

madrid_covid

# Unnecessary rows are dropped, leaving only the first 21 rows, which correspond to the 21 districts of Madrid and carry the most recent case reports.

# In[18]:

madrid_covid = madrid_covid.iloc[0:21]

# In[19]:

madrid_covid.shape
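# A quick consistency check (a sketch): the slice above should leave exactly one row per district, i.e. 21 unique district names.

# In[ ]:

# one row per district is expected after keeping the first 21 rows
assert madrid_covid['municipio_distrito'].nunique() == 21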
# "Madrid-" is removed from the municipio_distrito column, since we only need the name of the district.

# In[20]:

madrid_covid['municipio_distrito'] = madrid_covid['municipio_distrito'].replace({'Madrid-': ''}, regex=True)

# Column names:
# * municipio_distrito: district name
# * fecha_informe: date of the report
# * casos_confirmados_ultimos_14dias: confirmed cases in the last 14 days
# * casos_confirmados_totales: total confirmed cases for each district
# * poblacion_distrito: district population
# * tasa_incidencia_acumulada_total: total cumulative incidence rate

# In[21]:

madrid_covid

# In[22]:

madrid_covid.set_index('municipio_distrito')['casos_confirmados_totales'].plot(kind='bar')

# In[487]:

mean = madrid_covid["casos_confirmados_totales"].mean()
median = madrid_covid["casos_confirmados_totales"].median()
mode = madrid_covid["casos_confirmados_totales"].mode()
print("Mean: {}, Median: {}, Mode: {}".format(mean, median, mode))

# In[23]:

# create a map of Madrid using the latitude and longitude values
district_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to the map
for lat, lng, district, confirmed in zip(madrid_covid['latitude'], madrid_covid['longitude'], madrid_covid['municipio_distrito'], madrid_covid['casos_confirmados_totales']):
    label = '{}, {} confirmed cases'.format(district, confirmed)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=10,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(district_map)

district_map

# In[24]:

madrid_covid['municipio_distrito']

# In[25]:

madrid_covid.head()

# Function that extracts the **category of the venue**:

# In[26]:

def categories_type(row):
    categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# The Foursquare calls below need credentials and query parameters, which were presumably defined in a cell that is not shown. The values here are placeholders and assumptions, not real credentials.

# In[ ]:

CLIENT_ID = 'YOUR_CLIENT_ID'          # placeholder: your Foursquare client id
CLIENT_SECRET = 'YOUR_CLIENT_SECRET'  # placeholder: your Foursquare client secret
VERSION = '20210101'                  # assumed API version date
radius = 500                          # assumed search radius in meters
LIMIT = 100                           # assumed maximum number of venues per request

# In[27]:

foursquare_venues = pd.DataFrame()
mdc = []

# In[28]:

for lat, lng, distr, conf in zip(madrid_covid['latitude'], madrid_covid['longitude'], madrid_covid['municipio_distrito'], madrid_covid['casos_confirmados_totales']):
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
    results = requests.get(url).json()
    venues = pd.json_normalize(results['response']['groups'][0]['items'])
    filtered_columns = ['venue.id', 'venue.location.lat', 'venue.location.lng', 'venue.name', 'venue.categories']
    venues_df = venues[filtered_columns]
    venues_df['venue.categories'] = venues_df.apply(categories_type, axis=1)
    venues_df['district'] = distr
    venues_df['confirmed'] = conf
    mdc.append(venues_df)

# Venues from the Foursquare API, with each district and its confirmed COVID cases:

# In[29]:

foursquare_venues = pd.concat(mdc)
foursquare_venues.head()

# In[30]:

foursquare_venues.shape

# In[31]:

print('{} venues were returned by Foursquare.'.format(foursquare_venues.shape[0]))

# In[32]:

foursquare_venues['venue.categories'].value_counts()

# In[467]:

# cafeteria venues
venues_cafeteria = foursquare_venues[foursquare_venues['venue.categories'] == 'Cafeteria']
venues_cafeteria
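# A quick summary (a sketch) of where the cafeteria venues found above are concentrated, alongside each district's confirmed cases:

# In[ ]:

# cafeteria venue counts per district, most first, with confirmed cases kept in the key
venues_cafeteria.groupby(['district', 'confirmed']).size().sort_values(ascending=False)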
# Let's start by exploring just the first district in our dataframe using the Foursquare API. The columns are renamed first, so the rest of the notebook can refer to them consistently.

# In[488]:

madrid_data = madrid_data.rename(columns={'DISTRICT': 'District', 'NEIGHBORHOOD': 'Neighborhood', 'DISTRICT_LATITUDE': 'District_Latitude', 'DISTRICT_LONGITUDE': 'District_Longitude'})
madrid_data

# In[385]:

district_name = madrid_data.loc[0, 'District']
district_lat = madrid_data.loc[0, 'District_Latitude']
district_long = madrid_data.loc[0, 'District_Longitude']
print("The district is {} and its coordinates are {} latitude and {} longitude".format(district_name, district_lat, district_long))

# In[386]:

# Foursquare API request for the first district
LIMIT = 100
radius = 1000
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, VERSION, district_lat, district_long, radius, LIMIT)
results = requests.get(url).json()
url

# This function extracts the categories of venues:

# In[388]:

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Cleaning the JSON returned above and structuring it into a pandas dataframe:

# In[389]:

venues = results['response']['groups'][0]['items']

nearby_venues = pd.json_normalize(venues)  # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

# In[390]:

print("{} venues were returned for {} by Foursquare".format(len(nearby_venues), district_name))

# Getting nearby venues for all districts by creating the function getNearbyVenues. This function extracts the venue data for the districts of Madrid.

# In[397]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District',
                             'Latitude',
                             'Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']

    return nearby_venues
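# Note that the list comprehension in getNearbyVenues assumes every venue carries at least one category and would raise an IndexError otherwise. A defensive variant (a sketch, with a hypothetical helper name) returns None for uncategorized venues so they can be filtered out:

# In[ ]:

def safe_category(v):
    # return the first category name, or None when the venue has no categories
    cats = v['venue'].get('categories', [])
    return cats[0]['name'] if cats else None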
# In[398]:

# we run the above function on each district and create a new dataframe called madrid_venues
madrid_venues = getNearbyVenues(names=madrid_data['District'],
                                latitudes=madrid_data['District_Latitude'],
                                longitudes=madrid_data['District_Longitude']
                                )

# In[399]:

print(madrid_venues.shape)
madrid_venues.head()

# Counting venues grouped by district:

# In[400]:

madrid_venues.groupby('District').count()

# In[401]:

print("There are {} unique categories".format(len(madrid_venues['Venue Category'].unique())))

# In[402]:

# one-hot encoding
madrid_onehot = pd.get_dummies(madrid_venues[['Venue Category']], prefix="", prefix_sep="")

# add the district column back to the dataframe
madrid_onehot['District'] = madrid_venues['District']

# move the district column to the first position
fixed_columns = [madrid_onehot.columns[-1]] + list(madrid_onehot.columns[:-1])
madrid_onehot = madrid_onehot[fixed_columns]

madrid_onehot.head(5)

# In[403]:

madrid_grouped = madrid_onehot.groupby('District').mean().reset_index()
madrid_grouped

# Displaying the top 5 venues of each district:

# In[404]:

num_top_venues = 5

for n in madrid_grouped['District']:
    print("----"+n+"----")
    temp = madrid_grouped[madrid_grouped['District'] == n].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

# Putting the previous into a pandas dataframe:

# In[405]:

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

# Let's create the new dataframe and display the top 10 venues for each district.

# In[406]:

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to the number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['District'] = madrid_grouped['District']

for ind in np.arange(madrid_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(madrid_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

# ### Cluster Districts
#
# Run k-means to cluster the districts.
# To determine how many clusters to use, we rely on the silhouette score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html

# In[407]:

def plot(x, y):
    fig = plt.figure(figsize=(12, 6))
    plt.plot(x, y, 'o-')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Scores')
    plt.title('Checking Optimum Number of Clusters')
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

# In[408]:

maxk = 15
scores = []
kval = []
for k in range(2, maxk+1):
    cl_df = madrid_grouped.drop('District', axis=1)
    labels = KMeans(n_clusters=k, init="k-means++", random_state=0).fit_predict(cl_df)  # any random_state works
    score = silhouette_score(cl_df, labels, metric='euclidean', random_state=0)
    kval.append(k)
    scores.append(score)

# In[409]:

print(scores)
print(kval)
plot(kval, scores)

# According to the plot, we can use 3 or 4 clusters, since they give the highest silhouette scores. Also note that the score decreases as the number of clusters increases.
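# Picking the candidate programmatically (a sketch): the k with the highest silhouette score.

# In[ ]:

# index of the maximum score maps back to its k value
best_k = kval[int(np.argmax(scores))]
print("Highest silhouette score at k =", best_k)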
# In[410]:

# set the number of clusters
kclusters = 4

madrid_grouped_clustering = madrid_grouped.drop('District', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(madrid_grouped_clustering)

# check the cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

# In[411]:

# add the clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

madrid_merged = madrid_data

# merge neighborhoods_venues_sorted with madrid_data to add latitude/longitude for each district
madrid_merged = madrid_merged.join(neighborhoods_venues_sorted.set_index('District'), on='District')

madrid_merged.head()  # check the last columns!

# Finally, let's visualize the resulting clusters.

# In[414]:

# create the map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set a color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(madrid_merged['District_Latitude'], madrid_merged['District_Longitude'], madrid_merged['District'], madrid_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# ### Cluster 1

# In[421]:

madrid_merged.loc[madrid_merged['Cluster Labels'] == 0, madrid_merged.columns[[0] + [1] + list(range(5, madrid_merged.shape[1]))]]

# ### Cluster 2

# In[422]:

madrid_merged.loc[madrid_merged['Cluster Labels'] == 1, madrid_merged.columns[[0] + [1] + list(range(5, madrid_merged.shape[1]))]]

# ### Cluster 3

# In[424]:

madrid_merged.loc[madrid_merged['Cluster Labels'] == 2, madrid_merged.columns[[0] + [1] + list(range(5, madrid_merged.shape[1]))]]

# ### Cluster 4

# In[425]:

madrid_merged.loc[madrid_merged['Cluster Labels'] == 3, madrid_merged.columns[[0] + [1] + list(range(5, madrid_merged.shape[1]))]]

# ## Results and discussion

# Segmenting into clusters lets us distinguish the districts of Madrid by their Foursquare venue profiles. The silhouette score pointed to 3 or 4 clusters as the best choices; this exercise uses 4.
#
# Analyzing the clusters, cluster 1 has cafeteria and related categories in fourth, sixth, seventh, ninth and tenth place among its most common venues; it is also the largest cluster, containing the most districts and neighborhoods.
# This suggests Madrid is a city with plenty of cafeteria-type places, partly because of its strong tourism, especially in the central and most touristic part of the city. Districts such as Chamartin, Barajas, Hortaleza and Puente de Vallecas have a considerable number of cafeteria venues.
#
# A good option for someone interested in starting a cafeteria-type business would be the districts mentioned above, due to their touristic nature; for example, Moratalaz, in the second cluster, has cafeteria as its fifth most common venue category.
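# A quick check of the claim above (a sketch): join each venue to its district's cluster label and count cafeteria-related venues per cluster. The category pattern is an assumption covering cafeteria, coffee shop and breakfast spot venues.

# In[ ]:

# one (District, Cluster Labels) pair per district, then attach to each venue
venue_clusters = madrid_venues.merge(
    madrid_merged[['District', 'Cluster Labels']].drop_duplicates(),
    on='District')
# assumed pattern for cafeteria-related categories
cafe_mask = venue_clusters['Venue Category'].str.contains(
    'Caf|Coffee|Breakfast', case=False, na=False)
venue_clusters[cafe_mask].groupby('Cluster Labels').size()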
# **This is cluster number 1, which has the most cafeteria venues:**

# In[469]:

cafeteria_districts = madrid_merged.loc[madrid_merged['Cluster Labels'] == 0, madrid_merged.columns[[0, 1, 2, 3] + list(range(5, madrid_merged.shape[1]))]]
cafeteria_districts.head()

# In[470]:

cafe_locations = folium.Map(location=[latitude, longitude], zoom_start=10)

tooltip = "Click me!"

for lat, lng, location, neighborhood in zip(cafeteria_districts['District_Latitude'], cafeteria_districts['District_Longitude'], cafeteria_districts['District'], cafeteria_districts['Neighborhood']):
    label = '{}, {}'.format(neighborhood, location)
    folium.Marker([lat, lng],
                  popup='{} has geographical coordinates ({:.4f}, {:.4f})'.format(label, lat, lng),
                  icon=folium.Icon(color='lightred')).add_to(cafe_locations)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='yellow',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(cafe_locations)

cafe_locations

# ## Conclusion

# This exercise shows on the map where coffee consumption concentrates in the most touristic area of Madrid, but some of those same areas have many confirmed COVID cases. For example, Puente de Vallecas, with more than 30000 confirmed cases, looks attractive for its cafeteria venues but is well above the average number of infections. Moratalaz can be interesting since it does not have as many infections as other districts, and Barajas has the lowest number of infections while ranking fourth in cafeteria venues.
#
# Villa de Vallecas and Hortaleza have Breakfast Spot as their most common venue, which can appeal to a stakeholder since it is closely related to cafeterias, but at the same time they are districts with high numbers of confirmed cases.
# Therefore cluster number 1 is the recommendation for a new coffee shop or cafeteria, due to its touristic nature and the number of venues related to groceries, restaurants and breakfast that it contains, without neglecting the COVID situation.
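# A supporting check for the conclusion (a sketch): districts whose total confirmed cases fall below the district mean computed earlier.

# In[ ]:

# districts below the mean of casos_confirmados_totales, fewest cases first
below_avg = madrid_covid[madrid_covid['casos_confirmados_totales'] < mean]
below_avg[['municipio_distrito', 'casos_confirmados_totales']].sort_values('casos_confirmados_totales')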