First, you have to find your access tokens to use the Foursquare API with reasonable rate limits.
If you have an access token, you can use that, otherwise register an app an use the client id and secret for the following steps.
import foursquare
import pandas as pd
#ACCESS_TOKEN = ""
#client = foursquare.Foursquare(access_token=ACCESS_TOKEN)
CLIENT_ID = ""
CLIENT_SECRET = ""
client = foursquare.Foursquare(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
# bbox = [11.109872,47.815652,12.068588,48.397136] # bounding box for Munich
# bbox = [13.088400,52.338120,13.761340,52.675499] # bounding box for Berlin
bbox = [5.866240,47.270210,15.042050,55.058140] # bounding box for Germany
new_crawl = [] # list of locations to be crawled
done = [] # list of crawled locations
links = [] # list of tuples that represent links between locations
venues = pd.DataFrame() # dictionary of locations id => meta-data on location
Set seed values for Marienplatz, Airport and Central Station.
Depth is the number of recursive crawling processes.
to_crawl = ["4ade0ccef964a520246921e3", "4cbd1bfaf50e224b160503fc", "4b0674e2f964a520f4eb22e3"]
depth = 8
for i in range(depth):
new_crawl = []
print "Step " + str(i) + ": " + str(len(venues)) + " locations and " + str(len(links)) + " links. " + str(len(to_crawl)) + " venues to go."
for v in to_crawl:
if v not in venues:
res = client.venues(v)
venues = venues.append(pd.DataFrame({"name":res["venue"]["name"],"users":res["venue"]["stats"]["usersCount"],
"checkins":res["venue"]["stats"]["checkinsCount"], "lat":res["venue"]["location"]["lat"],
"lng":res["venue"]["location"]["lng"]}, index=[v]))
next_venues = client.venues.nextvenues(v)
for nv in next_venues['nextVenues']['items']:
if ((nv["location"]["lat"] > bbox[1]) & (nv["location"]["lat"] < bbox[3]) &
(nv["location"]["lng"] > bbox[0]) & (nv["location"]["lng"] < bbox[2])):
if nv["id"] not in venues:
venues = venues.append(pd.DataFrame({"name":nv["name"],"users":nv["stats"]["usersCount"],
"checkins":nv["stats"]["checkinsCount"], "lat":nv["location"]["lat"],
"lng":nv["location"]["lng"]}, index=[nv["id"]]))
if (nv["id"] not in done) & (nv["id"] not in to_crawl) & (nv["id"] not in new_crawl):
new_crawl.append(nv["id"])
links.append((v, nv["id"]))
done.append(v)
to_crawl = new_crawl
Step 0: 0 locations and 0 links. 3 venues to go. Step 1: 12 locations and 9 links. 7 venues to go. Step 2: 53 locations and 43 links. 17 venues to go. Step 3: 153 locations and 126 links. 17 venues to go. Step 4: 235 locations and 191 links. 12 venues to go. Step 5: 291 locations and 235 links. 13 venues to go. Step 6: 348 locations and 279 links. 22 venues to go. Step 7: 461 locations and 370 links. 28 venues to go.
We're importing networkx to build the network out of our crawled venues (= nodes) and links between them.
venues = venues.reset_index().drop_duplicates(cols='index',take_last=True).set_index('index')
venues.head()
checkins | lat | lng | name | users | |
---|---|---|---|---|---|
index | |||||
4cbd1bfaf50e224b160503fc | 224872 | 48.352599 | 11.780992 | München Flughafen "Franz Josef Strauß" (MUC) | 83604 |
4b0674e2f964a520f4eb22e3 | 88327 | 48.140547 | 11.555772 | München Hauptbahnhof | 18833 |
4b56f6eef964a520ec2028e3 | 1845 | 48.137558 | 11.579466 | Augustiner am Platzl | 1471 |
4ade0d1df964a520dc6a21e3 | 2053 | 48.136930 | 11.574156 | Sporthaus Schuster | 1137 |
4bbc6329afe1b7136d4d304b | 2702 | 48.135282 | 11.576350 | Biergarten am Viktualienmarkt | 1534 |
5 rows × 5 columns
labels = venues["name"].to_dict()
import networkx as nx
G = nx.DiGraph()
G.add_nodes_from(venues.index)
for f,t in links:
G.add_edge(f, t)
nx.info(G)
'Name: \nType: DiGraph\nNumber of nodes: 155\nNumber of edges: 478\nAverage in degree: 3.0839\nAverage out degree: 3.0839'
Calculate some useful metrics and visualize the most important venues
pagerank = nx.pagerank(G,alpha=0.9)
betweenness = nx.betweenness_centrality(G)
venues['pagerank'] = [pagerank[n] for n in venues.index]
venues['betweenness'] = [betweenness[n] for n in venues.index]
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8, 6), dpi=150)
ax = fig.add_subplot(111)
venues.sort('users', inplace=True)
venues.set_index('name')[-20:].users.plot(kind='barh')
ax.set_ylabel('Location')
ax.set_xlabel('Users')
ax.set_title('Top 20 Locations by Users')
plt.show()
fig = plt.figure(figsize=(8, 6), dpi=150)
ax = fig.add_subplot(111)
venues.sort('checkins', inplace=True)
venues.set_index('name')[-20:].checkins.plot(kind='barh')
ax.set_ylabel('Location')
ax.set_xlabel('Checkins')
ax.set_title('Top 20 Locations by Checkins')
plt.show()
fig = plt.figure(figsize=(8, 6), dpi=150)
ax = fig.add_subplot(111)
venues.sort('pagerank', inplace=True)
venues.set_index('name')[-20:].pagerank.plot(kind='barh')
ax.set_ylabel('Location')
ax.set_xlabel('Pagerank')
ax.set_title('Top 20 Locations by Pagerank')
plt.show()
fig = plt.figure(figsize=(8, 6), dpi=150)
ax = fig.add_subplot(111)
venues.sort('betweenness', inplace=True)
venues.set_index('name')[-20:].betweenness.plot(kind='barh')
ax.set_ylabel('Location')
ax.set_xlabel('Pagerank')
ax.set_title('Top 20 Locations by Betweenness Centrality')
plt.show()
Visualize the network
fig = plt.figure(figsize=(16, 9), dpi=150)
graph_pos=nx.spring_layout(G)
nodesize = [10000*n for n in pagerank.values()]
nx.draw_networkx_nodes(G,graph_pos,node_size=nodesize, alpha=0.5, node_color='blue')
nx.draw_networkx_edges(G,graph_pos,width=1, alpha=0.3,edge_color='blue')
nx.draw_networkx_labels(G, graph_pos, labels=labels, font_size=10, font_family='Arial')
plt.axis('off')
plt.show()