#!/usr/bin/env python
# coding: utf-8

# # Stacked bar charts
# We've only been looking at the largest cities by population, but we have a
# whole bunch of other data we can use.

import pandas


def nth_largest(cities, n):
    """Return the n-th largest value of a Series, or None if it is too short.

    Parameters:
        cities: pandas Series of comparable values (here, city populations).
        n: 1-based rank to fetch (1 is the maximum).

    Returns:
        The value at rank ``n``, or ``None`` when ``cities`` has fewer than
        ``n`` entries.
    """
    top = cities.nlargest(n)
    if len(top) < n:
        return None
    # Select by position: the Series is indexed by labels (city names in this
    # script), so ``top[n - 1]`` would be a label lookup, not "the n-th row".
    return top.iloc[n - 1]


def max2(cities):
    """Return the second-largest value of *cities* (see ``nth_largest``)."""
    return nth_largest(cities, 2)


def max3(cities):
    """Return the third-largest value of *cities* (see ``nth_largest``)."""
    return nth_largest(cities, 3)


def region_population(cities):
    """Return a Series of total ``Population`` per ``Region`` for *cities*.

    Parameters:
        cities: DataFrame with at least ``Population`` and ``Region`` columns.
    """
    return cities.groupby("Region")["Population"].sum()


if __name__ == "__main__":
    # Imported here rather than at the top of the file so the helpers above
    # can be imported without pulling in matplotlib.
    import matplotlib.pyplot as plt

    # In[68]:

    df = pandas.read_csv('US_cities.csv', index_col="AccentCity")
    df[:5]  # notebook cell output: preview of the first rows

    # We have over 4000 cities and towns in the database, definitely too many
    # for a bar chart. But we could look at the population data at the state
    # level. First, let's try just a simple groupby in pandas.

    # In[86]:

    states_pop = df[["Population", "Region"]].groupby(by=["Region"]).sum()
    states_pop[:5]  # notebook cell output

    # In[70]:

    states_pop.plot.bar()
    plt.show()

    # Let's make a stacked bar chart that shows us how much the three largest
    # cities contribute to population compared to the rest of the state.
    # First, we'll need to pick out the three top cities of each state. To do
    # that, we'll be using the pandas groupby method.

    # In[71]:

    # Boolean mask: True for every row whose population equals its region's
    # maximum.
    largest_cities_idx = (
        df.groupby(by=["Region"], sort=False)["Population"].transform("max")
        == df["Population"]
    )
    df[largest_cities_idx][:5]  # notebook cell output

    # In[72]:

    # Same idea for the second and third largest city of each region. Regions
    # with fewer than two (or three) cities get no match.
    second_largest_idx = (
        df.groupby(by=["Region"], sort=False)["Population"].transform(max2)
        == df["Population"]
    )
    third_largest_idx = (
        df.groupby(by=["Region"], sort=False)["Population"].transform(max3)
        == df["Population"]
    )
    df[second_largest_idx][:5]  # notebook cell output

    # Now we just need to sum the rest of the cities that we haven't counted
    # yet.

    # In[88]:

    smaller_cities = df[
        ~largest_cities_idx & ~second_largest_idx & ~third_largest_idx
    ].copy()

    # One column per bar segment, aligned on Region. Regions missing from a
    # column (e.g. states with fewer than three cities) contribute 0.
    stacked_pop = pandas.DataFrame({
        "Largest city": region_population(df[largest_cities_idx]),
        "Second largest city": region_population(df[second_largest_idx]),
        "Third largest city": region_population(df[third_largest_idx]),
        "Rest of state": region_population(smaller_cities),
    }).fillna(0)
    stacked_pop.plot.bar(stacked=True)
    plt.show()

    # In[89]:

    df  # notebook cell output