#!/usr/bin/env python
# coding: utf-8

# # Single Variable Visualizations
#
#
# ## (Graphs) ##
#
# In this notebook we will cover more thoroughly than before:
#
# - How to make and interpret a bar graph or bar chart
# - How to make and interpret a histogram
# - The fundamental differences between bar charts and histograms
# - How to make and interpret boxplots
# - When possible how to make side-by-side or overlapping graphs to compare two groups
#
#

# In[ ]:

# Notebook setup. The warning filter is installed before matplotlib is
# imported, so the statement order below is kept deliberately.
from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True


# ## Categorical Distribution ##

# In[ ]:

#help(Table.hist)


# ## Bar Charts ##

# In[ ]:

top_movies = Table.read_table('top_movies_2017.csv')
top_movies


# In[ ]:

# Express the 'Gross' column in millions, rounded to three decimals, and
# chart the first ten rows of the table.
gross_in_millions = np.round(top_movies.column('Gross') / 1_000_000, 3)
top_movies = top_movies.with_column('Millions', gross_in_millions)
first_ten = top_movies.take(np.arange(10))
first_ten.barh('Title', 'Millions')


# When the bars of a bar graph are arranged from longest to shortest, that is called a *Pareto* chart.
# In[ ]:

# Same ten rows as before, ordered by 'Millions' from largest to smallest.
ten_rows = top_movies.take(np.arange(10))
ten_rows.sort("Millions", descending=True).barh('Title', 'Millions')


# In[ ]:

studios = top_movies.select('Studio')
studios


# In[ ]:

studio_distribution = studios.group('Studio')


# In[ ]:

studio_distribution


# In[ ]:

sum(studio_distribution.column('count'))


# In[ ]:

studio_distribution.barh('Studio')


# In[ ]:

#Pareto
studio_distribution.sort('count', descending=True).barh('Studio')


# In[ ]:

baby = Table.read_table("baby.csv")
baby


# In[ ]:

baby.group("Maternal Smoker")


# In[ ]:

# Bar chart of the group counts, then restyle the axes through pyplot.
baby.group("Maternal Smoker").barh("Maternal Smoker")
plots.xlabel("Count", color="blue")
plots.title("Counts of Mothers in \nBaby Dataset", color="blue")
plots.yticks(make_array(.5, 1.5), make_array("Yes", "No"), color="gold")
plots.ylabel("Smoker", color="blue", size=25);


# In[ ]:

help(plots)


# ## Numerical Distribution ##

# In[ ]:

# Age is measured relative to the year 2022.
ages = 2022 - top_movies.column('Year')
top_movies = top_movies.with_column('Age', ages)


# In[ ]:

top_movies


# ## Binning ##

# In[ ]:

min(ages), max(ages)


# In[ ]:

# Bin edges 0, 10, 20, ..., 110.
my_bins = make_array(*range(0, 111, 10))


# In[ ]:

binned_data = top_movies.bin('Age', bins=my_bins)
binned_data


# In[ ]:

sum(binned_data.column('Age count'))


# In[ ]:

top_movies.bin('Age', bins=np.arange(0, 126, 25))


# In[ ]:

top_movies.bin('Age', bins=np.arange(0, 60, 25))


# In[ ]:

top_movies.where('Age', 52)


# ## Histograms ##

# In[ ]:

my_bins


# In[ ]:

binned_data


# In[ ]:

# Let's make our first histogram!
top_movies.hist('Age')


# In[ ]:

# Let's try picking our own bins instead.
top_movies.hist('Age', bins=np.arange(0, 100, 10), unit='Year')


# In[ ]:

top_movies.hist('Age', bins=np.arange(0, 110, 10), unit='Year', density=False)


# In[ ]:

25/2


# In[ ]:

# Let's try not specifying any bins!
top_movies.hist('Age', unit='Year')


# In[ ]:

3*9.61+5


# In[ ]:

# Add a column containing what percent of movies are in each bin.
# The total is taken from the table's row count instead of a hard-coded
# 200, so the percentages remain correct if the dataset changes size.
binned_data = binned_data.with_column(
    'Percent', 100 * binned_data.column('Age count') / top_movies.num_rows)


# In[ ]:

binned_data


# ## Height ##
#
# ### Question: What is the height of the [40, 50) bin?

# In[ ]:

# Step 1: Calculate % of movies in the [40, 50) bin.
# The value given to where() has to be one of the left edges stored in the
# 'bin' column (my_bins holds multiples of 10). A value such as 42 matches
# no row, and .item(0) on the empty result fails.
percent = binned_data.where('bin', 40).column('Percent').item(0)
percent


# In[ ]:

# Step 2: Calculate the width of the 40-50 bin
width = 50 - 40
width


# In[ ]:

# Step 3: Area of rectangle = height * width
#         --> height = percent / width
height = percent / width
height


# ### What are the heights of the rest of the bins?

# In[ ]:

# Get the bin lefts: every row except the last one.
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))


# In[ ]:

# Get the bin widths: differences between consecutive values in 'bin'.
bin_widths = np.diff(binned_data.column('bin'))
bin_lefts = bin_lefts.with_column('Width', bin_widths)


# In[ ]:

# Get the bin heights: percent divided by width, bin by bin.
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)


# In[ ]:

bin_lefts


# In[ ]:

top_movies.hist('Age', bins=my_bins, unit='Year')


# Please note that the example above is a bad example. On a histogram, bins should be the **same** size. Now, let's look at another example showing the impact of bins.
#
# ### The impact of bins ###

# In[ ]:

survey = Table.read_table('welcome_survey_v1.csv')
survey


# In[ ]:

# Half-hour wide bins from 4 up to (but not including) 12.
sleep_bins = np.arange(4, 12, 0.5)
sleep_bins


# In[ ]:

survey.hist('Hours of sleep', bins=sleep_bins)


# In[ ]:

# Same range, now with one-hour wide bins.
sleep_bins = np.arange(4, 12, 1)
survey.hist('Hours of sleep', bins=sleep_bins)


# ## Boxplots
#
# Box plots are a graphical display of the descriptive statistics that we know as the Five Number Summary.
#
# Five Number Summary is as follows
#
# - The minimum
# - The $25^{th}$ percentile, $Q_1$
# - The $50^{th}$ percentile, the median
# - The $75^{th}$ percentile, $Q_3$
# - The maximum
#
# In one of the next couple of notebooks, we will discuss how to compute these.

# In[ ]:

skyscrapers = Table.read_table('skyscrapers.csv')
ny = skyscrapers.where('city', "New York City")
la = skyscrapers.where('city', 'Los Angeles')
ny


# In[ ]:

# The column at index 3 is the one plotted in every NYC boxplot below.
ny_values = ny.column(3)
plots.boxplot(ny_values, widths=0.4);
#plots.xlabel("NYC");


# In[ ]:

plots.boxplot(ny_values, widths=.5)
plots.xticks(make_array(1), make_array("NYC"))
plots.title("Heights of New York City \nSkyscrapers");


# ### Can we also display the mean on a boxplot? ###
#
# Yes, if you set the value of **showmeans** equal to True.

# In[ ]:

plots.boxplot(ny_values, widths=.5, showmeans=True)
plots.xticks(make_array(1), make_array("NYC"));


# In[ ]:

# Same plot again, this time passing whis=10.
plots.boxplot(ny_values, widths=.5, showmeans=True, whis=10)
plots.xticks(make_array(1), make_array("NYC"));


# In[ ]:


# ## Comparing two groups numerically ##
#
# We can use side-by-side boxplots to visually compare two groups.
# In[ ]:

# One boxplot per city, each drawn at its own x position from `ticks`.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")
plots.figure(figsize=(6, 6))
for slot, city in enumerate((ny, la)):
    plots.boxplot(city.column(3), widths=.5,
                  positions=make_array(ticks.item(slot)))
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");


# In[ ]:

# Same comparison with the mean marker switched on.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")
plots.figure(figsize=(6, 6))
for slot, city in enumerate((ny, la)):
    plots.boxplot(city.column(3), widths=.5,
                  positions=make_array(ticks.item(slot)), showmeans=True)
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");


# In[ ]:

# Split the 'Birth Weight' column by the 'Maternal Smoker' flag and
# compare the two groups side by side.
smokers = baby.where("Maternal Smoker", True).column("Birth Weight")
nonsmokers = baby.where("Maternal Smoker", False).column("Birth Weight")

ticks = make_array(2.5, 3.5)
labels = make_array("Smokers", "Non-Smokers")
plots.figure(figsize=(6, 6))
for slot, weights in enumerate((smokers, nonsmokers)):
    plots.boxplot(weights, widths=.5,
                  positions=make_array(ticks.item(slot)), showmeans=True)
plots.xticks(ticks, labels)
plots.title("Comparing Birth Weights \nof Babies");


# Sometimes, overlapping histograms are also possible.

# In[ ]:

galton = Table.read_table('galton.csv')


# In[ ]:

galton


# In[ ]:

galton.hist('midparentHeight')


# In[ ]:

galton.hist('childHeight')


# What part of this histogram is the children with heights above 70 inches?
#

# In[ ]:

galton.hist('childHeight', left_end=70)


# In[ ]:

galton.hist('midparentHeight', 'childHeight')


# In[ ]:

galton.hist("mother", "father", "childHeight", "midparentHeight")


# In[ ]:

# Four boxplots on one figure, one per column, in the same left-to-right
# order as `labels`.
ticks = make_array(2.5, 3.5, 4.5, 5.5)
labels = make_array("Midparent", "Child", "Mother", "Father")
plots.figure(figsize=(6, 6))
height_columns = ("midparentHeight", "childHeight", "mother", "father")
for slot, column_name in enumerate(height_columns):
    plots.boxplot(galton.column(column_name), widths=.5,
                  positions=make_array(ticks.item(slot)), showmeans=True)
plots.xticks(ticks, labels)
plots.title("Comparing Heights");


# In[ ]: