Single Variable Visualizations¶

(Graphs)¶

In this notebook we will cover the following topics, more thoroughly than before:

How to make and interpret a bar graph or bar chart
How to make and interpret a histogram
The fundamental differences between bar charts and histograms
How to make and interpret boxplots
How to make side-by-side or overlapping graphs to compare two groups, when possible

In [ ]:

from datascience import *
import numpy as np
import warnings
# Suppress every Python warning so that warning text does not clutter cell output.
warnings.filterwarnings("ignore")

%matplotlib inline
import matplotlib.pyplot as plots
# Apply the 'fivethirtyeight' style sheet to every figure drawn below.
plots.style.use('fivethirtyeight')
# Always draw an edge around patches (e.g. bars) so adjacent bars stay distinguishable.
plots.rcParams["patch.force_edgecolor"] = True

Categorical Distribution¶

In [ ]:

#help(Table.hist)

Bar Charts¶

In [ ]:

# Read the movie data from a CSV file in the working directory and display the table.
top_movies = Table.read_table('top_movies_2017.csv')
top_movies

In [ ]:

# Add a 'Millions' column: the 'Gross' column scaled down by one million
# and rounded to three decimal places.
top_movies = top_movies.with_column(
    'Millions',
    np.round(top_movies.column('Gross') / 1_000_000, 3),
)
# Horizontal bar chart of 'Millions' by 'Title' for the first ten rows.
(
    top_movies
    .take(np.arange(10))
    .barh('Title', 'Millions')
)

When the bars of a bar graph are arranged from longest to shortest, that is called a Pareto chart.

In [ ]:

# Pareto chart: the same first ten rows, with bars ordered from largest
# to smallest 'Millions' value.
(
    top_movies
    .take(np.arange(10))
    .sort("Millions", descending=True)
    .barh('Title', 'Millions')
)

In [ ]:

# Keep only the 'Studio' column (still one row per movie) and display it.
studios = top_movies.select('Studio')
studios

In [ ]:

# Count how many rows there are for each distinct studio; the counts land in a 'count' column.
studio_distribution = studios.group('Studio')

In [ ]:

studio_distribution

In [ ]:

# Sanity check: the per-studio counts should add up to the number of movies.
studio_distribution.column('count').sum()

In [ ]:

# Bar chart of the number of movies per studio, in the table's current row order.
studio_distribution.barh('Studio')

In [ ]:

# Pareto chart: studios ordered from most to fewest movies.
(
    studio_distribution
    .sort('count', descending=True)
    .barh('Studio')
)

In [ ]:

# Read the baby data set from a CSV file in the working directory and display it.
baby = Table.read_table("baby.csv")
baby

In [ ]:

# Count the rows for each distinct value of 'Maternal Smoker'.
baby.group("Maternal Smoker")

In [ ]:

# Bar chart of the 'Maternal Smoker' counts, then customise the title,
# axis labels and tick labels through matplotlib directly.
baby.group("Maternal Smoker").barh("Maternal Smoker")
plots.title("Counts of Mothers in \nBaby Dataset", color="blue")
plots.xlabel("Count", color="blue")
plots.ylabel("Smoker", color="blue", size=25)
# Replace the two y tick labels with "Yes"/"No" at positions 0.5 and 1.5.
plots.yticks(make_array(.5, 1.5), make_array("Yes", "No"), color="gold");

In [ ]:

help(plots)

Numerical Distribution¶

In [ ]:

# Age of each movie relative to the hard-coded reference year 2022.
ages = 2022 - top_movies.column('Year')
# Store the ages alongside the other columns as 'Age'.
top_movies = top_movies.with_column('Age', ages)

In [ ]:

top_movies

Binning¶

In [ ]:

# Smallest and largest age: useful for choosing bin edges that cover all the data.
ages.min(), ages.max()

In [ ]:

# Bin edges every 10 years from 0 through 110 inclusive.
my_bins = np.arange(0, 111, 10)

In [ ]:

# Count how many movies fall into each bin defined by my_bins.
# The result has a 'bin' column and an 'Age count' column.
binned_data = top_movies.bin('Age', bins = my_bins)
binned_data

In [ ]:

# Sanity check: total number of movies counted across all bins.
binned_data.column('Age count').sum()

In [ ]:

# Wider bins: edges at 0, 25, 50, 75, 100 and 125.
top_movies.bin('Age', bins = np.arange(0, 126, 25))

In [ ]:

# Edges at 0, 25 and 50 only, so the bins stop at age 50.
# NOTE(review): ages above 50 are presumably left out of the counts; compare the total with the cells above.
top_movies.bin('Age', bins = np.arange(0, 60, 25))

In [ ]:

# Show the rows whose 'Age' is exactly 52.
top_movies.where('Age', 52)

Histograms¶

In [ ]:

my_bins

In [ ]:

binned_data

In [ ]:

# Let's make our first histogram!
# No bins are given, so the library picks the bin edges itself.
top_movies.hist('Age')

In [ ]:

# Let's try picking our own bins instead.
# np.arange(0, 100, 10) gives edges 0, 10, ..., 90, so the last bin ends at 90.
# unit='Year' names the unit of the values for the axis labels.
top_movies.hist('Age', bins = np.arange(0, 100, 10),  unit = 'Year')

In [ ]:

# Edges 0, 10, ..., 100. density=False puts counts on the vertical axis
# instead of a density scale.
top_movies.hist('Age', bins = np.arange(0, 110, 10), unit = 'Year', density=False)

In [ ]:

25/2

In [ ]:

# Let's try not specifying any bins!
# Only the unit is supplied; the bin edges are chosen by the library.
top_movies.hist('Age', unit='Year')

In [ ]:

3*9.61+5

In [ ]:

# Add a column containing what percent of movies are in each bin.
# The denominator is the actual number of movies in the table rather than a
# hard-coded 200, so the percentages stay correct if the data set changes size.
binned_data = binned_data.with_column(
    'Percent', 100 * binned_data.column('Age count') / top_movies.num_rows)

In [ ]:

binned_data

Height¶

Question: What is the height of the [42, 66] bin?¶

In [ ]:

# Step 1: Calculate % of movies in the [40, 65) bin
percent = binned_data.where('bin', 42).column('Percent').item(0)
percent

In [ ]:

# Step 2: Calculate the width of the 42-66 bin
width = 66 - 42
width

In [ ]:

# Step 3: Area of rectangle = height * width
#         --> height = percent / width
# The bar's area is the percent of movies in the bin, so the height comes out
# in percent per unit of Age.
height = percent / width
height

What are the heights of the rest of the bins?¶

In [ ]:

# Get the bin lefts
# Keep every row of binned_data except the last one.
# NOTE(review): the final row presumably only marks the right end of the last bin; confirm against the table above.
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))

In [ ]:

# Get the bin widths
# np.diff returns the difference between each pair of consecutive values in the
# 'bin' column, i.e. one width per row kept in bin_lefts.
bin_widths = np.diff(binned_data.column('bin'))
bin_lefts = bin_lefts.with_column('Width', bin_widths)

In [ ]:

# Get the bin heights
# Same rule as for a single bin: height = percent / width, applied elementwise.
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)

In [ ]:

bin_lefts

In [ ]:

# Histogram drawn with the same bin edges used to build binned_data, so its
# bar heights can be compared with the 'Height' column computed above.
top_movies.hist('Age', bins = my_bins, unit = 'Year')

Please note that the example above is a bad example. On a histogram, bins should be the same size. Now, let's look at another example showing the impact of bins.

The impact of bins¶

In [ ]:

# Read the welcome-survey responses from a CSV file in the working directory and display them.
survey = Table.read_table('welcome_survey_v1.csv')
survey

In [ ]:

# Half-hour bin edges: 4.0, 4.5, ..., 11.5 (the stop value 12 is excluded).
sleep_bins = np.arange(4, 12, 0.5)
sleep_bins

In [ ]:

# Histogram of 'Hours of sleep' using the half-hour bins.
survey.hist('Hours of sleep', bins=sleep_bins)

In [ ]:

# Same data with one-hour bins (edges 4, 5, ..., 11), to compare against the
# half-hour version above.
sleep_bins = np.arange(4,12, 1)
survey.hist('Hours of sleep', bins=sleep_bins)

Boxplots¶

Box plots are a graphical display of the descriptive statistics that we know as the Five Number Summary.

The Five Number Summary consists of the following values:

The minimum
The $25^{th}$ percentile, $Q_1$
The $50^{th}$ percentile, the median
The $75^{th}$ percentile, $Q_3$
The maximum

In one of the next couple of notebooks, we will discuss how to compute these.

In [ ]:

# Read the skyscraper data and split out one table per city.
skyscrapers = Table.read_table('skyscrapers.csv')
ny = skyscrapers.where('city', "New York City")
la = skyscrapers.where('city', 'Los Angeles')
# Display the New York City rows.
ny

In [ ]:

# Boxplot of the column at index 3 of the NYC table.
# NOTE(review): column 3 is presumably the building height, going by the plot titles below.
plots.boxplot(ny.column(3), widths = 0.4);
#plots.xlabel("NYC");

In [ ]:

# Same boxplot with a wider box, a tick label and a title.
plots.boxplot(ny.column(3), widths=.5 )
# A single box is drawn at x position 1; label that tick "NYC".
plots.xticks(make_array(1), make_array("NYC"))
plots.title("Heights of New York City \nSkyscrapers");

Can we also display the mean on a boxplot?¶

Yes, if you set the value of showmeans equal to True.

In [ ]:

# showmeans=True adds a marker for the mean to the boxplot.
plots.boxplot(ny.column(3), widths=.5 , showmeans=True)
plots.xticks(make_array(1), make_array("NYC"));

In [ ]:

# whis=10 lets the whiskers reach up to 10 times the interquartile range beyond
# the box, so far fewer points are drawn separately as outliers.
plots.boxplot(ny.column(3), widths=.5 , showmeans=True, whis=10)
plots.xticks(make_array(1), make_array("NYC"));

In [ ]:

Comparing two groups numerically¶

We can use side-by-side boxplots to visually compare two groups.

In [ ]:

# x positions of the two boxes and the label shown under each.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")

plots.figure(figsize=(6, 6))
# Draw one box per city at that city's tick position.
for tick, city in zip(ticks, (ny, la)):
    plots.boxplot(city.column(3), widths=.5, positions=make_array(tick))
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");

In [ ]:

# x positions of the two boxes and the label shown under each.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")

plots.figure(figsize=(6, 6))
# Same comparison as before, this time marking each group's mean.
for tick, city in zip(ticks, (ny, la)):
    plots.boxplot(
        city.column(3),
        widths=.5,
        positions=make_array(tick),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");

In [ ]:

# Birth weights split by whether the mother smoked.
smokers = baby.where("Maternal Smoker", True).column("Birth Weight")
nonsmokers = baby.where("Maternal Smoker", False).column("Birth Weight")

# x positions of the two boxes and the label shown under each.
ticks = make_array(2.5, 3.5)
labels = make_array("Smokers", "Non-Smokers")

plots.figure(figsize=(6, 6))
# One box per group, with the mean marked.
for tick, weights in zip(ticks, (smokers, nonsmokers)):
    plots.boxplot(
        weights,
        widths=.5,
        positions=make_array(tick),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Comparing Birth Weights \nof Babies");

Sometimes, overlapping histograms are also possible.

In [ ]:

# Read the Galton height data from a CSV file in the working directory.
galton = Table.read_table('galton.csv')

In [ ]:

galton

In [ ]:

# Distribution of the 'midparentHeight' column.
galton.hist('midparentHeight')

In [ ]:

# Distribution of the 'childHeight' column.
galton.hist('childHeight')

Which part of this histogram represents the children with heights above 70 inches?

In [ ]:

# left_end=70 highlights the part of the histogram from 70 upward.
galton.hist('childHeight', left_end=70)

In [ ]:

# Passing two column labels draws both histograms in the same figure for comparison.
galton.hist('midparentHeight', 'childHeight')

In [ ]:

# Four columns in one figure; with this many overlapping histograms the plot
# becomes hard to read, which motivates the boxplot comparison that follows.
galton.hist("mother", "father", "childHeight", "midparentHeight")

In [ ]:

# x positions of the four boxes and the label shown under each.
ticks = make_array(2.5, 3.5, 4.5, 5.5)
labels = make_array("Midparent", "Child", "Mother", "Father")

plots.figure(figsize=(6, 6))
# Column names listed in the same order as the tick labels above.
height_columns = ("midparentHeight", "childHeight", "mother", "father")
for tick, column_name in zip(ticks, height_columns):
    plots.boxplot(
        galton.column(column_name),
        widths=.5,
        positions=make_array(tick),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Comparing Heights");

In [ ]: