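In this notebook we will cover the following more thoroughly than before: bar charts (including Pareto charts), histograms and the choice of bins, box plots (including side-by-side box plots), and overlaid histograms.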
from datascience import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
plots.rcParams["patch.force_edgecolor"] = True
#help(Table.hist)
# Load the movie data from CSV and display the table.
top_movies = Table.read_table('top_movies_2017.csv')
top_movies
# Add a 'Millions' column: 'Gross' divided by one million, rounded to 3 decimals.
gross_in_millions = np.round(top_movies.column('Gross') / 1_000_000, 3)
top_movies = top_movies.with_column('Millions', gross_in_millions)
# Horizontal bar chart of 'Millions' by 'Title' for the first ten rows.
first_ten_movies = top_movies.take(np.arange(10))
first_ten_movies.barh('Title', 'Millions')
When the bars of a bar graph are arranged from longest to shortest, that is called a Pareto chart.
# Pareto chart of the first ten rows: sort by 'Millions', largest first,
# before drawing the bars.
ten_by_gross = top_movies.take(np.arange(10)).sort("Millions", descending=True)
ten_by_gross.barh('Title', 'Millions')

# Keep only the 'Studio' column, then count the rows for each studio.
studios = top_movies.select('Studio')
studios
studio_distribution = studios.group('Studio')
studio_distribution
# Total of the per-studio counts.
sum(studio_distribution.column('count'))
# Bar chart of the counts in the order group() returned them.
studio_distribution.barh('Studio')
# Pareto version: the same chart with studios sorted by count, largest first.
studios_by_count = studio_distribution.sort('count', descending=True)
studios_by_count.barh('Studio')
# Load the baby data from CSV and display the table.
baby = Table.read_table("baby.csv")
baby
# Count the rows for each value of "Maternal Smoker", then chart the counts.
smoker_counts = baby.group("Maternal Smoker")
smoker_counts
smoker_counts.barh("Maternal Smoker")
# Customize the chart through matplotlib: axis labels, title, and the
# text shown at the two y-axis tick positions.
plots.xlabel("Count", color="blue")
plots.title("Counts of Mothers in \nBaby Dataset", color="blue")
tick_positions = make_array(.5, 1.5)
tick_labels = make_array("Yes", "No")
plots.yticks(tick_positions, tick_labels, color="gold")
plots.ylabel("Smoker", color="blue", size=25);
# Print the built-in help text for the matplotlib.pyplot module.
help(plots)
# Age of each movie: a fixed reference year minus the 'Year' column.
reference_year = 2022
ages = reference_year - top_movies.column('Year')
top_movies = top_movies.with_column('Age', ages)
top_movies
# Smallest and largest age, to help choose bins that cover the data.
(min(ages), max(ages))
# Bin edges every 10 years from 0 to 110.
my_bins = make_array(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110)
binned_data = top_movies.bin('Age', bins=my_bins)
binned_data
# Total of the per-bin counts.
sum(binned_data.column('Age count'))
# The same binning with 25-year-wide bins: first with edges 0..125,
# then with edges 0, 25, 50 only.
wide_bins = np.arange(0, 126, 25)
top_movies.bin('Age', bins=wide_bins)
short_range_bins = np.arange(0, 60, 25)
top_movies.bin('Age', bins=short_range_bins)
# Rows whose 'Age' is exactly 52.
top_movies.where('Age', 52)
# Redisplay the 10-year edges and the table binned with them.
my_bins
binned_data
# First histogram: let hist() choose the bins for 'Age'.
top_movies.hist('Age')
# Next, pick our own bins: 10-year-wide edges from 0 up to (not including) 100.
edges_to_90 = np.arange(0, 100, 10)
top_movies.hist('Age', bins=edges_to_90, unit='Year')
# Same idea with edges up to 100 and density=False
# (presumably switches the vertical axis to counts; confirm in the hist docs).
edges_to_100 = np.arange(0, 110, 10)
top_movies.hist('Age', bins=edges_to_100, unit='Year', density=False)
# Scratch arithmetic.
25 / 2
# Finally, leave the bins unspecified but label the unit.
top_movies.hist('Age', unit='Year')
# Scratch arithmetic.
3 * 9.61 + 5
# Add a column containing what percent of movies are in each bin.
# Divide by the table's actual row count instead of a hard-coded 200 so the
# percentages stay correct if the data file changes.
binned_data = binned_data.with_column(
    'Percent', 100 * binned_data.column('Age count') / top_movies.num_rows)
binned_data

# Worked example: compute the height of one histogram bar by hand.
# bin_left must be one of the edges in the 'bin' column. Looking up a value
# that is not an edge (such as 42 with edges 0, 10, ..., 110) matches no row,
# and .item(0) then raises IndexError.
bin_edges = binned_data.column('bin')
bin_left = 40
# The right end of the bin is the next edge after bin_left.
bin_right = bin_edges[bin_edges > bin_left].min().item()

# Step 1: Calculate % of movies in the [bin_left, bin_right) bin
percent = binned_data.where('bin', bin_left).column('Percent').item(0)
percent
# Step 2: Calculate the width of that bin
width = bin_right - bin_left
width
# Step 3: Area of rectangle = height * width
# --> height = percent / width
height = percent / width
height

# Now repeat the calculation for every bin at once.
# Get the bin lefts: every row except the last, which holds only the final
# right-hand edge.
bin_lefts = binned_data.take(np.arange(binned_data.num_rows - 1))
# Get the bin widths: differences between consecutive edges.
bin_widths = np.diff(binned_data.column('bin'))
bin_lefts = bin_lefts.with_column('Width', bin_widths)
# Get the bin heights: percent per unit of width.
bin_heights = bin_lefts.column('Percent') / bin_widths
bin_lefts = bin_lefts.with_column('Height', bin_heights)
bin_lefts
# Draw the histogram with the same bins to compare against the computed heights.
top_movies.hist('Age', bins=my_bins, unit='Year')
Please note that, as a rule, the bins of a histogram should all be the same width; an example with bins of unequal widths should be treated as a bad example. Now, let's look at another example showing the impact of the choice of bins.
# Load the survey data from CSV and display the table.
survey = Table.read_table('welcome_survey_v1.csv')
survey
# Histogram of 'Hours of sleep' with bin edges every half hour, 4 to 11.5.
sleep_bins = np.arange(4, 12, 0.5)
sleep_bins
survey.hist('Hours of sleep', bins=sleep_bins)
# The same data with bin edges every whole hour, 4 to 11.
sleep_bins = np.arange(4, 12)
survey.hist('Hours of sleep', bins=sleep_bins)
Box plots are a graphical display of the descriptive statistics that we know as the Five Number Summary.
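The Five Number Summary consists of the following: the minimum, the first quartile (Q1), the median, the third quartile (Q3), and the maximum.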
In one of the next couple of notebooks, we will discuss how to compute these.
# Load the skyscraper data and split out one table per city.
skyscrapers = Table.read_table('skyscrapers.csv')
ny = skyscrapers.where('city', "New York City")
la = skyscrapers.where('city', 'Los Angeles')
ny
# Box plot of the column at index 3 of the New York table
# (the plot title below describes these values as heights).
ny_values = ny.column(3)
plots.boxplot(ny_values, widths=0.4);
#plots.xlabel("NYC");
# The same box plot, slightly wider, with a tick label and a title.
plots.boxplot(ny_values, widths=.5)
plots.xticks(make_array(1), make_array("NYC"))
plots.title("Heights of New York City \nSkyscrapers");
Can a box plot also show the mean? Yes, if you set the value of `showmeans` equal to `True`.
# Values and tick labelling shared by both box plots below.
nyc_values = ny.column(3)
nyc_tick = make_array(1)
nyc_label = make_array("NYC")
# Box plot with showmeans=True.
plots.boxplot(nyc_values, widths=.5, showmeans=True)
plots.xticks(nyc_tick, nyc_label);
# Same plot with whis=10 (matplotlib's whisker-length setting; its default is 1.5).
plots.boxplot(nyc_values, widths=.5, showmeans=True, whis=10)
plots.xticks(nyc_tick, nyc_label);
We can use side-by-side boxplots to visually compare two groups.
# x positions and tick labels for the two boxes.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")
plots.figure(figsize=(6, 6))
# Draw one box per city table, each at its own x position.
for index, city_table in enumerate((ny, la)):
    plots.boxplot(
        city_table.column(3),
        widths=.5,
        positions=make_array(ticks.item(index)),
    )
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");
# x positions and tick labels for the two boxes.
ticks = make_array(2.5, 3.5)
labels = make_array("NYC", "LA")
plots.figure(figsize=(6, 6))
# Draw one box per city table at its own x position, with showmeans=True.
for index, city_table in enumerate((ny, la)):
    plots.boxplot(
        city_table.column(3),
        widths=.5,
        positions=make_array(ticks.item(index)),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Boxplots Comparing NYC and LA Skyscrapers");
# "Birth Weight" values split by the "Maternal Smoker" column.
smoker_rows = baby.where("Maternal Smoker", True)
nonsmoker_rows = baby.where("Maternal Smoker", False)
smokers = smoker_rows.column("Birth Weight")
nonsmokers = nonsmoker_rows.column("Birth Weight")
# x positions and tick labels for the two boxes.
ticks = make_array(2.5, 3.5)
labels = make_array("Smokers", "Non-Smokers")
plots.figure(figsize=(6, 6))
# Draw one box per group at its own x position, with showmeans=True.
for index, birth_weights in enumerate((smokers, nonsmokers)):
    plots.boxplot(
        birth_weights,
        widths=.5,
        positions=make_array(ticks.item(index)),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Comparing Birth Weights \nof Babies");
Sometimes it is also useful to draw overlapping histograms, with several distributions on the same axes.
# Load the Galton height data from CSV and display the table.
galton = Table.read_table('galton.csv')
galton
# Draw a separate histogram for each of the two height columns.
for height_column in ('midparentHeight', 'childHeight'):
    galton.hist(height_column)
Which part of this histogram represents the children with heights above 70 inches?
# Histogram of 'childHeight' with left_end=70
# (per the datascience docs, left_end sets the left edge of a shaded region).
galton.hist('childHeight', left_end=70)
# Passing several column names to one hist() call plots them together.
parent_and_child = ('midparentHeight', 'childHeight')
galton.hist(*parent_and_child)
all_height_columns = ("mother", "father", "childHeight", "midparentHeight")
galton.hist(*all_height_columns)
# x positions and tick labels for the four boxes.
ticks = make_array(2.5, 3.5, 4.5, 5.5)
labels = make_array("Midparent", "Child", "Mother", "Father")
plots.figure(figsize=(6, 6))
# Columns listed in the same order as the tick labels above.
height_columns = ("midparentHeight", "childHeight", "mother", "father")
# Draw one box per column at its own x position, with showmeans=True.
for index, column_name in enumerate(height_columns):
    plots.boxplot(
        galton.column(column_name),
        widths=.5,
        positions=make_array(ticks.item(index)),
        showmeans=True,
    )
plots.xticks(ticks, labels)
plots.title("Comparing Heights");