import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the recent-graduates dataset and take a first look at its contents.
recent_grads = pd.read_csv('recent-grads.csv')
# First row, returned as a Series keyed by column name.
recent_grads.iloc[0]
# First and last five rows of the table.
recent_grads.head(n=5)
recent_grads.tail(n=5)
# Summary statistics for every column, not only the numeric ones.
recent_grads.describe(include='all')
Clean the data by removing rows with missing values
# Drop every row that contains a missing value in any column, then report
# the row count before and after so the amount of data lost is visible.
raw_data_count = len(recent_grads)
recent_grads = recent_grads.dropna(axis=0, how='any')
cleaned_data_count = len(recent_grads)
print(raw_data_count, cleaned_data_count)
# Scatter plots of selected column pairs. Only the first plot is given a
# title and a custom figure size; its Axes is kept in ``ax``.
ax = recent_grads.plot.scatter(
    x='Sample_size',
    y='Employed',
    title='Employed vs. Sample_size',
    figsize=(5, 10),
)
recent_grads.plot.scatter(x='Sample_size', y='Median')
recent_grads.plot.scatter(x='Sample_size', y='Unemployment_rate')
recent_grads.plot.scatter(x='Full_time', y='Median')
recent_grads.plot.scatter(x='ShareWomen', y='Unemployment_rate')
recent_grads.plot.scatter(x='Men', y='Median')
recent_grads.plot.scatter(x='Women', y='Median')
Do students in more popular majors make more money?
# Histogram of each column of interest, 25 bins each.
#
# Series.hist() draws on the *current* axes when no ``ax`` is passed, so
# consecutive calls would stack every histogram on top of whatever plot was
# drawn last. Each histogram therefore gets its own figure/axes, and a title
# so the otherwise unlabeled plots can be told apart.
_histogram_specs = [
    # (column, extra keyword arguments for Series.hist)
    ('Sample_size', {'range': (0, 5000)}),
    ('Median', {'range': (20000, 50000)}),
    ('Employed', {}),
    ('Full_time', {}),
    ('ShareWomen', {}),
    ('Unemployment_rate', {}),
    ('Men', {'range': (0, 75000)}),
    ('Women', {'range': (0, 75000)}),
]
for _hist_column, _hist_kwargs in _histogram_specs:
    _, _hist_ax = plt.subplots()
    recent_grads[_hist_column].hist(bins=25, ax=_hist_ax, **_hist_kwargs)
    _hist_ax.set_title(_hist_column)
What percent of majors are predominantly male?
from pandas.plotting import scatter_matrix
# Pairwise scatter-matrix views: first Sample_size against Median, then the
# same pair together with Unemployment_rate.
scatter_matrix(
    recent_grads.loc[:, ['Sample_size', 'Median']],
    figsize=(10, 10),
)
scatter_matrix(
    recent_grads.loc[:, ['Sample_size', 'Median', 'Unemployment_rate']],
    figsize=(10, 10),
)
# Bar charts comparing the first ten and the last ten rows of the table,
# first by share of women and then by unemployment rate.
recent_grads.head(10).plot(kind='bar', x='Major', y='ShareWomen')
recent_grads.tail(10).plot(kind='bar', x='Major', y='ShareWomen')
recent_grads.head(10).plot(kind='bar', x='Major', y='Unemployment_rate')
recent_grads.tail(10).plot(kind='bar', x='Major', y='Unemployment_rate')
This plot shows the number of men and women in each major category
# Total the Men and Women columns within each major category and draw the
# two totals side by side as a bar chart.
(recent_grads
 .groupby('Major_category')[['Men', 'Women']]
 .sum()
 .plot(kind='bar'))
Distributions of median salaries and unemployment rates
# Box plots of the Median and Unemployment_rate columns.
#
# DataFrame.boxplot() reuses the current axes when no ``ax`` is passed, so the
# two calls would otherwise draw on top of each other (and on top of whatever
# plot preceded them). Each box plot is given its own figure/axes instead.
_, _median_box_ax = plt.subplots()
recent_grads[['Median']].boxplot(ax=_median_box_ax)
_, _unemployment_box_ax = plt.subplots()
recent_grads[['Unemployment_rate']].boxplot(ax=_unemployment_box_ax)
# Hexagonal-bin density plots of Median against the Women and Men columns,
# using a 25-hexagon grid in the x direction.
recent_grads.plot(kind='hexbin', x='Women', y='Median', gridsize=25)
recent_grads.plot(kind='hexbin', x='Men', y='Median', gridsize=25)