#!/usr/bin/env python
# coding: utf-8

# # 1. Introduction

# In[36]:


import pandas as pd
import matplotlib.pyplot as plt

# get_ipython only exists inside an IPython/Jupyter session. Under the plain
# `python` interpreter named in the shebang it is undefined, so skip the
# magic there and keep matplotlib's default backend instead of crashing.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

recent_grads = pd.read_csv('recent-grads.csv')
print(recent_grads.iloc[0])
print(recent_grads.head())
print(recent_grads.tail())


# In[37]:


# Bare expression: rendered as a table when this is the last statement of a
# notebook cell; produces no output when the file runs as a script.
recent_grads.head()


# In[38]:


print(recent_grads.describe())


# In[39]:


# Per-column count of non-null values before cleaning.
raw_data_count = recent_grads.count()
print(raw_data_count)


# In[40]:


# Drop every row that has at least one missing value, then count again so the
# two printouts can be compared.
recent_grads = recent_grads.dropna()
cleaned_data_count = recent_grads.count()
print(cleaned_data_count)


# # 2. Pandas, Scatter Plots

# Generate scatter plots in separate jupyter notebook cells to explore the following relations:
# - Sample_size and Median
# - Sample_size and Unemployment_rate
# - Full_time and Median
# - ShareWomen and Unemployment_rate
# - Men and Median
# - Women and Median
#
# Use the plots to explore the following questions:
# - Do students in more popular majors make more money?
# - Do students that majored in subjects that were majority female make more money?
# - Is there any link between the number of full-time employees and median salary?

# In[41]:


# DataFrame.plot returns the matplotlib axes, which is used to add a title.
ax = recent_grads.plot(x='Sample_size', y='Employed', kind='scatter')
ax.set_title('Employed vs. Sample-size')


# In[43]:


recent_grads.plot(x='Sample_size', y='Median', kind='scatter')


# In[44]:


recent_grads.plot(x='Sample_size', y='Unemployment_rate', kind='scatter')


# In[45]:


recent_grads.plot(x='Full_time', y='Median', kind='scatter')


# In[46]:


recent_grads.plot(x='ShareWomen', y='Unemployment_rate', kind='scatter')


# In[47]:


recent_grads.plot(x='Men', y='Median', kind='scatter')


# In[48]:


recent_grads.plot(x='Women', y='Median', kind='scatter')


# # 3.
# Pandas, Histograms

# To explore the distribution of values in a column, we can select it from the DataFrame, call Series.plot(), and set the kind parameter to hist:


def _new_axes():
    """Return the axes of a newly created figure.

    Series.plot() and Series.hist() draw on the current axes when no ``ax``
    is passed. Outside a notebook, where figures are not reset between
    cells, that would stack every histogram on one axes, so each plot below
    is given its own.
    """
    return plt.subplots()[1]


# In[53]:


recent_grads['Sample_size'].plot(kind='hist', ax=_new_axes())


# In[54]:


recent_grads['Sample_size'].plot(
    kind='hist', bins=25, range=(0, 5000), ax=_new_axes()
)


# Alternative way to produce the same result:

# In[55]:


recent_grads['Sample_size'].hist(bins=25, range=(0, 5000), ax=_new_axes())


# Generate histograms in separate jupyter notebook cells to explore the distributions of the following columns:
# - Sample_size
# - Median
# - Employed
# - Full_time
# - ShareWomen
# - Unemployment_rate
# - Men
# - Women
#
# Use the plots to explore the following questions:
# - What percent of majors are predominantly male? Predominantly female?
# - What's the most common median salary range?

# In[72]:


recent_grads['Median'].hist(bins=10, ax=_new_axes())


# In[76]:


recent_grads['Employed'].hist(bins=15, ax=_new_axes())


# In[77]:


recent_grads['Full_time'].hist(ax=_new_axes())


# In[80]:


recent_grads['ShareWomen'].hist(ax=_new_axes())


# In[79]:


recent_grads['Unemployment_rate'].hist(ax=_new_axes())


# In[81]:


recent_grads['Men'].hist(ax=_new_axes())


# In[82]:


recent_grads['Women'].hist(ax=_new_axes())


# In[66]:


cols = ['Sample_size', 'Median', 'Employed', 'Full_time',
        'ShareWomen', 'Unemployment_rate', 'Men', 'Women']

# One tall figure holding a 4x1 stack of histograms.
# NOTE(review): only the first four of the eight columns are drawn here, as
# in the original loop (range(0, 4)) -- confirm whether the remaining four
# were meant to get a second figure.
fig = plt.figure(figsize=(5, 15))
for position, column in enumerate(cols[:4], start=1):
    ax = fig.add_subplot(4, 1, position)
    # Pass the subplot explicitly instead of relying on it being the current
    # axes; the legend shows the column name so the panels can be told apart.
    recent_grads[column].plot(kind='hist', rot=30, legend=True, ax=ax)


# # 4. Pandas, Scatter Matrix Plot

# A scatter matrix plot combines both scatter plots and histograms into one grid of plots and allows us to explore potential relationships and distributions simultaneously. A scatter matrix plot consists of n by n plots on a grid, where n is the number of columns, the plots on the diagonal are histograms, and the non-diagonal plots are scatter plots.
# In[85]:


from pandas.plotting import scatter_matrix

scatter_matrix(recent_grads[['Women', 'Men']], figsize=(10, 10))


# In[86]:


scatter_matrix(recent_grads[['Sample_size', 'Median']], figsize=(10, 10))


# In[87]:


scatter_matrix(
    recent_grads[['Sample_size', 'Median', 'Unemployment_rate']],
    figsize=(10, 10),
)


# # 5. Pandas, Bar Plots

# The following code returns a bar plot of the first 5 values in the Women column:

# In[88]:


# Series.plot() draws on the current axes unless one is passed. Give the bars
# a figure of their own so they are not painted onto an axes left behind by
# the scatter matrix above.
_, ax = plt.subplots()
recent_grads['Women'].head(5).plot(kind='bar', ax=ax)


# By default, pandas labels each bar on the x-axis with the row's index label. If we instead use the DataFrame.plot.bar() method, we can use the x parameter to specify the labels and the y parameter to specify the data for the bars:

# In[92]:


recent_grads.head(5).plot.bar(x='Major', y='Women')


# Use bar plots to compare the percentages of women (ShareWomen) from the first ten rows and last ten rows of the recent_grads dataframe.

# In[95]:


recent_grads.head(10).plot.bar(x='Major', y='ShareWomen', legend=False)


# In[96]:


recent_grads.tail(10).plot.bar(x='Major', y='ShareWomen', legend=False)


# Use bar plots to compare the unemployment rate (Unemployment_rate) from the first ten rows and last ten rows of the recent_grads dataframe.

# In[101]:


recent_grads.head(10).plot.bar(x='Major', y='Unemployment_rate', legend=False)


# In[102]:


recent_grads.tail(10).plot.bar(x='Major', y='Unemployment_rate', legend=False)