#!/usr/bin/env python
# coding: utf-8

#  # 1. Introduction

# In[36]:


import pandas as pd
import matplotlib.pyplot as plt
# Enable inline figures when running under IPython/Jupyter. A plain `python`
# run has no get_ipython() builtin, so skip the magic instead of crashing
# with NameError before any data is loaded.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

recent_grads = pd.read_csv('recent-grads.csv')

# Quick look at the raw data: the first record, then the first and last rows.
print(recent_grads.iloc[0])
print(recent_grads.head())
print(recent_grads.tail())


# In[37]:


# Preview the first five rows (rendered as a table when run in a notebook).
recent_grads.head(n=5)


# In[38]:


# Summary statistics as reported by DataFrame.describe().
print(recent_grads.describe())


# In[39]:


# Non-null entries per column before any cleaning.
raw_data_count = recent_grads.count()
print(raw_data_count)


# In[40]:


# Drop every row that contains at least one missing value, then recount so
# the two printouts can be compared.
recent_grads = recent_grads.dropna(axis=0, how='any')
cleaned_data_count = recent_grads.count()
print(cleaned_data_count)


#  # 2. Pandas, Scatter Plots

# Generate scatter plots in separate jupyter notebook cells to explore the following relations:
#     - Sample_size and Median
#     - Sample_size and Unemployment_rate
#     - Full_time and Median
#     - ShareWomen and Unemployment_rate
#     - Men and Median
#     - Women and Median
#     
# Use the plots to explore the following questions:
#     - Do students in more popular majors make more money?
#     - Do students that majored in subjects that were majority female make more money?
#     - Is there any link between the number of full-time employees and median salary?

# In[41]:


# Employed vs. Sample_size, with an explicit title on the returned axes.
ax = recent_grads.plot.scatter(x='Sample_size', y='Employed')
ax.set_title('Employed vs. Sample-size')


# In[43]:


# Sample_size and Median
recent_grads.plot.scatter(x='Sample_size', y='Median')


# In[44]:


# Sample_size and Unemployment_rate
recent_grads.plot.scatter(x='Sample_size', y='Unemployment_rate')


# In[45]:


# Full_time and Median
recent_grads.plot.scatter(x='Full_time', y='Median')


# In[46]:


# ShareWomen and Unemployment_rate
recent_grads.plot.scatter(x='ShareWomen', y='Unemployment_rate')


# In[47]:


# Men and Median
recent_grads.plot.scatter(x='Men', y='Median')


# In[48]:


# Women and Median
recent_grads.plot.scatter(x='Women', y='Median')


# # 3. Pandas, Histograms

# To explore the distribution of values in a column, we can select it from the DataFrame, call Series.plot(), and set the kind parameter to hist:

# In[53]:


# Histogram of Sample_size with the default binning.
recent_grads['Sample_size'].plot.hist()


# In[54]:


# Same column, restricted to the 0-5000 range and split into 25 bins.
recent_grads['Sample_size'].plot.hist(bins=25, range=(0, 5000))


# Alternative way to produce the same result:

# In[55]:


# Series.hist() takes the same bins/range arguments directly.
recent_grads['Sample_size'].hist(range=(0, 5000), bins=25)


# Generate histograms in separate jupyter notebook cells to explore the distributions of the following columns:
#     - Sample_size
#     - Median
#     - Employed
#     - Full_time
#     - ShareWomen
#     - Unemployment_rate
#     - Men
#     - Women
# 
# Use the plots to explore the following questions:
#     - What percent of majors are predominantly male? Predominantly female?
#     - What's the most common median salary range?

# In[72]:


# Distribution of median salaries.
recent_grads['Median'].hist(bins=10)


# In[76]:


# Distribution of the Employed column, using a finer 15-bin split.
recent_grads['Employed'].hist(bins=15)


# In[77]:


# The remaining histograms use Series.hist's default of 10 bins, spelled out
# here so the binning is visible at the call site.
recent_grads['Full_time'].hist(bins=10)


# In[80]:


recent_grads['ShareWomen'].hist(bins=10)


# In[79]:


recent_grads['Unemployment_rate'].hist(bins=10)


# In[81]:


recent_grads['Men'].hist(bins=10)


# In[82]:


recent_grads['Women'].hist(bins=10)


# In[66]:


cols = ['Sample_size', 'Median', 'Employed', 'Full_time', 'ShareWomen', 'Unemployment_rate', 'Men', 'Women']

# Draw one histogram per entry in cols, stacked in a single column of subplots.
# The grid is sized from len(cols) so every listed column gets a panel (the
# previous hard-coded range(0, 4) silently skipped the last four), and the
# figure height scales with the row count: 3.75 in per row, i.e. the original
# 15 in for four rows.
fig = plt.figure(figsize=(5, 3.75 * len(cols)))
for i, col in enumerate(cols):
    ax = fig.add_subplot(len(cols), 1, i + 1)
    # Pass the axes explicitly rather than relying on pyplot's implicit
    # "current axes", and title each panel so the histograms can be told apart.
    recent_grads[col].plot(kind='hist', rot=30, ax=ax, title=col)
# Keep titles and rotated tick labels from overlapping neighbouring panels.
fig.tight_layout()


# # 4. Pandas, Scatter Matrix Plot

# A scatter matrix plot combines both scatter plots and histograms into one grid of plots and allows us to explore potential relationships and distributions simultaneously. It consists of n by n plots on a grid, where n is the number of columns: the plots on the diagonal are histograms, and the off-diagonal plots are scatter plots.

# In[85]:


from pandas.plotting import scatter_matrix

# Women vs. Men: 2x2 scatter matrix of the two columns.
scatter_matrix(recent_grads.loc[:, ['Women', 'Men']], figsize=(10, 10))


# In[86]:


# Sample_size vs. Median.
scatter_matrix(recent_grads.loc[:, ['Sample_size', 'Median']], figsize=(10, 10))


# In[87]:


# Sample_size, Median and Unemployment_rate together in a 3x3 grid.
scatter_matrix(
    recent_grads.loc[:, ['Sample_size', 'Median', 'Unemployment_rate']],
    figsize=(10, 10),
)


# # 5. Pandas, Bar Plots

# The following code returns a bar plot of the first 5 values in the Women column:

# In[88]:


# Bar plot of the first five entries of the Women column.
recent_grads['Women'].iloc[:5].plot.bar()


# By default, pandas labels each bar on the x-axis with its index value from the DataFrame. If we instead use the DataFrame.plot.bar() method, we can use the x parameter to specify the column that supplies the labels and the y parameter to specify the data for the bars:

# In[92]:


# First five rows, with each bar labelled by its Major.
recent_grads.head(5).plot(kind='bar', x='Major', y='Women')


# Use bar plots to compare the percentages of women (ShareWomen) from the first ten rows and last ten rows of the recent_grads dataframe.

# In[95]:


# ShareWomen for the first ten rows; the legend is suppressed.
recent_grads.head(10).plot(kind='bar', x='Major', y='ShareWomen', legend=False)


# In[96]:


# ShareWomen for the last ten rows.
recent_grads.tail(10).plot(kind='bar', x='Major', y='ShareWomen', legend=False)


# Use bar plots to compare the unemployment rate (Unemployment_rate) from the first ten rows and last ten rows of the recent_grads dataframe.

# In[101]:


# Unemployment_rate for the first ten rows.
recent_grads.head(10).plot(kind='bar', x='Major', y='Unemployment_rate', legend=False)


# In[102]:


# Unemployment_rate for the last ten rows.
recent_grads.tail(10).plot(kind='bar', x='Major', y='Unemployment_rate', legend=False)