#!/usr/bin/env python
# coding: utf-8

# # Guided Project: Visualizing Earnings Based on College Majors

# ### Importing the necessary libraries

# In[1]:


import pandas as pd
import matplotlib.pyplot as plt

# The inline backend only exists inside IPython/Jupyter. Guard the magic so
# the exported script can also be run with a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass  # not running under IPython; matplotlib's default backend is used


# In[2]:


recent_grads = pd.read_csv('recent-grads.csv')


# First we look at what the "recent_grads" table looks like, to better
# understand it.

# In[3]:


recent_grads.iloc[[0]]


# In[4]:


recent_grads.head()


# In[5]:


recent_grads.tail()


# Comparing the lowest- and highest-ranked majors, the discrepancy between
# them is up to $80k.

# In[6]:


recent_grads.describe()


# Some of the columns have more non-null values than the others (173 / 172).

# In[7]:


raw_data_count = recent_grads.count()
raw_data_count


# In[8]:


# Drop every row that contains a missing value. `axis` is passed by keyword:
# positional arguments to DataFrame.dropna are rejected by pandas >= 2.0.
recent_grads_drop = recent_grads.dropna(axis=0)
recent_grads_drop


# In[9]:


recent_grads_drop_desc = recent_grads_drop.describe()
recent_grads_drop_desc


# In[10]:


recent_grads_drop_count = recent_grads_drop.count()
recent_grads_drop_count


# We dropped all rows with missing values. Every column now has the same
# count (172).

# In[11]:


recent_grads_drop_count_desc = recent_grads_drop_count.describe()
recent_grads_drop_count_desc


# In[12]:


recent_grads = recent_grads_drop
recent_grads


# In[13]:


cleaned_data_count = recent_grads.count()
cleaned_data_count


# In[14]:


recent_grads.plot(x='Sample_size', y='Median', kind='scatter')


# In[15]:


recent_grads.plot(x='Sample_size', y='Unemployment_rate', kind='scatter')


# In[16]:


recent_grads.plot(x='Full_time', y='Median', kind='scatter')


# In[17]:


recent_grads.plot(x='ShareWomen', y='Unemployment_rate', kind='scatter')


# In[18]:


recent_grads.plot(x='Men', y='Median', kind='scatter')


# In[19]:


recent_grads.plot(x='Women', y='Median', kind='scatter')


# INFOS

# In[20]:


recent_grads.plot(x='ShareWomen', y='Median', kind='scatter')


# In[21]:


recent_grads.plot(x='Total', y='Median', kind='scatter')


# Questions:
#
# 1) Do students in more popular majors make more money?
# -> No, not necessarily.
# 2) Do students that majored in subjects that were majority female make more
#    money?
# -> No. Majors where the majority are women earn less in the median.
# 3) Is there any link between the number of full-time employees and median
#    salary?
# -> Not if we just look at the number of full-time employees - but if we
#    look at the share between full-time and part-time we could find out
#    whether full-time gives higher salaries.
#
# We will generate histograms in separate jupyter notebook cells to explore
# the distributions of the following columns:
# Sample_size
# Median
# Employed
# Full_time
# ShareWomen
# Unemployment_rate
# Men
# Women
#
# NOTE: the fixed range (0, 5000) is kept only for Sample_size. The other
# columns are on different scales (salaries in the tens of thousands, and
# ShareWomen / Unemployment_rate presumably fractions - verify with
# describe()), so a hard-coded (0, 5000) range would give empty, truncated or
# single-bin histograms. Without `range`, the bins span the data's own
# minimum and maximum.

# In[22]:


recent_grads['Sample_size'].hist(bins=25, range=(0, 5000))


# In[30]:


recent_grads['Median'].hist(bins=25)


# In[31]:


recent_grads['Employed'].hist(bins=25)


# In[32]:


recent_grads['Full_time'].hist(bins=25)


# In[34]:


recent_grads['ShareWomen'].hist(bins=5)


# In[36]:


recent_grads['Unemployment_rate'].hist(bins=25)


# In[37]:


recent_grads['Men'].hist(bins=25)


# In[38]:


recent_grads['Women'].hist(bins=25)


# In[47]:


cols = ["Sample_size", "Median", "Employed", "Full_time",
        "ShareWomen", "Unemployment_rate", "Men", "Women"]

# One stacked histogram per column in `cols`. The figure keeps the original
# 3 inches of height per subplot (the original was 5x12 for four plots).
fig, axes = plt.subplots(nrows=len(cols), ncols=1,
                         figsize=(5, 3 * len(cols)))
for ax, col in zip(axes, cols):
    # Draw on the explicit axes instead of relying on the "current axes",
    # and title each subplot so the columns can be told apart.
    recent_grads[col].plot(kind='hist', rot=30, ax=ax, title=col)
fig.tight_layout()


# In[ ]: