import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
recent_grads = pd.read_csv('recent-grads.csv')
recent_grads.iloc[0]
Rank 1 Major_code 2419 Major PETROLEUM ENGINEERING Total 2339 Men 2057 Women 282 Major_category Engineering ShareWomen 0.120564 Sample_size 36 Employed 1976 Full_time 1849 Part_time 270 Full_time_year_round 1207 Unemployed 37 Unemployment_rate 0.0183805 Median 110000 P25th 95000 P75th 125000 College_jobs 1534 Non_college_jobs 364 Low_wage_jobs 193 Name: 0, dtype: object
recent_grads.head()
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.120564 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.018381 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.101852 | 7 | 640 | ... | 170 | 388 | 85 | 0.117241 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.153037 | 3 | 648 | ... | 133 | 340 | 16 | 0.024096 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.107313 | 16 | 758 | ... | 150 | 692 | 40 | 0.050125 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.341631 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.061098 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
5 rows × 21 columns
recent_grads.tail()
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.637293 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.046320 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.817099 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.065112 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.799859 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.149048 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.798746 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.053621 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.877960 | 2 | 742 | ... | 237 | 410 | 87 | 0.104946 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
5 rows × 21 columns
recent_grads.describe(include='all')
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 173.000000 | 173.000000 | 173 | 172.000000 | 172.000000 | 172.000000 | 173 | 172.000000 | 173.000000 | 173.000000 | ... | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 | 173.000000 |
unique | NaN | NaN | 173 | NaN | NaN | NaN | 16 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
top | NaN | NaN | FRENCH GERMAN LATIN AND OTHER COMMON FOREIGN L... | NaN | NaN | NaN | Engineering | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
freq | NaN | NaN | 1 | NaN | NaN | NaN | 29 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
mean | 87.000000 | 3879.815029 | NaN | 39370.081395 | 16723.406977 | 22646.674419 | NaN | 0.522223 | 356.080925 | 31192.763006 | ... | 8832.398844 | 19694.427746 | 2416.329480 | 0.068191 | 40151.445087 | 29501.445087 | 51494.219653 | 12322.635838 | 13284.497110 | 3859.017341 |
std | 50.084928 | 1687.753140 | NaN | 63483.491009 | 28122.433474 | 41057.330740 | NaN | 0.231205 | 618.361022 | 50675.002241 | ... | 14648.179473 | 33160.941514 | 4112.803148 | 0.030331 | 11470.181802 | 9166.005235 | 14906.279740 | 21299.868863 | 23789.655363 | 6944.998579 |
min | 1.000000 | 1100.000000 | NaN | 124.000000 | 119.000000 | 0.000000 | NaN | 0.000000 | 2.000000 | 0.000000 | ... | 0.000000 | 111.000000 | 0.000000 | 0.000000 | 22000.000000 | 18500.000000 | 22000.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 44.000000 | 2403.000000 | NaN | 4549.750000 | 2177.500000 | 1778.250000 | NaN | 0.336026 | 39.000000 | 3608.000000 | ... | 1030.000000 | 2453.000000 | 304.000000 | 0.050306 | 33000.000000 | 24000.000000 | 42000.000000 | 1675.000000 | 1591.000000 | 340.000000 |
50% | 87.000000 | 3608.000000 | NaN | 15104.000000 | 5434.000000 | 8386.500000 | NaN | 0.534024 | 130.000000 | 11797.000000 | ... | 3299.000000 | 7413.000000 | 893.000000 | 0.067961 | 36000.000000 | 27000.000000 | 47000.000000 | 4390.000000 | 4595.000000 | 1231.000000 |
75% | 130.000000 | 5503.000000 | NaN | 38909.750000 | 14631.000000 | 22553.750000 | NaN | 0.703299 | 338.000000 | 31433.000000 | ... | 9948.000000 | 16891.000000 | 2393.000000 | 0.087557 | 45000.000000 | 33000.000000 | 60000.000000 | 14444.000000 | 11783.000000 | 3466.000000 |
max | 173.000000 | 6403.000000 | NaN | 393735.000000 | 173809.000000 | 307087.000000 | NaN | 0.968954 | 4212.000000 | 307933.000000 | ... | 115172.000000 | 199897.000000 | 28169.000000 | 0.177226 | 110000.000000 | 95000.000000 | 125000.000000 | 151643.000000 | 148395.000000 | 48207.000000 |
11 rows × 21 columns
Clean the data by removing rows with missing values
raw_data_count = recent_grads.shape[0]
recent_grads = recent_grads.dropna()
cleaned_data_count = recent_grads.shape[0]
print(raw_data_count, cleaned_data_count)
173 172
ax = recent_grads.plot(x='Sample_size', y='Employed', kind='scatter', title='Employed vs. Sample_size', figsize=(5,10))
recent_grads.plot(x='Sample_size', y='Median', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40c1e01e80>
recent_grads.plot(x='Sample_size', y='Unemployment_rate', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40c1db61d0>
recent_grads.plot(x='Full_time', y='Median', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfd08898>
recent_grads.plot(x='ShareWomen', y='Unemployment_rate', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfcef4e0>
recent_grads.plot(x='Men', y='Median', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfc56470>
recent_grads.plot(x='Women', y='Median', kind='scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfc2ed30>
Do students in more popular majors make more money?
Do students that majored in subjects that were majority female make more money?
Is there any link between the number of full-time employees and median salary?
recent_grads['Sample_size'].hist(bins=25, range=(0,5000))
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfcc0ac8>
recent_grads['Median'].hist(bins=25, range=(20000, 50000))
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfaeb470>
recent_grads['Employed'].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bfa22fd0>
recent_grads['Full_time'].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf964828>
recent_grads['ShareWomen'].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf9235f8>
recent_grads['Unemployment_rate'].hist(bins=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf7d5f60>
recent_grads['Men'].hist(bins=25, range=(0,75000))
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf710780>
recent_grads['Women'].hist(bins=25, range=(0,75000))
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf60d6a0>
What percent of majors are predominantly male?
Predominantly female?
What's the most common median salary range?
from pandas.plotting import scatter_matrix
scatter_matrix(recent_grads[['Sample_size','Median']], figsize=(10,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf555208>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf512080>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf4dc208>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf4963c8>]], dtype=object)
scatter_matrix(recent_grads[['Sample_size','Median','Unemployment_rate']], figsize=(10,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf3c8240>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf3b2e80>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf300898>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf33b438>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf285588>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf241f60>], [<matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf210c50>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf1c5da0>, <matplotlib.axes._subplots.AxesSubplot object at 0x7f40bf194fd0>]], dtype=object)
recent_grads[:10].plot.bar(x='Major', y='ShareWomen')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf0a2390>
recent_grads[-10:].plot.bar(x='Major', y='ShareWomen')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40beffe940>
recent_grads[:10].plot.bar(x='Major', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bef8b4a8>
recent_grads[-10:].plot.bar(x='Major', y='Unemployment_rate')
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bf295ac8>
This plot shows quantity of men and women in each category of majors
(recent_grads[['Major_category','Men','Women']]
.groupby(['Major_category'])
.sum()
.plot.bar())
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bef88e10>
Distributions of median salaries and unemployment rate
recent_grads[['Median']].boxplot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bed46470>
recent_grads[['Unemployment_rate']].boxplot()
<matplotlib.axes._subplots.AxesSubplot at 0x7f40bed0c320>
recent_grads.plot.hexbin(x='Women', y='Median',gridsize=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40beaf0a20>
recent_grads.plot.hexbin(x='Men', y='Median',gridsize=25)
<matplotlib.axes._subplots.AxesSubplot at 0x7f40be896d30>