# import the libraries
import csv
import pandas as pd
import geopy
import folium
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from pathlib import Path
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
import spacy # NLP Library
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import classification_report
from sklearn.svm import SVR
# import data to be explored
# Load every CSV under ./ref_data into its own module-level DataFrame variable,
# named after the file (hyphens replaced by underscores), and echo a summary.
myVars = vars()  # module namespace dict; writing a key creates a variable
# use glob to get all the csv files in the folder
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "ref_data/*.csv"))
# loop over the list of csv files
for f in csv_files:
    p = Path(f)
    # Hyphens are not legal in identifiers, so "all-ages" becomes all_ages.
    vname = p.stem
    vname = vname.replace("-", "_")
    # read the csv file
    df = pd.read_csv(f)
    myVars[vname] = df
    # print the location and filename
    print('Location:', f)
    # Path.name gives the bare file name on every OS; splitting on "\\"
    # returned the whole path wherever the separator is "/".
    print('File Name:', p.name)
    # print the content
    print('Content:')
    display(df)
    print("Column Types:")
    print(df.dtypes)
    print("\n")
Location: /Users/michelledavies/Documents/curri-app/ref_data/college_majors_dmgrph.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/college_majors_dmgrph.csv Content:
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.1206 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.0184 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.1019 | 7 | 640 | ... | 170 | 388 | 85 | 0.1172 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.1530 | 3 | 648 | ... | 133 | 340 | 16 | 0.0241 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.1073 | 16 | 758 | ... | 150 | 692 | 40 | 0.0501 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.3416 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.0611 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.6373 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.0463 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.8171 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.0651 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.7999 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.1490 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.7987 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.0536 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.8780 | 2 | 742 | ... | 237 | 410 | 87 | 0.1049 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
Column Types: Rank int64 Major_code int64 Major object Total float64 Men float64 Women float64 Major_category object ShareWomen float64 Sample_size int64 Employed int64 Full_time int64 Part_time int64 Full_time_year_round int64 Unemployed int64 Unemployment_rate float64 Median int64 P25th int64 P75th int64 College_jobs int64 Non_college_jobs int64 Low_wage_jobs int64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/college_majors.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/college_majors.csv Content:
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.1206 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.0184 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.1019 | 7 | 640 | ... | 170 | 388 | 85 | 0.1172 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.1530 | 3 | 648 | ... | 133 | 340 | 16 | 0.0241 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.1073 | 16 | 758 | ... | 150 | 692 | 40 | 0.0501 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.3416 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.0611 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.6373 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.0463 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.8171 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.0651 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.7999 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.1490 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.7987 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.0536 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.8780 | 2 | 742 | ... | 237 | 410 | 87 | 0.1049 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
Column Types: Rank int64 Major_code int64 Major object Total float64 Men float64 Women float64 Major_category object ShareWomen float64 Sample_size int64 Employed int64 Full_time int64 Part_time int64 Full_time_year_round int64 Unemployed int64 Unemployment_rate float64 Median int64 P25th int64 P75th int64 College_jobs int64 Non_college_jobs int64 Low_wage_jobs int64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/majors-list.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/majors-list.csv Content:
FOD1P | Major | Major_Category | |
---|---|---|---|
0 | 1100 | GENERAL AGRICULTURE | Agriculture & Natural Resources |
1 | 1101 | AGRICULTURE PRODUCTION AND MANAGEMENT | Agriculture & Natural Resources |
2 | 1102 | AGRICULTURAL ECONOMICS | Agriculture & Natural Resources |
3 | 1103 | ANIMAL SCIENCES | Agriculture & Natural Resources |
4 | 1104 | FOOD SCIENCE | Agriculture & Natural Resources |
... | ... | ... | ... |
169 | 5504 | GEOGRAPHY | Social Science |
170 | 5505 | INTERNATIONAL RELATIONS | Social Science |
171 | 5506 | POLITICAL SCIENCE AND GOVERNMENT | Social Science |
172 | 5507 | SOCIOLOGY | Social Science |
173 | 5599 | MISCELLANEOUS SOCIAL SCIENCES | Social Science |
174 rows × 3 columns
Column Types: FOD1P object Major object Major_Category object dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/all-ages.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/all-ages.csv Content:
Major_code | Major | Major_category | Total | Employed | Employed_full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1100 | GENERAL AGRICULTURE | Agriculture & Natural Resources | 128148 | 90245 | 74078 | 2423 | 0.026147 | 50000 | 34000 | 80000.0 |
1 | 1101 | AGRICULTURE PRODUCTION AND MANAGEMENT | Agriculture & Natural Resources | 95326 | 76865 | 64240 | 2266 | 0.028636 | 54000 | 36000 | 80000.0 |
2 | 1102 | AGRICULTURAL ECONOMICS | Agriculture & Natural Resources | 33955 | 26321 | 22810 | 821 | 0.030248 | 63000 | 40000 | 98000.0 |
3 | 1103 | ANIMAL SCIENCES | Agriculture & Natural Resources | 103549 | 81177 | 64937 | 3619 | 0.042679 | 46000 | 30000 | 72000.0 |
4 | 1104 | FOOD SCIENCE | Agriculture & Natural Resources | 24280 | 17281 | 12722 | 894 | 0.049188 | 62000 | 38500 | 90000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 6211 | HOSPITALITY MANAGEMENT | Business | 200854 | 163393 | 122499 | 8862 | 0.051447 | 49000 | 33000 | 70000.0 |
169 | 6212 | MANAGEMENT INFORMATION SYSTEMS AND STATISTICS | Business | 156673 | 134478 | 118249 | 6186 | 0.043977 | 72000 | 50000 | 100000.0 |
170 | 6299 | MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION | Business | 102753 | 77471 | 61603 | 4308 | 0.052679 | 53000 | 36000 | 83000.0 |
171 | 6402 | HISTORY | Humanities & Liberal Arts | 712509 | 478416 | 354163 | 33725 | 0.065851 | 50000 | 35000 | 80000.0 |
172 | 6403 | UNITED STATES HISTORY | Humanities & Liberal Arts | 17746 | 11887 | 8204 | 943 | 0.073500 | 50000 | 39000 | 81000.0 |
173 rows × 11 columns
Column Types: Major_code int64 Major object Major_category object Total int64 Employed int64 Employed_full_time_year_round int64 Unemployed int64 Unemployment_rate float64 Median int64 P25th int64 P75th float64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/women-stem.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/women-stem.csv Content:
Rank | Major_code | Major | Major_category | Total | Men | Women | ShareWomen | Median | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | Engineering | 2339 | 2057 | 282 | 0.120564 | 110000 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | Engineering | 756 | 679 | 77 | 0.101852 | 75000 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | Engineering | 856 | 725 | 131 | 0.153037 | 73000 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | Engineering | 1258 | 1123 | 135 | 0.107313 | 70000 |
4 | 5 | 2418 | NUCLEAR ENGINEERING | Engineering | 2573 | 2200 | 373 | 0.144967 | 65000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
71 | 72 | 3604 | ECOLOGY | Biology & Life Science | 9154 | 3878 | 5276 | 0.576360 | 33000 |
72 | 73 | 6109 | TREATMENT THERAPY PROFESSIONS | Health | 48491 | 13487 | 35004 | 0.721866 | 33000 |
73 | 74 | 6100 | GENERAL MEDICAL AND HEALTH SERVICES | Health | 33599 | 7574 | 26025 | 0.774577 | 32400 |
74 | 75 | 6102 | COMMUNICATION DISORDERS SCIENCES AND SERVICES | Health | 38279 | 1225 | 37054 | 0.967998 | 28000 |
75 | 76 | 3609 | ZOOLOGY | Biology & Life Science | 8409 | 3050 | 5359 | 0.637293 | 26000 |
76 rows × 9 columns
Column Types: Rank int64 Major_code int64 Major object Major_category object Total int64 Men int64 Women int64 ShareWomen float64 Median int64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/recent-grads.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/recent-grads.csv Content:
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.120564 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.018381 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.101852 | 7 | 640 | ... | 170 | 388 | 85 | 0.117241 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.153037 | 3 | 648 | ... | 133 | 340 | 16 | 0.024096 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.107313 | 16 | 758 | ... | 150 | 692 | 40 | 0.050125 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.341631 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.061098 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.637293 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.046320 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.817099 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.065112 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.799859 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.149048 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.798746 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.053621 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.877960 | 2 | 742 | ... | 237 | 410 | 87 | 0.104946 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
Column Types: Rank int64 Major_code int64 Major object Total float64 Men float64 Women float64 Major_category object ShareWomen float64 Sample_size int64 Employed int64 Full_time int64 Part_time int64 Full_time_year_round int64 Unemployed int64 Unemployment_rate float64 Median int64 P25th int64 P75th int64 College_jobs int64 Non_college_jobs int64 Low_wage_jobs int64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/grad-students.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/grad-students.csv Content:
Major_code | Major | Major_category | Grad_total | Grad_sample_size | Grad_employed | Grad_full_time_year_round | Grad_unemployed | Grad_unemployment_rate | Grad_median | ... | Nongrad_total | Nongrad_employed | Nongrad_full_time_year_round | Nongrad_unemployed | Nongrad_unemployment_rate | Nongrad_median | Nongrad_P25 | Nongrad_P75 | Grad_share | Grad_premium | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5601 | CONSTRUCTION SERVICES | Industrial Arts & Consumer Services | 9173 | 200 | 7098 | 6511 | 681 | 0.087543 | 75000.0 | ... | 86062 | 73607 | 62435 | 3928 | 0.050661 | 65000.0 | 47000 | 98000.0 | 0.096320 | 0.153846 |
1 | 6004 | COMMERCIAL ART AND GRAPHIC DESIGN | Arts | 53864 | 882 | 40492 | 29553 | 2482 | 0.057756 | 60000.0 | ... | 461977 | 347166 | 250596 | 25484 | 0.068386 | 48000.0 | 34000 | 71000.0 | 0.104420 | 0.250000 |
2 | 6211 | HOSPITALITY MANAGEMENT | Business | 24417 | 437 | 18368 | 14784 | 1465 | 0.073867 | 65000.0 | ... | 179335 | 145597 | 113579 | 7409 | 0.048423 | 50000.0 | 35000 | 75000.0 | 0.119837 | 0.300000 |
3 | 2201 | COSMETOLOGY SERVICES AND CULINARY ARTS | Industrial Arts & Consumer Services | 5411 | 72 | 3590 | 2701 | 316 | 0.080901 | 47000.0 | ... | 37575 | 29738 | 23249 | 1661 | 0.052900 | 41600.0 | 29000 | 60000.0 | 0.125878 | 0.129808 |
4 | 2001 | COMMUNICATION TECHNOLOGIES | Computers & Mathematics | 9109 | 171 | 7512 | 5622 | 466 | 0.058411 | 57000.0 | ... | 53819 | 43163 | 34231 | 3389 | 0.072800 | 52000.0 | 36000 | 78000.0 | 0.144753 | 0.096154 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 5203 | COUNSELING PSYCHOLOGY | Psychology & Social Work | 51812 | 724 | 38468 | 28808 | 1420 | 0.035600 | 50000.0 | ... | 16781 | 12377 | 8502 | 835 | 0.063200 | 40000.0 | 25000 | 50000.0 | 0.755354 | 0.250000 |
169 | 5202 | CLINICAL PSYCHOLOGY | Psychology & Social Work | 22716 | 355 | 16612 | 12022 | 782 | 0.044958 | 70000.0 | ... | 6519 | 4368 | 3033 | 357 | 0.075556 | 46000.0 | 30000 | 70000.0 | 0.777014 | 0.521739 |
170 | 6106 | HEALTH AND MEDICAL PREPARATORY PROGRAMS | Health | 114971 | 1766 | 78132 | 58825 | 1732 | 0.021687 | 135000.0 | ... | 26320 | 16221 | 12185 | 1012 | 0.058725 | 51000.0 | 35000 | 87000.0 | 0.813718 | 1.647059 |
171 | 2303 | SCHOOL STUDENT COUNSELING | Education | 19841 | 260 | 11313 | 8130 | 613 | 0.051400 | 56000.0 | ... | 2232 | 1328 | 980 | 169 | 0.112892 | 42000.0 | 27000 | 51000.0 | 0.898881 | 0.333333 |
172 | 2301 | EDUCATIONAL ADMINISTRATION AND SUPERVISION | Education | 54159 | 841 | 34142 | 26850 | 582 | 0.016761 | 65000.0 | ... | 4003 | 3079 | 2434 | 0 | 0.000000 | 58000.0 | 45000 | 79000.0 | 0.931175 | 0.120690 |
173 rows × 22 columns
Column Types: Major_code int64 Major object Major_category object Grad_total int64 Grad_sample_size int64 Grad_employed int64 Grad_full_time_year_round int64 Grad_unemployed int64 Grad_unemployment_rate float64 Grad_median float64 Grad_P25 int64 Grad_P75 float64 Nongrad_total int64 Nongrad_employed int64 Nongrad_full_time_year_round int64 Nongrad_unemployed int64 Nongrad_unemployment_rate float64 Nongrad_median float64 Nongrad_P25 int64 Nongrad_P75 float64 Grad_share float64 Grad_premium float64 dtype: object Location: /Users/michelledavies/Documents/curri-app/ref_data/students_data.csv File Name: /Users/michelledavies/Documents/curri-app/ref_data/students_data.csv Content:
ID | class | gender | race | GPA | Algebra | Calculus1 | Calculus2 | Statistics | Probability | Measure | Functional_analysis | from1 | from2 | from3 | from4 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1141 | A | male | 1 | 73.47 | 64 | 81 | 87 | 60 | 74 | 71 | 60 | A | A | A | 3 | 0 |
1 | 1142 | A | female | 1 | 71.22 | 57 | 50 | 51 | 51 | 55 | 62 | 61 | B | A | A | 2 | 0 |
2 | 1143 | A | female | 2 | 74.56 | 47 | 48 | 71 | 60 | 61 | 68 | 64 | C | A | A | 0 | 1 |
3 | 1144 | A | female | 1 | 72.89 | 46 | 72 | 38 | 60 | 29 | 54 | 51 | D | A | A | 0 | 0 |
4 | 1145 | A | female | 1 | 70.11 | 49 | 45 | 63 | 60 | 66 | 66 | 61 | E | A | A | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100 | 1241 | A | female | 1 | 88.34 | 87 | 83 | 92 | 98 | 93 | 86 | 90 | M | B | A | 0 | 1 |
101 | 1242 | B | male | 1 | 89.84 | 98 | 77 | 95 | 98 | 96 | 88 | 100 | A | B | A | 0 | 1 |
102 | 1243 | B | male | 1 | 88.82 | 83 | 80 | 91 | 98 | 93 | 95 | 71 | T | B | A | 0 | 2 |
103 | 1244 | A | male | 1 | 86.60 | 92 | 82 | 91 | 99 | 94 | 82 | 78 | S | B | A | 0 | 2 |
104 | 1245 | A | male | 1 | 93.71 | 93 | 97 | 99 | 100 | 97 | 90 | 90 | K | B | A | 0 | 2 |
105 rows × 17 columns
Column Types: ID int64 class object gender object race int64 GPA float64 Algebra int64 Calculus1 int64 Calculus2 int64 Statistics int64 Probability int64 Measure int64 Functional_analysis int64 from1 object from2 object from3 object from4 int64 y int64 dtype: object
# list var names
v = list(myVars.keys())
# The reference tables to explore, in the order they are reported below.
# Selecting them by name instead of slicing `v` between two entries keeps the
# result independent of the (arbitrary) order in which glob returned the files.
wanted_names = (
    'college_majors_dmgrph',
    'college_majors',
    'majors_list',
    'all_ages',
    'women_stem',
    'recent_grads',
    'grad_students',
)
# Keep only the names that were actually loaded into the namespace.
variables_list = [name for name in wanted_names if name in myVars]
print("The variables are: \n {}".format(v))
print("The variables I want are: \n {}".format(variables_list))
The variables are: ['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1', 'csv', 'pd', 'geopy', 'folium', 'np', 'os', 'Path', 'glob', 'plt', 'sns', 'WordCloud', 'STOPWORDS', 'ImageColorGenerator', 'train_test_split', 'PassiveAggressiveRegressor', 'spacy', 'cross_val_score', 'StratifiedKFold', 'classification_report', 'confusion_matrix', 'accuracy_score', 'LogisticRegression', 'LinearRegression', 'DecisionTreeClassifier', 'KNeighborsClassifier', 'LinearDiscriminantAnalysis', 'GaussianNB', 'SVC', 'KFold', 'ClassificationModel', 'MultiOutputClassifier', 'MultiOutputRegressor', 'SVR', '_i2', 'myVars', 'path', 'csv_files', 'f', 'p', 'vname', 'df', 'college_majors_dmgrph', 'college_majors', 'majors_list', 'all_ages', 'women_stem', 'recent_grads', 'grad_students', 'students_data', '_i3'] The variables I want are: ['college_majors_dmgrph', 'college_majors', 'majors_list', 'all_ages', 'women_stem', 'recent_grads', 'grad_students']
I want to look for some patterns in the dataset that will allow me to build a foundation for my model.
# First, I'm checking for nulls
# Print the per-column NaN count of every reference table, one report each.
null_audit = (
    ("grad_students", grad_students),
    ("recent_grads", recent_grads),
    ("women_stem", women_stem),
    ("all_ages", all_ages),
    ("majors_list", majors_list),
    ("college_majors", college_majors),
    ("college_majors_dmgrph", college_majors_dmgrph),
)
final_label = null_audit[-1][0]
for label, frame in null_audit:
    print(f"{label} Nulls:")
    null_counts = frame.isnull().sum()
    if label == final_label:
        # The last report is not followed by a blank separator line.
        print(null_counts)
    else:
        print(null_counts, "\n")
grad_students Nulls: Major_code 0 Major 0 Major_category 0 Grad_total 0 Grad_sample_size 0 Grad_employed 0 Grad_full_time_year_round 0 Grad_unemployed 0 Grad_unemployment_rate 0 Grad_median 0 Grad_P25 0 Grad_P75 0 Nongrad_total 0 Nongrad_employed 0 Nongrad_full_time_year_round 0 Nongrad_unemployed 0 Nongrad_unemployment_rate 0 Nongrad_median 0 Nongrad_P25 0 Nongrad_P75 0 Grad_share 0 Grad_premium 0 dtype: int64 recent_grads Nulls: Rank 0 Major_code 0 Major 0 Total 1 Men 1 Women 1 Major_category 0 ShareWomen 1 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64 women_stem Nulls: Rank 0 Major_code 0 Major 0 Major_category 0 Total 0 Men 0 Women 0 ShareWomen 0 Median 0 dtype: int64 all_ages Nulls: Major_code 0 Major 0 Major_category 0 Total 0 Employed 0 Employed_full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 dtype: int64 majors_list Nulls: FOD1P 0 Major 0 Major_Category 1 dtype: int64 college_majors Nulls: Rank 0 Major_code 0 Major 0 Total 1 Men 1 Women 1 Major_category 0 ShareWomen 1 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64 college_majors_dmgrph Nulls: Rank 0 Major_code 0 Major 0 Total 1 Men 1 Women 1 Major_category 0 ShareWomen 1 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64
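The nulls appear to be confined to the demographic columns (Total, Men, Women, ShareWomen), plus one missing Major_Category in majors_list. I'm going to replace the NaNs with placeholder values so that the gaps stay visible without dropping those records or causing errors later.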
# replace nans
# Per-column placeholders; columns a table does not have are simply ignored.
# Assumption: the row whose Total is NaN is the same row where the Men and
# Women counts are not available.
values = {
    "Men": 0,
    "Women": 0,
    "Major_Category": "Other",
    "ShareWomen": 0,
    "Total": 0,
}
# Rebind every table to its NaN-filled copy in a single unpacking assignment.
(
    grad_students,
    recent_grads,
    women_stem,
    all_ages,
    majors_list,
    college_majors,
    college_majors_dmgrph,
) = (
    table.fillna(value=values)
    for table in (
        grad_students,
        recent_grads,
        women_stem,
        all_ages,
        majors_list,
        college_majors,
        college_majors_dmgrph,
    )
)
# confirm changes
# Show each table again, followed by its per-column NaN count.
filled_tables = (
    ("grad_students", grad_students),
    ("recent_grads", recent_grads),
    ("women_stem", women_stem),
    ("all_ages", all_ages),
    ("majors_list", majors_list),
    ("college_majors", college_majors),
    ("college_majors_dmgrph", college_majors_dmgrph),
)
last_table_name = filled_tables[-1][0]
for table_name, table in filled_tables:
    display(table)
    print(f"{table_name} Nulls:")
    remaining_nulls = table.isnull().sum()
    if table_name == last_table_name:
        # The final report is printed without the trailing blank line.
        print(remaining_nulls)
    else:
        print(remaining_nulls, "\n")
Major_code | Major | Major_category | Grad_total | Grad_sample_size | Grad_employed | Grad_full_time_year_round | Grad_unemployed | Grad_unemployment_rate | Grad_median | ... | Nongrad_total | Nongrad_employed | Nongrad_full_time_year_round | Nongrad_unemployed | Nongrad_unemployment_rate | Nongrad_median | Nongrad_P25 | Nongrad_P75 | Grad_share | Grad_premium | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5601 | CONSTRUCTION SERVICES | Industrial Arts & Consumer Services | 9173 | 200 | 7098 | 6511 | 681 | 0.087543 | 75000.0 | ... | 86062 | 73607 | 62435 | 3928 | 0.050661 | 65000.0 | 47000 | 98000.0 | 0.096320 | 0.153846 |
1 | 6004 | COMMERCIAL ART AND GRAPHIC DESIGN | Arts | 53864 | 882 | 40492 | 29553 | 2482 | 0.057756 | 60000.0 | ... | 461977 | 347166 | 250596 | 25484 | 0.068386 | 48000.0 | 34000 | 71000.0 | 0.104420 | 0.250000 |
2 | 6211 | HOSPITALITY MANAGEMENT | Business | 24417 | 437 | 18368 | 14784 | 1465 | 0.073867 | 65000.0 | ... | 179335 | 145597 | 113579 | 7409 | 0.048423 | 50000.0 | 35000 | 75000.0 | 0.119837 | 0.300000 |
3 | 2201 | COSMETOLOGY SERVICES AND CULINARY ARTS | Industrial Arts & Consumer Services | 5411 | 72 | 3590 | 2701 | 316 | 0.080901 | 47000.0 | ... | 37575 | 29738 | 23249 | 1661 | 0.052900 | 41600.0 | 29000 | 60000.0 | 0.125878 | 0.129808 |
4 | 2001 | COMMUNICATION TECHNOLOGIES | Computers & Mathematics | 9109 | 171 | 7512 | 5622 | 466 | 0.058411 | 57000.0 | ... | 53819 | 43163 | 34231 | 3389 | 0.072800 | 52000.0 | 36000 | 78000.0 | 0.144753 | 0.096154 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 5203 | COUNSELING PSYCHOLOGY | Psychology & Social Work | 51812 | 724 | 38468 | 28808 | 1420 | 0.035600 | 50000.0 | ... | 16781 | 12377 | 8502 | 835 | 0.063200 | 40000.0 | 25000 | 50000.0 | 0.755354 | 0.250000 |
169 | 5202 | CLINICAL PSYCHOLOGY | Psychology & Social Work | 22716 | 355 | 16612 | 12022 | 782 | 0.044958 | 70000.0 | ... | 6519 | 4368 | 3033 | 357 | 0.075556 | 46000.0 | 30000 | 70000.0 | 0.777014 | 0.521739 |
170 | 6106 | HEALTH AND MEDICAL PREPARATORY PROGRAMS | Health | 114971 | 1766 | 78132 | 58825 | 1732 | 0.021687 | 135000.0 | ... | 26320 | 16221 | 12185 | 1012 | 0.058725 | 51000.0 | 35000 | 87000.0 | 0.813718 | 1.647059 |
171 | 2303 | SCHOOL STUDENT COUNSELING | Education | 19841 | 260 | 11313 | 8130 | 613 | 0.051400 | 56000.0 | ... | 2232 | 1328 | 980 | 169 | 0.112892 | 42000.0 | 27000 | 51000.0 | 0.898881 | 0.333333 |
172 | 2301 | EDUCATIONAL ADMINISTRATION AND SUPERVISION | Education | 54159 | 841 | 34142 | 26850 | 582 | 0.016761 | 65000.0 | ... | 4003 | 3079 | 2434 | 0 | 0.000000 | 58000.0 | 45000 | 79000.0 | 0.931175 | 0.120690 |
173 rows × 22 columns
grad_students Nulls: Major_code 0 Major 0 Major_category 0 Grad_total 0 Grad_sample_size 0 Grad_employed 0 Grad_full_time_year_round 0 Grad_unemployed 0 Grad_unemployment_rate 0 Grad_median 0 Grad_P25 0 Grad_P75 0 Nongrad_total 0 Nongrad_employed 0 Nongrad_full_time_year_round 0 Nongrad_unemployed 0 Nongrad_unemployment_rate 0 Nongrad_median 0 Nongrad_P25 0 Nongrad_P75 0 Grad_share 0 Grad_premium 0 dtype: int64
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.120564 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.018381 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.101852 | 7 | 640 | ... | 170 | 388 | 85 | 0.117241 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.153037 | 3 | 648 | ... | 133 | 340 | 16 | 0.024096 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.107313 | 16 | 758 | ... | 150 | 692 | 40 | 0.050125 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.341631 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.061098 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.637293 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.046320 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.817099 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.065112 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.799859 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.149048 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.798746 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.053621 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.877960 | 2 | 742 | ... | 237 | 410 | 87 | 0.104946 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
recent_grads Nulls: Rank 0 Major_code 0 Major 0 Total 0 Men 0 Women 0 Major_category 0 ShareWomen 0 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64
Rank | Major_code | Major | Major_category | Total | Men | Women | ShareWomen | Median | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | Engineering | 2339 | 2057 | 282 | 0.120564 | 110000 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | Engineering | 756 | 679 | 77 | 0.101852 | 75000 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | Engineering | 856 | 725 | 131 | 0.153037 | 73000 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | Engineering | 1258 | 1123 | 135 | 0.107313 | 70000 |
4 | 5 | 2418 | NUCLEAR ENGINEERING | Engineering | 2573 | 2200 | 373 | 0.144967 | 65000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
71 | 72 | 3604 | ECOLOGY | Biology & Life Science | 9154 | 3878 | 5276 | 0.576360 | 33000 |
72 | 73 | 6109 | TREATMENT THERAPY PROFESSIONS | Health | 48491 | 13487 | 35004 | 0.721866 | 33000 |
73 | 74 | 6100 | GENERAL MEDICAL AND HEALTH SERVICES | Health | 33599 | 7574 | 26025 | 0.774577 | 32400 |
74 | 75 | 6102 | COMMUNICATION DISORDERS SCIENCES AND SERVICES | Health | 38279 | 1225 | 37054 | 0.967998 | 28000 |
75 | 76 | 3609 | ZOOLOGY | Biology & Life Science | 8409 | 3050 | 5359 | 0.637293 | 26000 |
76 rows × 9 columns
women_stem Nulls: Rank 0 Major_code 0 Major 0 Major_category 0 Total 0 Men 0 Women 0 ShareWomen 0 Median 0 dtype: int64
Major_code | Major | Major_category | Total | Employed | Employed_full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1100 | GENERAL AGRICULTURE | Agriculture & Natural Resources | 128148 | 90245 | 74078 | 2423 | 0.026147 | 50000 | 34000 | 80000.0 |
1 | 1101 | AGRICULTURE PRODUCTION AND MANAGEMENT | Agriculture & Natural Resources | 95326 | 76865 | 64240 | 2266 | 0.028636 | 54000 | 36000 | 80000.0 |
2 | 1102 | AGRICULTURAL ECONOMICS | Agriculture & Natural Resources | 33955 | 26321 | 22810 | 821 | 0.030248 | 63000 | 40000 | 98000.0 |
3 | 1103 | ANIMAL SCIENCES | Agriculture & Natural Resources | 103549 | 81177 | 64937 | 3619 | 0.042679 | 46000 | 30000 | 72000.0 |
4 | 1104 | FOOD SCIENCE | Agriculture & Natural Resources | 24280 | 17281 | 12722 | 894 | 0.049188 | 62000 | 38500 | 90000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 6211 | HOSPITALITY MANAGEMENT | Business | 200854 | 163393 | 122499 | 8862 | 0.051447 | 49000 | 33000 | 70000.0 |
169 | 6212 | MANAGEMENT INFORMATION SYSTEMS AND STATISTICS | Business | 156673 | 134478 | 118249 | 6186 | 0.043977 | 72000 | 50000 | 100000.0 |
170 | 6299 | MISCELLANEOUS BUSINESS & MEDICAL ADMINISTRATION | Business | 102753 | 77471 | 61603 | 4308 | 0.052679 | 53000 | 36000 | 83000.0 |
171 | 6402 | HISTORY | Humanities & Liberal Arts | 712509 | 478416 | 354163 | 33725 | 0.065851 | 50000 | 35000 | 80000.0 |
172 | 6403 | UNITED STATES HISTORY | Humanities & Liberal Arts | 17746 | 11887 | 8204 | 943 | 0.073500 | 50000 | 39000 | 81000.0 |
173 rows × 11 columns
all_ages Nulls: Major_code 0 Major 0 Major_category 0 Total 0 Employed 0 Employed_full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 dtype: int64
FOD1P | Major | Major_Category | |
---|---|---|---|
0 | 1100 | GENERAL AGRICULTURE | Agriculture & Natural Resources |
1 | 1101 | AGRICULTURE PRODUCTION AND MANAGEMENT | Agriculture & Natural Resources |
2 | 1102 | AGRICULTURAL ECONOMICS | Agriculture & Natural Resources |
3 | 1103 | ANIMAL SCIENCES | Agriculture & Natural Resources |
4 | 1104 | FOOD SCIENCE | Agriculture & Natural Resources |
... | ... | ... | ... |
169 | 5504 | GEOGRAPHY | Social Science |
170 | 5505 | INTERNATIONAL RELATIONS | Social Science |
171 | 5506 | POLITICAL SCIENCE AND GOVERNMENT | Social Science |
172 | 5507 | SOCIOLOGY | Social Science |
173 | 5599 | MISCELLANEOUS SOCIAL SCIENCES | Social Science |
174 rows × 3 columns
majors_list Nulls: FOD1P 0 Major 0 Major_Category 0 dtype: int64
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.1206 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.0184 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.1019 | 7 | 640 | ... | 170 | 388 | 85 | 0.1172 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.1530 | 3 | 648 | ... | 133 | 340 | 16 | 0.0241 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.1073 | 16 | 758 | ... | 150 | 692 | 40 | 0.0501 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.3416 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.0611 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.6373 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.0463 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.8171 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.0651 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.7999 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.1490 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.7987 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.0536 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.8780 | 2 | 742 | ... | 237 | 410 | 87 | 0.1049 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
college_majors Nulls: Rank 0 Major_code 0 Major 0 Total 0 Men 0 Women 0 Major_category 0 ShareWomen 0 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64
Rank | Major_code | Major | Total | Men | Women | Major_category | ShareWomen | Sample_size | Employed | ... | Part_time | Full_time_year_round | Unemployed | Unemployment_rate | Median | P25th | P75th | College_jobs | Non_college_jobs | Low_wage_jobs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2419 | PETROLEUM ENGINEERING | 2339.0 | 2057.0 | 282.0 | Engineering | 0.1206 | 36 | 1976 | ... | 270 | 1207 | 37 | 0.0184 | 110000 | 95000 | 125000 | 1534 | 364 | 193 |
1 | 2 | 2416 | MINING AND MINERAL ENGINEERING | 756.0 | 679.0 | 77.0 | Engineering | 0.1019 | 7 | 640 | ... | 170 | 388 | 85 | 0.1172 | 75000 | 55000 | 90000 | 350 | 257 | 50 |
2 | 3 | 2415 | METALLURGICAL ENGINEERING | 856.0 | 725.0 | 131.0 | Engineering | 0.1530 | 3 | 648 | ... | 133 | 340 | 16 | 0.0241 | 73000 | 50000 | 105000 | 456 | 176 | 0 |
3 | 4 | 2417 | NAVAL ARCHITECTURE AND MARINE ENGINEERING | 1258.0 | 1123.0 | 135.0 | Engineering | 0.1073 | 16 | 758 | ... | 150 | 692 | 40 | 0.0501 | 70000 | 43000 | 80000 | 529 | 102 | 0 |
4 | 5 | 2405 | CHEMICAL ENGINEERING | 32260.0 | 21239.0 | 11021.0 | Engineering | 0.3416 | 289 | 25694 | ... | 5180 | 16697 | 1672 | 0.0611 | 65000 | 50000 | 75000 | 18314 | 4440 | 972 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
168 | 169 | 3609 | ZOOLOGY | 8409.0 | 3050.0 | 5359.0 | Biology & Life Science | 0.6373 | 47 | 6259 | ... | 2190 | 3602 | 304 | 0.0463 | 26000 | 20000 | 39000 | 2771 | 2947 | 743 |
169 | 170 | 5201 | EDUCATIONAL PSYCHOLOGY | 2854.0 | 522.0 | 2332.0 | Psychology & Social Work | 0.8171 | 7 | 2125 | ... | 572 | 1211 | 148 | 0.0651 | 25000 | 24000 | 34000 | 1488 | 615 | 82 |
170 | 171 | 5202 | CLINICAL PSYCHOLOGY | 2838.0 | 568.0 | 2270.0 | Psychology & Social Work | 0.7999 | 13 | 2101 | ... | 648 | 1293 | 368 | 0.1490 | 25000 | 25000 | 40000 | 986 | 870 | 622 |
171 | 172 | 5203 | COUNSELING PSYCHOLOGY | 4626.0 | 931.0 | 3695.0 | Psychology & Social Work | 0.7987 | 21 | 3777 | ... | 965 | 2738 | 214 | 0.0536 | 23400 | 19200 | 26000 | 2403 | 1245 | 308 |
172 | 173 | 3501 | LIBRARY SCIENCE | 1098.0 | 134.0 | 964.0 | Education | 0.8780 | 2 | 742 | ... | 237 | 410 | 87 | 0.1049 | 22000 | 20000 | 22000 | 288 | 338 | 192 |
173 rows × 21 columns
college_majors_dmgrph Nulls: Rank 0 Major_code 0 Major 0 Total 0 Men 0 Women 0 Major_category 0 ShareWomen 0 Sample_size 0 Employed 0 Full_time 0 Part_time 0 Full_time_year_round 0 Unemployed 0 Unemployment_rate 0 Median 0 P25th 0 P75th 0 College_jobs 0 Non_college_jobs 0 Low_wage_jobs 0 dtype: int64
# Word cloud over the "Major" column: shows which words occur most often in
# the major names across every dataset loaded above.
## Step 1: flatten the "Major" column of each dataframe into one list
majors = [
    major_name
    for frame in (
        grad_students,
        recent_grads,
        women_stem,
        all_ages,
        majors_list,
        college_majors,
        college_majors_dmgrph,
    )
    for major_name in frame["Major"].tolist()
]
## Step 2: merge the names into a single space-separated string and draw the cloud
text = " ".join(majors)
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
).generate(text)
plt.style.use('classic')
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes; only the rendered image is of interest
plt.show()
# Word cloud over the major-category column: shows which words occur most
# often in the category names across every dataset loaded above.
## Step 1: flatten the category column of each dataframe into one list.
## majors_list spells the column "Major_Category" (capital C), so each
## dataframe is paired with its own column name.
## NOTE: the list keeps the name `majors` even though it holds categories.
majors = [
    category_name
    for frame, column in (
        (grad_students, "Major_category"),
        (recent_grads, "Major_category"),
        (women_stem, "Major_category"),
        (all_ages, "Major_category"),
        (majors_list, "Major_Category"),
        (college_majors, "Major_category"),
        (college_majors_dmgrph, "Major_category"),
    )
    for category_name in frame[column].tolist()
]
## Step 2: merge the names into a single space-separated string and draw the cloud
text = " ".join(majors)
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    stopwords=stopwords,
    background_color="white",
).generate(text)
plt.style.use('classic')
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes; only the rendered image is of interest
plt.show()