#!/usr/bin/env python
# coding: utf-8

# This dataset contains 1295 records of American colleges and their properties,
# collected by the
# [US Department of Education](https://collegescorecard.ed.gov/data/documentation/).
#
# NOTE: this script is a Jupyter notebook export. Bare expressions such as `df`
# only render output when the cells are run in a notebook/IPython session; when
# the file is run as a plain script they are evaluated and discarded.

# In[ ]:


import pandas as pd
import lux  # never referenced by name below; presumably imported for its pandas integration (`.intent`, `.to_pandas()`) -- confirm against the Lux docs


# In[ ]:


df = pd.read_csv("../data/college.csv")
df


# We see that the information about ACTMedian and SATAverage has a very strong
# correlation. This means that we could probably just keep one of the columns and
# still get about the same information. So let's drop the ACTMedian column.

# In[ ]:


df = df.drop(columns=["ACTMedian"])
df


# From the Category tab, we see that there are few records where
# `PredominantDegree` is "Certificate". In addition, there are not a lot of
# colleges with "Private For-Profit" as `FundingModel`.

# We can take a look at this by inspecting the `Series` corresponding to the
# column `PredominantDegree`. Note that Lux not only helps with visualizing
# dataframes, but also displays visualizations of Series objects.

# In[ ]:


df["PredominantDegree"]


# In[ ]:


# `to_pandas()` is not a plain-pandas method; it is assumed to come from Lux and
# to show the filtered rows as a regular table -- confirm against the Lux docs.
df[df["PredominantDegree"] == "Certificate"].to_pandas()


# Upon inspection, there is only a single record for "Certificate". We looked at
# the
# [webpage for programs offered at Cleveland State Community College](http://catalog.clevelandstatecc.edu/content.php?catoid=2&navoid=90)
# and it looks like there is a large number of associate as well as certificate
# degrees offered. So we decide that this is more appropriately labelled as
# "Associate" for the `PredominantDegree` field.

# In[ ]:


# Relabel in place: every row currently marked "Certificate" becomes "Associate".
df.loc[df["PredominantDegree"] == "Certificate", "PredominantDegree"] = "Associate"


# By inspecting the subset of 9 colleges that are "Private For-Profit", we do not
# find any commonalities across them, so we can just leave the data as-is for now.

# In[ ]:


df[df["FundingModel"] == "Private For-Profit"]


# Back to looking at the entire dataset:

# In[ ]:


df


# We are interested in picking a college to attend and want to understand the
# `AverageCost` of attending different colleges and how that relates to other
# information in the dataset.

# In[ ]:


df.intent = ["AverageCost"]
df


# We see that there are a large number of colleges that cost around $20000 per
# year. We also see that Bachelor degree colleges and colleges in New England and
# large cities tend to have a higher `AverageCost` than their counterparts.

# We are interested in the trend of `AverageCost` vs. `SATAverage` since there is
# a rough upwards relationship above `AverageCost` of $30000, but below that the
# trend is less clear.

# In[ ]:


df.intent = ["AverageCost", "SATAverage"]
df


# By adding the `FundingModel`, we see that the cluster of points on the left can
# clearly be attributed to public colleges, whereas private colleges more or less
# follow a trend that shows that colleges with higher `SATAverage` tend to have
# higher `AverageCost`.