#!/usr/bin/env python
# coding: utf-8
# # Minimum GPA for Graduate Schools in Speech-Language Pathology
#
# ## Background
#
# One requirement for certification as a speech-language pathologist is the completion of a Master's degree in Speech-Language Pathology / Communication Sciences and Disorders (MA SLP). Because the career pays well and opportunities are plentiful across the country, competition for admission to Master's programs in the field is strong, so graduate programs often set a minimum GPA threshold for applicants to be considered.
#
# To help students in our Pre-Professional Speech-Language Pathology program understand the importance of maintaining a high GPA, I carried out the following analysis using data collected from the American Speech-Language-Hearing Association's [EdFind app](https://find.asha.org/ed/). (Note that the data used below was collected in 2019; the EdFind app and its HTML structure/format have changed since then.)
#
# ## Primary Questions
#
# - How many schools accept GPA scores of _x_ or higher?
# - What are the GPA cut-off points for most schools?
# - What is the lowest accepted GPA score by state?
# - What is the average of lowest accepted GPA scores by state?
#
# ## Overview
#
# - 1. [Scrape webpages for data](#webscraping)
# - 2. [Extract relevant data from raw html](#extract)
# - 3. [Transform](#transform)
# - 4. [Analysis and Visualization](#analysis)
# - 4.1 [Basic Descriptive Statistics](#descriptive)
# - 4.2 [Counts by Accepted GPA Scores](#lineplot)
# - 4.3 [Estimated GPA Cut-off Scores](#cutoff)
# - 4.4 [Lowest Accepted GPA by State](#low-by-state)
# - 4.5 [Average of Lowest Accepted GPA by State](#avg-low-by-state)
# In[1]:
import folium, os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
# ## Step 1: Webscraping
#
# Python Selenium was used to scrape data from ASHA's EdFind app for all MA SLP programs in the US, with the raw contents of the body section for each school saved as a separate file. The code in the following sections imports data from these saved HTML files.
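#
# The scraping itself was done outside of this notebook. A minimal sketch of the approach is shown
# below; the `school_urls` mapping, the Chrome driver, and the selector are illustrative assumptions
# rather than the original scraper, and EdFind's page structure has changed since 2019.
#
#     import os
#     from selenium import webdriver
#     from selenium.webdriver.common.by import By
#
#     def save_program_pages(school_urls, out_dir='slp_programs'):
#         """Save the <body> HTML of each program page, one file per school.
#
#         school_urls: hypothetical {school_name: program_page_url} mapping.
#         """
#         os.makedirs(out_dir, exist_ok=True)
#         driver = webdriver.Chrome()
#         try:
#             for school_name, url in school_urls.items():
#                 driver.get(url)
#                 body_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
#                 with open(os.path.join(out_dir, school_name), 'w') as f:
#                     f.write(body_html)
#         finally:
#             driver.quit()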
# ## Step 2: Extract Relevant Data
# Manual review of the data structure reveals the following:
# - Sections for CAA Accreditation Information and for each program of study begin with a `<div class="headline">` element, which contains the section heading only. The Master's programs we are interested in can be identified by an `<a name="CED-SLP">` anchor inside the `<h3>...</h3>` element of this `div`.
# - Each program is separated into four sections: General, Admissions, Enrollment, and Graduation. These are each contained in separate `div` elements, all of which are siblings of each other and of the headline `div` above.
# - Each of these four `div` elements contains a single description list (`dl`), which in turn contains `dt` tags, each followed by one or more `dd` descriptions.
# - Most `dd` elements consist of a simple value/description.
# - Some `dd` descriptions consist of a key followed by one or more values. Such keys can be identified by the presence of a colon either in front of a numerical value on the same line, or at the end of a line when the subsequent `dd` elements represent its values.
#
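# To make the colon rule in the last bullet concrete, here is a minimal sketch using hypothetical
# `dd` text strings (illustrations only, not values taken from the scraped pages):
#
#     for text in ['Full-time', 'GRE required: Yes', 'Typical GPA:']:
#         if ':' not in text:
#             print('simple value:', text)                          # case 1: plain value
#         elif text.split(':', 1)[1].strip():
#             key, value = [part.strip() for part in text.split(':', 1)]
#             print('key/value on one line:', key, '->', value)     # case 2: "key: value"
#         else:
#             print('key only, values follow:', text.rstrip(':'))   # case 3: values in later dd elements
#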
# Thus, we will create a dictionary with the following structure:
#
#     {school_name:
#         {program_name:
#             {section:                # General, Admissions, Enrollment, Graduation
#                 {dt_key:
#                     dd_value / [dd_value_1, ...] / {dd_subkey: dd_value, ...},   # depends on case
#                 },
#             },
#         city: city_name,
#         state: state_abbreviation,
#         zip: zip_code,
#         },
#     }
# In[2]:
# Read in the data for all programs
def get_raw_program_data():
schools = sorted(os.listdir('slp_programs')) # filenames here are equivalent to school names
response = dict()
for s in schools:
with open(os.path.join('slp_programs', s), 'r') as f:
data = f.read()
response[s] = data
return(response)
# In[3]:
def extract_to_dictionary(programs_html):
"""
Inputs:
programs_html (dict) - {school_name: raw_html}
Outputs:
dico (dict)
"""
dico = dict()
# Loop over each program
for school, val in programs_html.items():
# Parse the html for the program
soup = BeautifulSoup(val, 'html.parser')
# Locate the section for MA programs
divs = soup.find_all('div', class_ = "headline")
cur = None
for div in divs:
if div.find('a', attrs = {"name": "CED-SLP"}):
cur = div
program_name = cur.text
dico[school] = {'program': {
"CED-SLP": {
'name': program_name,
},
},
}
break
# If the section was not found, skip this school
if not cur:
continue
#Extract the city, state, and zip code
        pattern = re.compile('(?P<city>[^,]+),? (?P<state>[A-Z]{2}) (?P<zip>[0-9]{5}).*')
location = {}
brs = soup.find('p').find_all('br')
for br in brs:
text = br.next.strip()
if text:
result = pattern.match(text)
if result:
location['city'], location['state'], location['zip'] = result.groups()
break
# Update the dictionary with the city, state and zip code
dico[school].update(location)
# Add the section data
for section in ["General", "Admissions", "Enrollment", "Graduation"]:
# Set cur to the current section
cur = cur.find_next_sibling('div')
# Check that section is the expected one
assert cur.h3.text == section
# Find all elements in current section
dts = cur.find_all('dt')
dts_dict = dict()
for dt in dts:
dds = []
dds_values_dict = {}
dds_values_list = []
# Make a list of all elements of current
dd = dt.find_next_sibling(['dt','dd'])
while dd and dd.name != 'dt':
dds.append(dd)
dd = dd.find_next_sibling(['dt','dd'])
# Loop through all elements
n = 0
while n < len(dds):
dd = dds[n]
# Lack of a colon indicates a simple value
if ':' not in dd.text:
dds_values_list.append(dd.text.strip())
n += 1
# If there is a colon, the value is a dictionary
else:
dd_text = dd.text.split(':')
dd_key = dd_text[0].strip()
dd_value = dd_text[1].strip()
# If there is text following the colon, the key: value
# pair is found in the same line
if dd_value != '':
dds_values_dict.update({dd_key: dd_value})
n += 1
# Otherwise, the value(s) is/are found in the following
# elements
else:
dd_value = []
n += 1
while n < len(dds) and not dds[n].findAll('b'):
dd_value.append(dds[n].text.strip())
n += 1
dds_values_dict.update({dd_key: dd_value})
# Remove the colon from the key
dt_key = dt.text.replace(':','').strip()
# If no values of the tag were {key: value} format...
if len(dds_values_dict) == 0:
# Convert lists of length 1 to simple strings and save as value
if len(dds_values_list) == 1:
dts_dict[dt_key] = dds_values_list[0]
# Otherwise save list as value
else:
dts_dict[dt_key] = dds_values_list
# If all values of the tag were in {key: value} format, save
# the dictionary as the value
elif len(dds_values_list) == 0:
dts_dict[dt_key] = dds_values_dict
# If the values of the tag were of mixed formats, convert the lists into
# dictionaries, merge it with the dictionary for the tags, and save this
# dictionary as the value for the element
else:
for val in dds_values_list:
dds_values_dict.update({val: True})
dts_dict[dt_key] = dds_values_dict
dico[school]['program']['CED-SLP'][section] = dts_dict
return dico
# In[4]:
programs_html = get_raw_program_data()
programs = extract_to_dictionary(programs_html)
# In[5]:
programs['University of Delaware']['program']['CED-SLP']['Admissions']
# ## Transform
# We are only interested in the GPA and state data, so we'll extract this into a new dictionary and convert it into a dataframe.
# In[6]:
# Extract the relevant data into a new dictionary
def get_gpa_info(program_dict):
gpa_range = program_dict['program']['CED-SLP']['Admissions']['GPA Range for Applicants Offered Admission']
gpa_info = {'low_gpa': None, 'high_gpa': None}
    # The GPA range is stored as inconsistently formatted text (spacing, number of hyphens, etc.),
    # so we simply pull out the two numbers on either side of the separator
if '-' in gpa_range:
        gpa_range = re.match(r'(?P<low>[\d\.]+)[^\d]+(?P<high>[\d\.]+).*', gpa_range)
low, high = gpa_range.groups()
gpa_info['low_gpa'] = float(low)
gpa_info['high_gpa'] = float(high)
return(gpa_info)
programs_min_dict = dict()
for key in programs:
programs_min_dict[key] = {'state': programs[key]['state'],}
programs_min_dict[key].update(get_gpa_info(programs[key]))
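# As a quick sanity check, the pattern above tolerates differently formatted range strings. The
# strings below are hypothetical examples, not values taken from the dataset.
# In[ ]:
# Hypothetical GPA range strings with varied spacing and hyphenation (illustrative only)
for sample in ['3.0 - 4.0', '2.75-4.00', '3.20 -- 4.00']:
    m = re.match(r'(?P<low>[\d\.]+)[^\d]+(?P<high>[\d\.]+).*', sample)
    print(sample, '->', m.groups() if m else None)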
# In[7]:
df = pd.DataFrame.from_dict(programs_min_dict, orient='index')
# In[8]:
df.head()
# In[9]:
len(df)
# Out of 259 schools, we lack GPA information for 10 of them, so we'll remove them from our dataset.
# In[10]:
df.isna().sum()
# In[11]:
df.dropna(axis=0, inplace=True)
len(df)
# Sorting the values by low GPA reveals that one row appears to have the low and high GPAs reversed. We'll want to swap the reversed values before continuing.
# In[12]:
df.sort_values(by='low_gpa', ascending=False)[:10]
# In[13]:
df[df['low_gpa'] > df['high_gpa']]
# In[14]:
rows_to_swap = df['low_gpa'] > df['high_gpa']
df.loc[rows_to_swap, ['low_gpa', 'high_gpa']] = df.loc[rows_to_swap, ['high_gpa', 'low_gpa']].values
df.sort_values(by='low_gpa', ascending=False)[:10]
# Finally, we look at how many schools from each state are represented in this data. Ten states only have one MA SLP program, while New York has the highest number of programs at 26.
# In[15]:
by_state = df.groupby(by='state').size().reset_index(name='count').sort_values('count')
# In[16]:
by_state.T
# In[17]:
by_state[by_state['count'] == 1].count()
# More detailed investigation of the data reveals that three states have no MA SLP programs represented in the dataset, and that two of the 'states' in our dataset are not actually states: Washington, D.C. (DC) and Puerto Rico (PR).
# In[18]:
by_state.count()
# In[19]:
by_state.sort_index()
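# As a cross-check on which states are missing, we can compare the state abbreviations in the data
# against the 50 US state abbreviations (DC and PR are deliberately excluded from this set).
# In[ ]:
# Which of the 50 US state abbreviations never appear in the data?
us_states = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
}
sorted(us_states - set(df['state']))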
# ## Analysis and Visualization
# #### Basic Descriptive Statistics
# We first look at some descriptive statistics of the dataset.
# In[20]:
df.describe()
# Unsurprisingly, we notice that there is almost no variation in the highest GPAs of candidates offered admission. In contrast, half of the SLP programs in the dataset require a GPA of at least 3.2, and the lowest accepted GPAs have a standard deviation of 0.25 points.
# #### Counts by Accepted GPA Scores
# Plotting the number of schools that would accept any given GPA score reveals a sharp jump at 3.0, suggesting that the large majority of MA SLP programs have a GPA threshold of 3.0 or higher.
# In[21]:
# For each GPA value from 2.30 to 4.00, count the schools whose lowest accepted GPA is at or below it
low_gpa = [[x/100, len(df[df["low_gpa"] <= x/100])] for x in range(230,401)]
low_gpa = pd.DataFrame(low_gpa, columns=['Min_GPA', 'Num_Schools']).set_index('Min_GPA')
low_gpa.plot()
ax = plt.gca()
ax.get_legend().remove()
ax.set_xlabel('GPA')
ax.set_ylabel('Number of Schools')
plt.show()
# #### Estimated GPA Cut-off Scores
# Assuming that schools' GPA cut-off points are usually set to a single decimal place, we can estimate each cut-off as the floor of the school's lowest accepted GPA to one decimal place. To plot this out, we can simply bin the scores into 0.1-wide ranges.
# In[22]:
plt.xticks(np.arange(2.3, 4.01, 0.1))
plt.hist(df['low_gpa'], bins = 17, range=(2.3, 4.), cumulative = False)
ax = plt.gca()
ax.set_xlabel('GPA Threshold')
ax.set_ylabel('Number of Schools')
plt.show()
# In line with earlier observations, the graph above indicates that the most common (assumed) cut-off points are 3.0 and 3.2.
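# The same counts can also be read directly off the data by flooring each school's lowest accepted
# GPA to one decimal place, as a quick cross-check of the histogram above.
# In[ ]:
# Count schools per assumed one-decimal cut-off; rounding before the floor avoids
# floating-point artifacts (e.g., 2.9 * 10 evaluating to 28.999...)
cutoffs = (np.floor((df['low_gpa'] * 10).round(6)) / 10).value_counts().sort_index()
cutoffs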
# #### Lowest Accepted GPA by State
# In my experience, most MA SLP students prefer to attend a school close to where they (or their families) live. For students with such regional preferences, mapping out the lowest and average accepted GPA scores in different states can be helpful.
# The first map shows the _lowest_ GPA score accepted in each state.
# In[23]:
# Find the minimum values by state
min_by_state = df.groupby(by='state').min()
# URL for US states mapping data
url = (
"https://raw.githubusercontent.com/python-visualization/folium/main/examples/data"
)
state_geo = f"{url}/us-states.json"
# Initialize the map
m = folium.Map(location=[48,-102], zoom_start=3, tiles="cartodb positron")
m.save('slp_usa.html')
# Plot the data on the map
folium.Choropleth(
geo_data=state_geo,
name="choropleth",
data=min_by_state.reset_index(),
columns=["state", "low_gpa"],
key_on="feature.id",
fill_color="BuGn",
nan_fill_color="gray",
nan_fill_opacity=0.4,
fill_opacity=0.7,
line_opacity=0.2,
bins=9,
legend_name="Minimum GPA by State",
).add_to(m)
#m.save('slp_min_by_state.html')
m
# #### Average of Lowest Accepted GPA Scores by State
# The second map shows the average lowest accepted GPA score for each state.
# In[24]:
# Find the average values by state
mean_by_state = df.groupby(by='state').mean()
# Initialize the map
m = folium.Map(location=[48,-102], zoom_start=3, tiles="cartodb positron")
# Plot the data on the map
folium.Choropleth(
geo_data=state_geo,
name="choropleth",
data=mean_by_state.reset_index(),
columns=["state", "low_gpa"],
key_on="feature.id",
fill_color="BuGn",
nan_fill_color="gray",
nan_fill_opacity=0.4,
fill_opacity=0.7,
line_opacity=0.2,
bins=9,
legend_name="Mean Minimum GPA by State",
).add_to(m)
#m.save('slp_avg_by_state.html')
m