#!/usr/bin/env python
# coding: utf-8

# # Minimum GPA for Graduate Schools in Speech-Language Pathology
#
# ## Background
#
# One requirement for certification as a speech-language pathologist is the completion of a Master's degree in Speech-Language Pathology / Communication Sciences and Disorders (MA SLP). As the career pays well and opportunities are plentiful across the country, there is a lot of competition to enter Master's programs in the field, so graduate programs will often set a minimum GPA threshold for applicants to be considered.
#
# To help students in our Pre-Professional Speech Language Pathology program understand the importance of maintaining a high GPA, I carried out the following analysis using data collected from the American Speech-Language-Hearing Association's [EdFind app](https://find.asha.org/ed/). (Note that the data used below was collected in 2019, and the EdFind app and its html structure/format have changed since then.)
#
# ## Primary Questions
#
# - How many schools accept GPA scores of _x_ or higher?
# - What are the GPA cut-off points for most schools?
# - What is the lowest accepted GPA score by state?
# - What is the average of lowest accepted GPA scores by state?
#
# ## Overview
#
# - 1. [Scrape webpages for data](#webscraping)
# - 2. [Extract relevant data from raw html](#extract)
# - 3. [Transform](#transform)
# - 4. [Analysis and Visualization](#analysis)
#   - 4.1 [Basic Descriptive Statistics](#descriptive)
#   - 4.2 [Counts by Accepted GPA Scores](#lineplot)
#   - 4.3 [Estimated GPA Cut-off Scores](#cutoff)
#   - 4.4 [Lowest Accepted GPA by State](#low-by-state)
#   - 4.5 [Average of Lowest Accepted GPA by State](#avg-low-by-state)

# In[1]:

import folium, os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# ## Step 1: Webscraping
#
# Python Selenium was used to scrape data from ASHA's EdFind app for all MA SLP programs in the US, with the raw contents of the body section for each school saved as a separate file. The code in the following sections imports data from these saved html files.
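# The scraping script itself is not part of this notebook. As a rough sketch of how such files could be produced, the (uncalled) helper below uses Selenium 4 syntax; the function name, its `program_urls` argument, the browser choice, and the output directory default are assumptions for illustration only, not the original code.

# In[ ]:

def scrape_edfind_pages(program_urls, out_dir='slp_programs'):
    """Illustrative sketch: save the <body> of each EdFind program page to out_dir/<school_name>.

    `program_urls` is assumed to be a {school_name: page_url} mapping gathered beforehand.
    """
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Firefox()
    try:
        for school, url in program_urls.items():
            driver.get(url)
            # Grab the raw contents of the body section
            body_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('innerHTML')
            # Use the school name as the filename
            with open(os.path.join(out_dir, school), 'w') as f:
                f.write(body_html)
    finally:
        driver.quit()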
# ## Step 2: Extract Relevant Data

# Manual review of the data structure reveals the following:
#
# - The sections for CAA Accreditation Information and for each program of study begin with `<div class="headline">`, which contains the section heading only. The Master's programs we are interested in can be identified by an `<a name="CED-SLP"></a>` anchor inside of the `<h3>...</h3>` element of this `div` element.
# - Each program is separated into four sections: General, Admissions, Enrollment, Graduation. These are each contained in separate `div`s, which are all siblings of each other and of the `div` above.
# - Each of these four `div`s contains a single description list `dl`, which in turn contains `dt` tags followed by one or more `dd` descriptions.
# - Most `dd` elements consist of a simple value/description.
# - Some `dd` descriptions consist of a key followed by one or more values. Such keys can be identified by the presence of a colon, either in front of a numerical value on the same line or at the end of a line when the subsequent `dd` elements represent its values.
#
# Thus, we will create a dictionary with the following structure:
#
# {school_name:
#  {program_name:
#   {section: # General, Admissions, Enrollment, Graduation
#    {dt_key:
#     dd_value / [dd_value1, ...] / {dd_subkey: dd_value,}, # depends on case
#    },
#   },
#  city: city_name,
#  state: state_abbreviation,
#  zip: zip_code,
#  },
# }
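# To make the structure described above concrete, the next cell builds a small, entirely hypothetical HTML fragment of the same shape and locates the MA SLP headline in it with BeautifulSoup. It is illustrative only; none of its values come from the scraped data.

# In[ ]:

# A made-up fragment shaped like the EdFind program markup described above
toy_html = """
<div class="headline"><h3><a name="CED-SLP"></a>Master of Arts (MA)</h3></div>
<div>
  <h3>General</h3>
  <dl>
    <dt>Institution:</dt>
    <dd>Hypothetical State University</dd>
  </dl>
</div>
"""

toy_soup = BeautifulSoup(toy_html, 'html.parser')
headline = toy_soup.find('div', class_="headline")
print(headline.find('a', attrs={"name": "CED-SLP"}) is not None)  # True
print(headline.find_next_sibling('div').h3.text)                  # General
print(headline.find_next_sibling('div').dl.dt.text)               # Institution: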
# In[2]:

# Read in the data for all programs
def get_raw_program_data():
    schools = sorted(os.listdir('slp_programs'))  # filenames here are equivalent to school names
    response = dict()
    for s in schools:
        with open(os.path.join('slp_programs', s), 'r') as f:
            data = f.read()
        response[s] = data
    return response

# In[3]:

def extract_to_dictionary(programs_html):
    """
    Inputs: programs_html (dict) - {school_name: raw_html}
    Outputs: dico (dict)
    """
    dico = dict()

    # Loop over each program
    for school, val in programs_html.items():
        # Parse the html for the program
        soup = BeautifulSoup(val, 'html.parser')

        # Locate the section for MA programs
        divs = soup.find_all('div', class_="headline")
        cur = None
        for div in divs:
            if div.find('a', attrs={"name": "CED-SLP"}):
                cur = div
                program_name = cur.text
                dico[school] = {
                    'program': {
                        "CED-SLP": {
                            'name': program_name,
                        },
                    },
                }
                break

        # If the section was not found, skip this school
        if not cur:
            continue

        # Extract the city, state, and zip code
        pattern = re.compile(r'(?P<city>[^,]+),? (?P<state>[A-Z]{2}) (?P<zip>[0-9]{5}).*')
        location = {}
        brs = soup.find('p').find_all('br')
        for br in brs:
            text = br.next.strip()
            if text:
                result = pattern.match(text)
                if result:
                    location['city'], location['state'], location['zip'] = result.groups()
                    break

        # Update the dictionary with the city, state and zip code
        dico[school].update(location)

        # Add the section data
        for section in ["General", "Admissions", "Enrollment", "Graduation"]:
            # Set cur to the current section
            cur = cur.find_next_sibling('div')
            # Check that section is the expected one
            assert cur.h3.text == section

            # Find all <dt> elements in current section
            dts = cur.find_all('dt')
            dts_dict = dict()
            for dt in dts:
                dds = []
                dds_values_dict = {}
                dds_values_list = []

                # Make a list of all <dd> elements of current <dt>
                dd = dt.find_next_sibling(['dt', 'dd'])
                while dd and dd.name != 'dt':
                    dds.append(dd)
                    dd = dd.find_next_sibling(['dt', 'dd'])

                # Loop through all <dd> elements
                n = 0
                while n < len(dds):
                    dd = dds[n]
                    # Lack of a colon indicates a simple value
                    if ':' not in dd.text:
                        dds_values_list.append(dd.text.strip())
                        n += 1
                    # If there is a colon, the value is a dictionary
                    else:
                        dd_text = dd.text.split(':')
                        dd_key = dd_text[0].strip()
                        dd_value = dd_text[1].strip()
                        # If there is text following the colon, the key: value
                        # pair is found in the same line
                        if dd_value != '':
                            dds_values_dict.update({dd_key: dd_value})
                            n += 1
                        # Otherwise, the value(s) is/are found in the following
                        # <dd> elements
                        else:
                            dd_value = []
                            n += 1
                            while n < len(dds) and not dds[n].findAll('b'):
                                dd_value.append(dds[n].text.strip())
                                n += 1
                            dds_values_dict.update({dd_key: dd_value})

                # Remove the colon from the key
                dt_key = dt.text.replace(':', '').strip()

                # If no values of the <dt> tag were in {key: value} format...
                if len(dds_values_dict) == 0:
                    # Convert lists of length 1 to simple strings and save as value
                    if len(dds_values_list) == 1:
                        dts_dict[dt_key] = dds_values_list[0]
                    # Otherwise save list as value
                    else:
                        dts_dict[dt_key] = dds_values_list
                # If all values of the <dt> tag were in {key: value} format, save
                # the dictionary as the value
                elif len(dds_values_list) == 0:
                    dts_dict[dt_key] = dds_values_dict
                # If the values of the <dt> tag were of mixed formats, convert the list
                # into a dictionary, merge it with the dictionary for the {key: value}
                # <dd> tags, and save the merged dictionary as the value for the <dt> element
                else:
                    for val in dds_values_list:
                        dds_values_dict.update({val: True})
                    dts_dict[dt_key] = dds_values_dict

            dico[school]['program']['CED-SLP'][section] = dts_dict

    return dico

# In[4]:

programs_html = get_raw_program_data()
programs = extract_to_dictionary(programs_html)

# In[5]:

programs['University of Delaware']['program']['CED-SLP']['Admissions']

# ## Transform

# We are only interested in the GPA and state data, so we'll extract this into a new dictionary and convert it into a dataframe.

# In[6]:

# Extract the relevant data into a new dictionary
def get_gpa_info(program_dict):
    gpa_range = program_dict['program']['CED-SLP']['Admissions']['GPA Range for Applicants Offered Admission']
    gpa_info = {'low_gpa': None, 'high_gpa': None}
    # The GPA range is represented as text without consistent formatting (e.g., spacing, number of hyphens, etc.),
    # so we'll simply identify and extract the numerical characters and decimal point
    if '-' in gpa_range:
        gpa_range = re.match(r'(?P<low>[\d\.]+)[^\d]+(?P<high>[\d\.]+).*', gpa_range)
        low, high = gpa_range.groups()
        gpa_info['low_gpa'] = float(low)
        gpa_info['high_gpa'] = float(high)
    return gpa_info

programs_min_dict = dict()
for key in programs:
    programs_min_dict[key] = {'state': programs[key]['state']}
    programs_min_dict[key].update(get_gpa_info(programs[key]))
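# As a quick illustrative check, the cell below runs `get_gpa_info` on two made-up range strings with different spacing; these values are hypothetical and are not taken from the EdFind data.

# In[ ]:

for example_range in ['3.0 - 4.0', '2.75-3.98']:
    # Minimal stand-in for a program dictionary, holding only the field get_gpa_info reads
    fake_program = {'program': {'CED-SLP': {'Admissions': {
        'GPA Range for Applicants Offered Admission': example_range}}}}
    print(example_range, '->', get_gpa_info(fake_program))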
# In[7]:

df = pd.DataFrame.from_dict(programs_min_dict, orient='index')

# In[8]:

df.head()

# In[9]:

len(df)

# Out of 259 schools, we lack GPA information for 10 of them, so we'll remove them from our dataset.

# In[10]:

df.isna().sum()

# In[11]:

df.dropna(axis=0, inplace=True)
len(df)

# Sorting the values by low GPA reveals that one row appears to have the low and high GPAs reversed. We'll want to swap the reversed values before continuing.

# In[12]:

df.sort_values(by='low_gpa', ascending=False)[:10]

# In[13]:

df[df['low_gpa'] > df['high_gpa']]

# In[14]:

rows_to_swap = df['low_gpa'] > df['high_gpa']
df.loc[rows_to_swap, ['low_gpa', 'high_gpa']] = df.loc[rows_to_swap, ['high_gpa', 'low_gpa']].values
df.sort_values(by='low_gpa', ascending=False)[:10]

# Finally, we look at how many schools from each state are represented in this data. Ten states have only one MA SLP program, while New York has the highest number of programs at 26.

# In[15]:

by_state = df.groupby(by='state').size().reset_index(name='count').sort_values('count')

# In[16]:

by_state.T

# In[17]:

by_state[by_state['count'] == 1].count()

# More detailed investigation of the data reveals that 3 states have no MA SLP programs represented in the data set, and two 'states' in our data set are not actually states: Washington, D.C. (DC) and Puerto Rico (PR).

# In[18]:

by_state.count()

# In[19]:

by_state.sort_index()

# ## Analysis and Visualization

# #### Basic Descriptive Statistics

# We first look at some descriptive statistics of the dataset.

# In[20]:

df.describe()

# Unsurprisingly, we notice that there is almost no variation in the highest GPAs of candidates offered admission. In contrast, a GPA of 3.2 or higher is required by 50% of the SLP programs in the dataset, with a standard deviation of 0.25 points.

# #### Counts by Accepted GPA Scores

# Plotting out the number of schools that accept any given GPA score reveals an initial jump starting at 3.0, suggesting that the large majority of MA SLP programs have a GPA threshold of 3.0 or higher.

# In[21]:

# For each GPA point, count the number of schools for which it suffices
low_gpa = [[x/100, len(df[df["low_gpa"] <= x/100])] for x in range(230, 401)]
low_gpa = pd.DataFrame(low_gpa, columns=['Min_GPA', 'Num_Schools']).set_index('Min_GPA')

low_gpa.plot()
ax = plt.gca()
ax.get_legend().remove()
ax.set_xlabel('GPA')
ax.set_ylabel('Number of Schools')
plt.show()

# #### Estimated GPA Cut-off Scores

# Assuming that schools' GPA cut-off points are usually set at the level of a single decimal point, we can estimate the cut-off points to be the floor of the lowest GPA values to one decimal point. To plot this out, we can simply bin scores in each decimal point range.
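# As an optional numeric check (not part of the original analysis), the cell below tabulates these assumed cut-offs directly by flooring each program's lowest accepted GPA to one decimal place; the histogram that follows visualizes the same binning.

# In[ ]:

# Count programs per assumed one-decimal GPA cut-off
assumed_cutoffs = (np.floor(df['low_gpa'] * 10) / 10).value_counts().sort_index()
assumed_cutoffs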
# In[22]:

plt.xticks(np.arange(2.3, 4.01, 0.1))
plt.hist(df['low_gpa'], bins=17, range=(2.3, 4.), cumulative=False)
ax = plt.gca()
ax.set_xlabel('GPA Threshold')
ax.set_ylabel('Number of Schools')
plt.show()

# In line with earlier observations, the graph above indicates that the most common (assumed) cut-off limits are at 3.0 and 3.2.

# #### Lowest Accepted GPA by State

# In my experience, most MA students in SLP seem to prefer to attend a school close to where they (or their family) live. For students with regional preferences, mapping out the lowest and average GPA scores accepted in different states can be helpful.

# The first map shows the _lowest_ GPA score accepted in each state.

# In[23]:

# Find the minimum values by state
min_by_state = df.groupby(by='state').min()

# URL for US states mapping data
url = (
    "https://raw.githubusercontent.com/python-visualization/folium/main/examples/data"
)
state_geo = f"{url}/us-states.json"

# Initialize the map
m = folium.Map(location=[48, -102], zoom_start=3, tiles="cartodb positron")
m.save('slp_usa.html')

# Plot the data on the map
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=min_by_state.reset_index(),
    columns=["state", "low_gpa"],
    key_on="feature.id",
    fill_color="BuGn",
    nan_fill_color="gray",
    nan_fill_opacity=0.4,
    fill_opacity=0.7,
    line_opacity=0.2,
    bins=9,
    legend_name="Minimum GPA by State",
).add_to(m)

#m.save('slp_min_by_state.html')
m

# #### Average of Lowest Accepted GPA Scores by State

# The second map shows the average lowest accepted GPA score for each state.

# In[24]:

# Find the average values by state
mean_by_state = df.groupby(by='state').mean()

# Initialize the map
m = folium.Map(location=[48, -102], zoom_start=3, tiles="cartodb positron")

# Plot the data on the map
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=mean_by_state.reset_index(),
    columns=["state", "low_gpa"],
    key_on="feature.id",
    fill_color="BuGn",
    nan_fill_color="gray",
    nan_fill_opacity=0.4,
    fill_opacity=0.7,
    line_opacity=0.2,
    bins=9,
    legend_name="Mean Minimum GPA by State",
).add_to(m)

#m.save('slp_avg_by_state.html')
m