import pandas as pd
import numpy as np
import numpy
import re
%matplotlib inline
data_files = [
"ap_2010.csv",
"class_size.csv",
"demographics.csv",
"graduation.csv",
"hs_directory.csv",
"sat_results.csv"
]
data = {}
for f in data_files:
d = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\{0}".format(f))
data[f.replace(".csv", "")] = d
all_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)
survey["DBN"] = survey["dbn"]
survey_fields = [
"DBN",
"rr_s",
"rr_t",
"rr_p",
"N_s",
"N_t",
"N_p",
"saf_p_11",
"com_p_11",
"eng_p_11",
"aca_p_11",
"saf_t_11",
"com_t_11",
"eng_t_11",
"aca_t_11",
"saf_s_11",
"com_s_11",
"eng_s_11",
"aca_s_11",
"saf_tot_11",
"com_tot_11",
"eng_tot_11",
"aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]
def pad_csd(num):
string_representation = str(num)
if len(string_representation) > 1:
return string_representation
else:
return "0" + string_representation
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")
data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]
def find_lat(loc):
coords = re.findall("\(.+, .+\)", loc)
lat = coords[0].split(",")[0].replace("(", "")
return lat
def find_lon(loc):
coords = re.findall("\(.+, .+\)", loc)
lon = coords[0].split(",")[1].replace(")", "").strip()
return lon
data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)
data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]
class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']
for col in cols:
data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")
combined = data["sat_results"]
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")
to_merge = ["class_size", "demographics", "survey", "hs_directory"]
for m in to_merge:
combined = combined.merge(data[m], on="DBN", how="inner")
combined = combined.fillna(combined.mean())
combined = combined.fillna(0)
def get_first_two_chars(dbn):
return dbn[0:2]
combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN")
combined.corr()['sat_score'][survey_fields].plot.bar()
Looking at the bar plot above we can consider fields correlating highly with sat_score
if R value is above 0.23. The following fields seem to have the highest correlation: N-s, N-t, N_p, saf_t_11, saf_s_11, aca_s_11, saf_tot_11
.
When the correlation of the number of respondents (students, teachers, and parents) with sat_score
is not that surprising, there are three fields with high correlation that relate to safety and respect score at school. That means the relationship between safety and respect at school and SAT scores is something worth looking into.
combined.plot.scatter(x='saf_s_11', y='sat_score')
Above we plotted the saf_s_11
column vs. sat_score
and we can see that the correlation is not that strong since the dots are not aligned and we can see that even when the safety score is high at some schools the SAT score is still low.
Now, we will map out safety scores for each district by grouping combined
by school_dist
and compute the average values using agg
function and numpy.mean
function.
districts = combined.groupby('school_dist').agg(numpy.mean)
districts.reset_index(inplace=True)
districts
Now, we will plot the map of NYC and then we can map out the safety scores by districts on top of it using scatter plot which would help us explore the school districts with high/low safety scores.
# Plotting the map of NYC
import os
os.environ['PROJ_LIB'] = r'C:\Users\Alima\anaconda3\pkgs\proj4-5.2.0-ha925a31_1\Library\share'
from mpl_toolkits.basemap import Basemap
m = Basemap(
projection='merc',
llcrnrlat=40.496044,
urcrnrlat=40.915256,
llcrnrlon=-74.255735,
urcrnrlon=-73.700272,
resolution='i'
)
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)
longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()
m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts['saf_s_11'], cmap='summer')
If we take a look at our map, we can observe districts where schools with high safety score shaded in yellow. Most of them are located in Manhattan, Bronx and very little are in Queens and Brooklyn. Whereas the majority of schools with low safety score are saturated in Brooklyn, and very little in Bronx. Safety level of schools in Staten Island can be considered medium.
race = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
combined.corr()['sat_score'][race].plot.bar()
The bar plot demonstrates the racial inequality in SAT performance. The high percentage of white and asian students correlate positively with high SAT scores which means the students of these racial groups consistently perfrom better on the SAT, whereas the percentage of black and hispanic students correlate negatively. This inequality serves as an evidence that the SAT might be unfair towards certain racial groups.
Next step is to explore schools with low SAT scores and high values for hispanic_per
.
combined.plot.scatter(x='hispanic_per', y='sat_score')
The scatter plot above indicates that schools with high concentration of hispanic students (more than 50%) do not score higher than 1400 points on SAT. Only small fraction of schools with under 20% of hispanic students reach higher SAT scores.
We will research schools with a hispanic_per
greater than 95%:
# Filtering the dataframe
hispanic_high = combined[combined['hispanic_per'] > 95]
# Identifying school names
hispanic_high['SCHOOL NAME']
First, we will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned above.
We selected the following criteria that can aid us in getting some insights about the performance of the schools with high hispanic_per
:
Economic Need Index
Student Achievement Score
College Readiness Rate
Graduation Rate
# Creating a new dataframe
hispanic_high_perf = {'School': ['MANHATTAN BRIDGES HIGH SCHOOL','WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL','GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M...','ACADEMY FOR LANGUAGE AND TECHNOLOGY', 'INTERNATIONAL SCHOOL FOR LIBERAL ARTS', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE', 'MULTICULTURAL HIGH SCHOOL', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL'],
'Economic Need Index(%)': [91, 82, 99, 99, 98, 98, 99, 95],
'Student Achievement Score(/5)': [4.51, 4.51, 3.47, 4.75, 2.50, 4.89, 3.34, 4.66],
'College Readiness Rate(%)': [77, np.NaN, 48, 40, 32, 48, 25, 33],
'Graduation Rate(%)': [95, np.NaN, 87, 99, 65, 84, 81, 85]
}
performance_high = pd.DataFrame(hispanic_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_high
# Calculate the mean for College Readiness column
mean_col_read = performance_high['College Readiness Rate(%)'].mean()
# Fill in the missing values with the mean
performance_high['College Readiness Rate(%)'].fillna(mean_col_read)
# Calculate the mean for Graduation Rate column
mean_grad = performance_high['Graduation Rate(%)'].mean()
# Fill in the missing values with the mean
performance_high['Graduation Rate(%)'].fillna(mean_grad)
Now that our dataframe is ready, we can perform some manipulations and make observations.
# Plotting Economic Need Index for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
performance_high['Economic Need Index(%)'].describe()
All 8 schools have an extremely high Economic Need Index with 95% on average. Economic Need Index is an estimated percentage of students at school facing economic hardship, based on temporary housing, eligibility for public assistance, and Census tract poverty rates.
# Plotting Student Achievement Score for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Student Achievement Score(/5)', legend=False)
As the bar plot shows, the Student Achievement Scores
vary depending on the school with an average value of 4 out of 5 points. The poorest performance was demonstrated at INTERNATIONAL SCHOOL FOR LIBERAL ARTS
. This particular school is 100% hispanic students.
performance_high['College Readiness Rate(%)'].describe()
performance_high['Graduation Rate(%)'].describe()
The Graduation rate
among the schools with high percentage of hispanic students is about 85% on average. On the downside, the College Readiness Rate
is very low, only about 43% of students are ready to go to college which makes it unlikely for them to get high scores on SAT.
hispanic_per
and high SAT score¶Now we will reaserch the schools with hispanic_per
lower than 10%, and SAT scores greater than 1800.
# Filtering the dataframe
hispanic_low = combined[combined['hispanic_per'] < 10]
hispanic_low = combined[combined['sat_score'] > 1800]
# Identifying school names
hispanic_low['SCHOOL NAME']
We will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned below. Then, we will manipulate this dataframe to gain some insights about schools where hispanic representation is low and SAT scores are high.
hispanic_low_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE','STUYVESANT HIGH SCHOOL',' HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', ' TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
'Economic Need Index(%)': [40, 40, 40, 39, 26, 52, 39, 49, 35],
'Student Achievement Score(/5)': [4.49, 4.83, 4.65, 4.90, 4.86, 4.58, 4.90, 4.80, 4.77],
'College Readiness Rate(%)': [98, 99, 99, 100, 100, 97, 100, 100, 100],
'Graduation Rate(%)': [99, 99, 99, 100, 100, 97, 100, 100, 100]
}
performance_low = pd.DataFrame(hispanic_low_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_low
# Plotting Economic Need Index for each school on horizontal bar plot
performance_low.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
The bar plor above shows that on average only 40% of students fall into economically challenged category among all 9 schools unlike all hispanic schools we researched earlier.
performance_low['Student Achievement Score(/5)'].describe()
The average of Student Achievement Score is 4.75 points out of 5 which means students from these schools perform well academically.
performance_low['College Readiness Rate(%)'].describe()
All 9 schools indicate high levels of College Readiness Rate
, 99.22% on average to be precise, the same can be observed for Graduation Rate
since the values are almost identical. That means students from these schools will definitely graduate and they are better prepared to take college entry exams like SAT.
We will plot out the correlations between the percentages of each gender (male_per
and female_per
) and sat_score
.
gender = ['male_per', 'female_per']
combined.corr()['sat_score'][gender].plot.bar()
The bar plot above shows that the number of female students correlate positively with the SAT score, whereas the number of male students tend to correlate negatively. Could that mean girls in general do better on SAT? To find out, we would have to research more deeply the relationship between female_per
and sat_score
.
# Making a scatter plot of female_per vs. sat_score
combined.plot.scatter(x='female_per', y='sat_score')
The values seem to be plotted out sporadically and do not follow any particular pattern, therefore the correlation is rather weak.
# Filtering the dataframe
female_high = combined[combined['female_per'] > 60]
female_high = combined[combined['sat_score'] > 1700]
# Identifying school names
female_high['SCHOOL NAME']
We will follow the same approach as before by creating a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov.
# Creating a new dataframe
female_high_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE', 'ELEANOR ROOSEVELT HIGH SCHOOL', 'STUYVESANT HIGH SCHOOL', 'BEACON HIGH SCHOOL', 'FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & ARTS', 'HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', 'BROOKLYN LATIN SCHOOL, THE', 'TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
'Economic Need Index(%)': [40, 20, 40, 30, 30, 40, 39, 26, 52, 55, 39, 49, 35],
'Student Achievement Score(/5)': [4.49, 4.79, 4.83, 4.33, 4.69, 4.65, 4.90, 4.86, 4.58, 4.64, 4.90, 4.80, 4.77],
'College Readiness Rate(%)': [98, 100, 99, 97, 99, 99, 100, 100, 97, 97, 100, 100, 100],
'Graduation Rate(%)': [99, 100, 99, 98, 99, 99, 100, 100, 97, 97, 100, 100, 100]
}
female_perf = pd.DataFrame(female_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
female_perf
female_perf['Economic Need Index(%)'].describe()
female_perf['Student Achievement Score(/5)'].describe()
female_perf['College Readiness Rate(%)'].describe()
female_perf['Graduation Rate(%)'].describe()
After researching the data on schools where the majority of students are female, and SAT scores are high we can make the following observations:
The Economic Need Index
is relatively low, only 38% on average require funding Student Achievement Rate
These metrics indicate that these schools have good chances to get high scores on SAT.
First step is to calculate the percentage of students in each school that took an AP exam by dividing the AP Test Takers
column by the total_enrollment
column.
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
combined['ap_per']
Next step is to investigate the relationship between AP Scores and SAT Scores by making a scatter plot.
combined.plot.scatter(x= 'ap_per', y='sat_score')
According to the scatter plot the correlation between sat_score
and ap_per
doesn't seem to be so strong because schools with the higher percentage of AP test takers do not perform better on SAT.
class_size_cols = ['AVERAGE CLASS SIZE', 'NUMBER OF STUDENTS / SEATS FILLED' ]
combined.corr()['sat_score'][class_size_cols].plot.barh()
The bar plot shows that both average class size and the number of seats filled correlate positively with SAT scores, but to understand these relationships deeper we would plot each of the columns on a scatter plot.
combined.plot.scatter(x= 'NUMBER OF STUDENTS / SEATS FILLED', y='sat_score')
combined.plot.scatter(x= 'AVERAGE CLASS SIZE', y='sat_score')
Both scatter plots do not indicate very strong relationship between class size and SAT scores, so we will reseach how small and large classes perform academically and compare.
# Researching schools with small classes and high SAT scores
small_class = combined[combined['AVERAGE CLASS SIZE'] < 20 ]
small_class = combined[combined['sat_score'] > 1700]
small_class['SCHOOL NAME']
# Researching schools with large classes and high SAT scores
large_class = combined[combined['AVERAGE CLASS SIZE'] > 35 ]
large_class = combined[combined['sat_score'] > 1700]
large_class['SCHOOL NAME']
Notice that the aforementioned schools are the exact same schools that have high percentage of females and high SAT scores. We've taken a look at schools with both small and large classes that get high SAT scores, and the results are identical. That means the class size doesn't really affect the performance on SAT.