Read in the data

In [70]:
import pandas as pd
import numpy as np
import numpy
import re


%matplotlib inline 


data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv"
]

data = {}

for f in data_files:
    d = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\{0}".format(f))
    data[f.replace(".csv", "")] = d

Read in the surveys

In [71]:
all_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_all.txt", delimiter="\t", encoding='windows-1252')
d75_survey = pd.read_csv(r"C:\Users\Alima\OneDrive\Рабочий стол\PYTHON\Projects\SAT Schools Project\survey_d75.txt", delimiter="\t", encoding='windows-1252')
survey = pd.concat([all_survey, d75_survey], axis=0)

survey["DBN"] = survey["dbn"]

survey_fields = [
    "DBN", 
    "rr_s", 
    "rr_t", 
    "rr_p", 
    "N_s", 
    "N_t", 
    "N_p", 
    "saf_p_11", 
    "com_p_11", 
    "eng_p_11", 
    "aca_p_11", 
    "saf_t_11", 
    "com_t_11", 
    "eng_t_11", 
    "aca_t_11", 
    "saf_s_11", 
    "com_s_11", 
    "eng_s_11", 
    "aca_s_11", 
    "saf_tot_11", 
    "com_tot_11", 
    "eng_tot_11", 
    "aca_tot_11",
]
survey = survey.loc[:,survey_fields]
data["survey"] = survey

Add DBN columns

In [72]:
data["hs_directory"]["DBN"] = data["hs_directory"]["dbn"]

def pad_csd(num):
    string_representation = str(num)
    if len(string_representation) > 1:
        return string_representation
    else:
        return "0" + string_representation
    
data["class_size"]["padded_csd"] = data["class_size"]["CSD"].apply(pad_csd)
data["class_size"]["DBN"] = data["class_size"]["padded_csd"] + data["class_size"]["SCHOOL CODE"]

Convert columns to numeric

In [73]:
cols = ['SAT Math Avg. Score', 'SAT Critical Reading Avg. Score', 'SAT Writing Avg. Score']
for c in cols:
    data["sat_results"][c] = pd.to_numeric(data["sat_results"][c], errors="coerce")

data['sat_results']['sat_score'] = data['sat_results'][cols[0]] + data['sat_results'][cols[1]] + data['sat_results'][cols[2]]

def find_lat(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    coords = re.findall("\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

data["hs_directory"]["lat"] = data["hs_directory"]["Location 1"].apply(find_lat)
data["hs_directory"]["lon"] = data["hs_directory"]["Location 1"].apply(find_lon)

data["hs_directory"]["lat"] = pd.to_numeric(data["hs_directory"]["lat"], errors="coerce")
data["hs_directory"]["lon"] = pd.to_numeric(data["hs_directory"]["lon"], errors="coerce")

Condense datasets

In [74]:
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

class_size = class_size.groupby("DBN").agg(numpy.mean)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

Convert AP scores to numeric

In [75]:
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

for col in cols:
    data["ap_2010"][col] = pd.to_numeric(data["ap_2010"][col], errors="coerce")

Combine the datasets

In [76]:
combined = data["sat_results"]

combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]

for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

combined = combined.fillna(combined.mean())
combined = combined.fillna(0)

Add a school district column for mapping

In [77]:
def get_first_two_chars(dbn):
    return dbn[0:2]

combined["school_dist"] = combined["DBN"].apply(get_first_two_chars)

Find correlations

In [78]:
correlations = combined.corr()
correlations = correlations["sat_score"]
print(correlations)
SAT Critical Reading Avg. Score    0.986820
SAT Math Avg. Score                0.972643
SAT Writing Avg. Score             0.987771
sat_score                          1.000000
AP Test Takers                     0.523140
                                     ...   
Census Tract                       0.048737
BIN                                0.052232
BBL                                0.044427
lat                               -0.121029
lon                               -0.132222
Name: sat_score, Length: 85, dtype: float64

Plotting survey correlations

In [79]:
# Remove DBN since it's a unique identifier, not a useful numerical value for correlation.
survey_fields.remove("DBN") 
In [80]:
combined.corr()['sat_score'][survey_fields].plot.bar()
Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x286597c5748>

Looking at the bar plot above we can consider fields correlating highly with sat_score if R value is above 0.23. The following fields seem to have the highest correlation: N-s, N-t, N_p, saf_t_11, saf_s_11, aca_s_11, saf_tot_11. When the correlation of the number of respondents (students, teachers, and parents) with sat_score is not that surprising, there are three fields with high correlation that relate to safety and respect score at school. That means the relationship between safety and respect at school and SAT scores is something worth looking into.

Exploring Safety and SAT Scores

In [81]:
combined.plot.scatter(x='saf_s_11', y='sat_score')
Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb9cfc8>

Above we plotted the saf_s_11 column vs. sat_score and we can see that the correlation is not that strong since the dots are not aligned and we can see that even when the safety score is high at some schools the SAT score is still low.

Now, we will map out safety scores for each district by grouping combined by school_dist and compute the average values using agg function and numpy.mean function.

In [82]:
districts = combined.groupby('school_dist').agg(numpy.mean)
districts.reset_index(inplace=True)
districts
Out[82]:
school_dist SAT Critical Reading Avg. Score SAT Math Avg. Score SAT Writing Avg. Score sat_score AP Test Takers Total Exams Taken Number of Exams with scores 3 4 or 5 Total Cohort Total Grads - % of cohort ... priority08 priority09 priority10 Community Board Council District Census Tract BIN BBL lat lon
0 01 441.833333 473.333333 439.333333 1354.500000 116.681090 173.019231 135.800000 93.500000 71.333333 ... 0.0 0.0 0.0 3.000000 1.500000 1106.833333 1.004144e+06 1.003302e+09 40.719022 -73.982377
1 02 426.619092 444.186256 424.832836 1295.638184 128.908454 201.516827 157.495833 158.647849 75.453574 ... 0.0 0.0 0.0 4.204005 2.963296 1132.479744 1.103862e+06 1.070438e+09 40.739699 -73.991386
2 03 428.529851 437.997512 426.915672 1293.443035 156.183494 244.522436 193.087500 183.384409 74.029937 ... 0.0 0.0 0.0 7.500000 6.500000 166.666667 1.034931e+06 1.012833e+09 40.781574 -73.977370
3 04 402.142857 416.285714 405.714286 1224.142857 129.016484 183.879121 151.035714 113.857143 76.257143 ... 0.0 0.0 0.0 11.000000 8.000000 2637.000000 1.055874e+06 1.016681e+09 40.793449 -73.943215
4 05 427.159915 438.236674 419.666098 1285.062687 85.722527 115.725275 142.464286 143.677419 75.200881 ... 0.0 0.0 0.0 9.428571 8.142857 9372.571429 1.063080e+06 1.019721e+09 40.817077 -73.949251
5 06 382.011940 400.565672 382.066269 1164.643881 108.711538 159.715385 105.425000 180.848387 69.811233 ... 0.0 0.0 0.0 11.179224 10.823823 2765.702770 1.226037e+06 1.170781e+09 40.848970 -73.932502
6 07 376.461538 380.461538 371.923077 1128.846154 73.703402 112.476331 105.276923 105.605459 67.146628 ... 0.0 0.0 0.0 1.692308 12.846154 58.538462 2.032562e+06 2.024050e+09 40.816815 -73.919971
7 08 386.214383 395.542741 377.908005 1159.665129 118.379371 168.020979 144.731818 215.510264 59.514477 ... 0.0 0.0 0.0 7.272727 16.727273 67.909091 2.037043e+06 2.036723e+09 40.823803 -73.866087
8 09 373.755970 383.582836 374.633134 1131.971940 71.411538 104.265385 98.470000 113.330645 67.607962 ... 0.0 0.0 0.0 3.550000 15.750000 5243.750000 2.022457e+06 2.027913e+09 40.836349 -73.906240
9 10 403.363636 418.000000 400.863636 1222.227273 132.231206 226.914336 191.618182 161.318182 68.656958 ... 0.0 0.0 0.0 7.000000 12.227273 1686.045455 1.856268e+06 1.850558e+09 40.870345 -73.898360
10 11 389.866667 394.533333 380.600000 1165.000000 83.813462 122.484615 108.833333 122.866667 70.713333 ... 0.0 0.0 0.0 11.466667 12.266667 6469.733333 2.064514e+06 2.046691e+09 40.873138 -73.856120
11 12 364.769900 379.109453 357.943781 1101.823134 93.102564 139.442308 153.450000 110.467742 65.459361 ... 0.0 0.0 0.0 5.083333 17.000000 120.083333 2.024516e+06 2.032603e+09 40.831412 -73.886946
12 13 409.393800 424.127440 403.666361 1237.187600 232.931953 382.704142 320.773077 224.595533 74.685090 ... 0.0 0.0 0.0 2.615385 33.846154 100.846154 3.149543e+06 3.009073e+09 40.692865 -73.977016
13 14 395.937100 398.189765 385.333049 1179.459915 77.798077 114.873626 123.282143 112.347926 69.436155 ... 0.0 0.0 0.0 1.142857 33.642857 491.142857 3.142769e+06 3.026164e+09 40.711599 -73.948360
14 15 395.679934 404.628524 390.295854 1190.604312 94.574786 141.581197 153.450000 104.207885 64.505898 ... 0.0 0.0 0.0 5.666667 36.222222 95.333333 3.091899e+06 3.006329e+09 40.675972 -73.989255
15 16 371.529851 379.164179 369.415672 1120.109701 82.264423 126.519231 153.450000 247.185484 64.326541 ... 0.0 0.0 0.0 3.000000 37.250000 307.000000 3.131325e+06 3.016038e+09 40.688008 -73.929686
16 17 386.571429 394.071429 380.785714 1161.428571 105.583791 163.087912 111.360714 121.357143 68.564286 ... 0.0 0.0 0.0 10.785714 37.857143 556.142857 3.178521e+06 3.033695e+09 40.660313 -73.955636
17 18 373.454545 373.090909 371.454545 1118.000000 129.028846 197.038462 153.450000 72.771261 70.453082 ... 0.0 0.0 0.0 17.636364 45.272727 950.727273 3.182085e+06 3.068627e+09 40.641863 -73.914726
18 19 367.083333 377.583333 359.166667 1103.833333 88.097756 124.769231 120.670833 114.322581 61.209361 ... 0.0 0.0 0.0 5.000000 39.500000 1158.833333 3.108754e+06 3.039994e+09 40.676547 -73.882158
19 20 406.223881 465.731343 401.732537 1273.687761 227.805769 359.407692 177.690000 591.374194 65.830616 ... 0.0 0.0 0.0 11.000000 43.400000 199.200000 3.143345e+06 3.058004e+09 40.626751 -74.006191
20 21 395.283582 421.786974 389.242062 1206.312619 135.467657 203.835664 142.377273 275.351906 62.987204 ... 0.0 0.0 0.0 12.909091 46.818182 360.727273 3.187467e+06 3.069480e+09 40.593596 -73.978465
21 22 473.500000 502.750000 474.250000 1450.500000 391.007212 614.509615 370.362500 580.250000 86.325000 ... 0.0 0.0 0.0 12.750000 45.000000 633.500000 3.260954e+06 3.071058e+09 40.618285 -73.952288
22 23 380.666667 398.666667 378.000000 1157.333333 29.000000 31.000000 153.450000 87.000000 68.766667 ... 0.0 0.0 0.0 16.000000 41.000000 12783.000000 3.067259e+06 3.028433e+09 40.668586 -73.912298
23 24 405.846154 434.000000 402.153846 1242.000000 126.474852 179.094675 115.165385 234.682382 69.546628 ... 0.0 0.0 0.0 2.846154 25.692308 3626.538462 4.116109e+06 4.009562e+09 40.740621 -73.911518
24 25 437.250000 483.500000 436.250000 1357.000000 205.260817 279.889423 174.793750 268.733871 71.356635 ... 0.0 0.0 0.0 8.000000 21.875000 10598.875000 4.217178e+06 4.058888e+09 40.745414 -73.815558
25 26 445.200000 487.600000 444.800000 1377.600000 410.605769 632.407692 392.090000 825.600000 76.180000 ... 0.0 0.0 0.0 11.800000 21.600000 28776.600000 4.228215e+06 4.072510e+09 40.748507 -73.759176
26 27 407.800000 422.200000 394.300000 1224.300000 100.611538 145.315385 95.125000 288.961290 69.831233 ... 0.0 0.0 0.0 12.000000 30.500000 39899.900000 4.285880e+06 4.131440e+09 40.638828 -73.807823
27 28 445.941655 465.997286 435.908005 1347.846947 182.010490 273.559441 175.336364 351.214076 77.315037 ... 0.0 0.0 0.0 8.727273 25.818182 498.272727 4.336396e+06 4.086008e+09 40.709344 -73.806367
28 29 395.764925 399.457090 386.707836 1181.929851 63.385817 96.514423 135.268750 98.108871 73.044135 ... 0.0 0.0 0.0 12.250000 28.500000 13638.750000 4.256972e+06 4.119641e+09 40.685276 -73.752740
29 30 430.679934 465.961857 429.740299 1326.382090 157.231838 252.123932 115.150000 310.526882 73.611796 ... 0.0 0.0 0.0 1.222222 25.111111 52.777778 4.166018e+06 4.004830e+09 40.755398 -73.932306
30 31 457.500000 472.500000 452.500000 1382.500000 228.908654 355.111538 194.435000 450.787097 77.345308 ... 0.0 0.0 0.0 1.800000 50.000000 6752.500000 5.093752e+06 5.022487e+09 40.595680 -74.125726
31 32 371.500000 385.833333 362.166667 1119.500000 70.342949 100.179487 83.558333 105.333333 65.466667 ... 0.0 0.0 0.0 4.000000 36.000000 420.666667 3.131944e+06 3.033185e+09 40.696295 -73.917124

32 rows × 86 columns

Now, we will plot the map of NYC and then we can map out the safety scores by districts on top of it using scatter plot which would help us explore the school districts with high/low safety scores.

In [83]:
# Plotting the map of NYC 

import os
os.environ['PROJ_LIB'] = r'C:\Users\Alima\anaconda3\pkgs\proj4-5.2.0-ha925a31_1\Library\share'
from mpl_toolkits.basemap import Basemap

m = Basemap(
    projection='merc', 
    llcrnrlat=40.496044, 
    urcrnrlat=40.915256, 
    llcrnrlon=-74.255735, 
    urcrnrlon=-73.700272,
    resolution='i'
)

m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(color='#6D5F47', linewidth=.4)
m.drawrivers(color='#6D5F47', linewidth=.4)

longitudes = districts['lon'].tolist()
latitudes = districts['lat'].tolist()

m.scatter(longitudes, latitudes, s=50, zorder=2, latlon=True, c=districts['saf_s_11'], cmap='summer')
C:\Users\Alima\anaconda3\lib\site-packages\ipykernel_launcher.py:13: MatplotlibDeprecationWarning: 
The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.
  del sys.path[0]
C:\Users\Alima\anaconda3\lib\site-packages\ipykernel_launcher.py:18: MatplotlibDeprecationWarning: 
The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.
Out[83]:
<matplotlib.collections.PathCollection at 0x2865bb6c748>

If we take a look at our map, we can observe districts where schools with high safety score shaded in yellow. Most of them are located in Manhattan, Bronx and very little are in Queens and Brooklyn. Whereas the majority of schools with low safety score are saturated in Brooklyn, and very little in Bronx. Safety level of schools in Staten Island can be considered medium.

Exploring Race and SAT Scores

Plotting the correlation between race and SAT score

By plotting out the correlations between columns that represent the percentage of each race at a given school and sat_score, we can determine whether there are any racial differences in SAT performance.

In [84]:
race = ['white_per', 'asian_per', 'black_per', 'hispanic_per']
combined.corr()['sat_score'][race].plot.bar()
Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb66908>

The bar plot demonstrates the racial inequality in SAT performance. The high percentage of white and asian students correlate positively with high SAT scores which means the students of these racial groups consistently perfrom better on the SAT, whereas the percentage of black and hispanic students correlate negatively. This inequality serves as an evidence that the SAT might be unfair towards certain racial groups.

Exploring SAT scores among hispanic students

Next step is to explore schools with low SAT scores and high values for hispanic_per.

In [85]:
combined.plot.scatter(x='hispanic_per', y='sat_score')
Out[85]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb45648>

The scatter plot above indicates that schools with high concentration of hispanic students (more than 50%) do not score higher than 1400 points on SAT. Only small fraction of schools with under 20% of hispanic students reach higher SAT scores.

Researching schools with a hispanic_per greater than 95%

We will research schools with a hispanic_per greater than 95%:

  • find their school names
  • research them using Google search
  • see if there's anything interesting about these particulat schools
In [86]:
# Filtering the dataframe 
hispanic_high = combined[combined['hispanic_per'] > 95]

# Identifying school names 
hispanic_high['SCHOOL NAME']
Out[86]:
44                         MANHATTAN BRIDGES HIGH SCHOOL
82      WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL
89     GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M...
125                  ACADEMY FOR LANGUAGE AND TECHNOLOGY
141                INTERNATIONAL SCHOOL FOR LIBERAL ARTS
176     PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE
253                            MULTICULTURAL HIGH SCHOOL
286               PAN AMERICAN INTERNATIONAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

First, we will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned above.

We selected the following criteria that can aid us in getting some insights about the performance of the schools with high hispanic_per:

  • Economic Need Index
  • Student Achievement Score
  • College Readiness Rate
  • Graduation Rate
In [87]:
# Creating a new dataframe
hispanic_high_perf = {'School': ['MANHATTAN BRIDGES HIGH SCHOOL','WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL','GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M...','ACADEMY FOR LANGUAGE AND TECHNOLOGY', 'INTERNATIONAL SCHOOL FOR LIBERAL ARTS', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE', 'MULTICULTURAL HIGH SCHOOL', 'PAN AMERICAN INTERNATIONAL HIGH SCHOOL'],
        'Economic Need Index(%)': [91, 82, 99, 99, 98, 98, 99, 95], 
        'Student Achievement Score(/5)': [4.51, 4.51, 3.47, 4.75, 2.50, 4.89, 3.34, 4.66], 
        'College Readiness Rate(%)': [77, np.NaN, 48, 40, 32, 48, 25, 33], 
        'Graduation Rate(%)': [95, np.NaN, 87, 99, 65, 84, 81, 85]
        }

performance_high = pd.DataFrame(hispanic_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_high
Out[87]:
School Economic Need Index(%) Student Achievement Score(/5) College Readiness Rate(%) Graduation Rate(%)
0 MANHATTAN BRIDGES HIGH SCHOOL 91 4.51 77.0 95.0
1 WASHINGTON HEIGHTS EXPEDITIONARY LEARNING SCHOOL 82 4.51 NaN NaN
2 GREGORIO LUPERON HIGH SCHOOL FOR SCIENCE AND M... 99 3.47 48.0 87.0
3 ACADEMY FOR LANGUAGE AND TECHNOLOGY 99 4.75 40.0 99.0
4 INTERNATIONAL SCHOOL FOR LIBERAL ARTS 98 2.50 32.0 65.0
5 PAN AMERICAN INTERNATIONAL HIGH SCHOOL AT MONROE 98 4.89 48.0 84.0
6 MULTICULTURAL HIGH SCHOOL 99 3.34 25.0 81.0
7 PAN AMERICAN INTERNATIONAL HIGH SCHOOL 95 4.66 33.0 85.0
In [88]:
# Calculate the mean for College Readiness column
mean_col_read = performance_high['College Readiness Rate(%)'].mean()

# Fill in the missing values with the mean
performance_high['College Readiness Rate(%)'].fillna(mean_col_read)
Out[88]:
0    77.000000
1    43.285714
2    48.000000
3    40.000000
4    32.000000
5    48.000000
6    25.000000
7    33.000000
Name: College Readiness Rate(%), dtype: float64
In [89]:
# Calculate the mean for Graduation Rate column
mean_grad = performance_high['Graduation Rate(%)'].mean()

# Fill in the missing values with the mean
performance_high['Graduation Rate(%)'].fillna(mean_grad)
Out[89]:
0    95.000000
1    85.142857
2    87.000000
3    99.000000
4    65.000000
5    84.000000
6    81.000000
7    85.000000
Name: Graduation Rate(%), dtype: float64

Now that our dataframe is ready, we can perform some manipulations and make observations.

In [90]:
# Plotting Economic Need Index for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
Out[90]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb239c8>
In [91]:
performance_high['Economic Need Index(%)'].describe()
Out[91]:
count     8.000000
mean     95.125000
std       5.986592
min      82.000000
25%      94.000000
50%      98.000000
75%      99.000000
max      99.000000
Name: Economic Need Index(%), dtype: float64

All 8 schools have an extremely high Economic Need Index with 95% on average. Economic Need Index is an estimated percentage of students at school facing economic hardship, based on temporary housing, eligibility for public assistance, and Census tract poverty rates.

In [92]:
# Plotting Student Achievement Score for each school on horizontal bar plot
performance_high.plot.barh(x='School', y='Student Achievement Score(/5)', legend=False)
Out[92]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bb05f08>

As the bar plot shows, the Student Achievement Scores vary depending on the school with an average value of 4 out of 5 points. The poorest performance was demonstrated at INTERNATIONAL SCHOOL FOR LIBERAL ARTS. This particular school is 100% hispanic students.

In [93]:
performance_high['College Readiness Rate(%)'].describe()
Out[93]:
count     7.000000
mean     43.285714
std      17.124196
min      25.000000
25%      32.500000
50%      40.000000
75%      48.000000
max      77.000000
Name: College Readiness Rate(%), dtype: float64
In [94]:
performance_high['Graduation Rate(%)'].describe()
Out[94]:
count     7.000000
mean     85.142857
std      10.930516
min      65.000000
25%      82.500000
50%      85.000000
75%      91.000000
max      99.000000
Name: Graduation Rate(%), dtype: float64

The Graduation rate among the schools with high percentage of hispanic students is about 85% on average. On the downside, the College Readiness Rate is very low, only about 43% of students are ready to go to college which makes it unlikely for them to get high scores on SAT.

Researching schools with low hispanic_per and high SAT score

Now we will reaserch the schools with hispanic_per lower than 10%, and SAT scores greater than 1800.

In [95]:
# Filtering the dataframe 
hispanic_low = combined[combined['hispanic_per'] < 10]
hispanic_low = combined[combined['sat_score'] > 1800]

# Identifying school names 
hispanic_low['SCHOOL NAME']
Out[95]:
5                         BARD HIGH SCHOOL EARLY COLLEGE
37                                STUYVESANT HIGH SCHOOL
79     HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...
151                         BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE
187                       BROOKLYN TECHNICAL HIGH SCHOOL
302                          TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...
356                  STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

We will create a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov for the schools mentioned below. Then, we will manipulate this dataframe to gain some insights about schools where hispanic representation is low and SAT scores are high.

In [96]:
hispanic_low_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE','STUYVESANT HIGH SCHOOL',' HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', ' TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
        'Economic Need Index(%)': [40, 40, 40, 39, 26, 52, 39, 49, 35], 
        'Student Achievement Score(/5)': [4.49, 4.83, 4.65, 4.90, 4.86, 4.58, 4.90, 4.80, 4.77], 
        'College Readiness Rate(%)': [98, 99, 99, 100, 100, 97, 100, 100, 100], 
        'Graduation Rate(%)': [99, 99, 99, 100, 100, 97, 100, 100, 100]
        }

performance_low = pd.DataFrame(hispanic_low_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
performance_low
Out[96]:
School Economic Need Index(%) Student Achievement Score(/5) College Readiness Rate(%) Graduation Rate(%)
0 BARD HIGH SCHOOL EARLY COLLEGE 40 4.49 98 99
1 STUYVESANT HIGH SCHOOL 40 4.83 99 99
2 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGI... 40 4.65 99 99
3 BRONX HIGH SCHOOL OF SCIENCE 39 4.90 100 100
4 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 26 4.86 100 100
5 BROOKLYN TECHNICAL HIGH SCHOOL 52 4.58 97 97
6 TOWNSEND HARRIS HIGH SCHOOL 39 4.90 100 100
7 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK C... 49 4.80 100 100
8 STATEN ISLAND TECHNICAL HIGH SCHOOL 35 4.77 100 100
In [97]:
# Plotting Economic Need Index for each school on horizontal bar plot
performance_low.plot.barh(x='School', y='Economic Need Index(%)', legend=False)
Out[97]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bae7a08>

The bar plor above shows that on average only 40% of students fall into economically challenged category among all 9 schools unlike all hispanic schools we researched earlier.

In [98]:
performance_low['Student Achievement Score(/5)'].describe()
Out[98]:
count    9.000000
mean     4.753333
std      0.146969
min      4.490000
25%      4.650000
50%      4.800000
75%      4.860000
max      4.900000
Name: Student Achievement Score(/5), dtype: float64

The average of Student Achievement Score is 4.75 points out of 5 which means students from these schools perform well academically.

In [99]:
performance_low['College Readiness Rate(%)'].describe()
Out[99]:
count      9.000000
mean      99.222222
std        1.092906
min       97.000000
25%       99.000000
50%      100.000000
75%      100.000000
max      100.000000
Name: College Readiness Rate(%), dtype: float64

All 9 schools indicate high levels of College Readiness Rate, 99.22% on average to be precise, the same can be observed for Graduation Rate since the values are almost identical. That means students from these schools will definitely graduate and they are better prepared to take college entry exams like SAT.

Exploring Gender and SAT Scores

We will plot out the correlations between the percentages of each gender (male_per and female_per) and sat_score.

In [100]:
gender = ['male_per', 'female_per']
combined.corr()['sat_score'][gender].plot.bar()
Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x286599f4ec8>

The bar plot above shows that the number of female students correlate positively with the SAT score, whereas the number of male students tend to correlate negatively. Could that mean girls in general do better on SAT? To find out, we would have to research more deeply the relationship between female_per and sat_score.

In [101]:
# Making a scatter plot of female_per vs. sat_score
combined.plot.scatter(x='female_per', y='sat_score')
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x28659d02248>

The values seem to be plotted out sporadically and do not follow any particular pattern, therefore the correlation is rather weak.

In [102]:
# Filtering the dataframe 
female_high = combined[combined['female_per'] > 60]
female_high = combined[combined['sat_score'] > 1700]

# Identifying school names 
female_high['SCHOOL NAME']
Out[102]:
5                         BARD HIGH SCHOOL EARLY COLLEGE
26                         ELEANOR ROOSEVELT HIGH SCHOOL
37                                STUYVESANT HIGH SCHOOL
60                                    BEACON HIGH SCHOOL
61     FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...
79     HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...
151                         BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE
187                       BROOKLYN TECHNICAL HIGH SCHOOL
198                           BROOKLYN LATIN SCHOOL, THE
302                          TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...
356                  STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

We will follow the same approach as before by creating a new dataframe using the data from "School Performance Dashboard" from https://www.schools.nyc.gov.

In [103]:
# Creating a new dataframe
female_high_perf = {'School': ['BARD HIGH SCHOOL EARLY COLLEGE', 'ELEANOR ROOSEVELT HIGH SCHOOL', 'STUYVESANT HIGH SCHOOL', 'BEACON HIGH SCHOOL', 'FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & ARTS', 'HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGINERING','BRONX HIGH SCHOOL OF SCIENCE', 'HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE', 'BROOKLYN TECHNICAL HIGH SCHOOL', 'BROOKLYN LATIN SCHOOL, THE', 'TOWNSEND HARRIS HIGH SCHOOL', ' QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK COLLEGE', ' STATEN ISLAND TECHNICAL HIGH SCHOOL' ],
        'Economic Need Index(%)': [40, 20, 40, 30, 30, 40, 39, 26, 52, 55, 39, 49, 35], 
        'Student Achievement Score(/5)': [4.49, 4.79, 4.83, 4.33, 4.69, 4.65, 4.90, 4.86, 4.58, 4.64, 4.90, 4.80, 4.77], 
        'College Readiness Rate(%)': [98, 100, 99, 97, 99, 99, 100, 100, 97, 97, 100, 100, 100], 
        'Graduation Rate(%)': [99, 100, 99, 98, 99, 99, 100, 100, 97, 97, 100, 100, 100]
        }

female_perf = pd.DataFrame(female_high_perf, columns = ['School', 'Economic Need Index(%)', 'Student Achievement Score(/5)', 'College Readiness Rate(%)', 'Graduation Rate(%)'])
female_perf
Out[103]:
School Economic Need Index(%) Student Achievement Score(/5) College Readiness Rate(%) Graduation Rate(%)
0 BARD HIGH SCHOOL EARLY COLLEGE 40 4.49 98 99
1 ELEANOR ROOSEVELT HIGH SCHOOL 20 4.79 100 100
2 STUYVESANT HIGH SCHOOL 40 4.83 99 99
3 BEACON HIGH SCHOOL 30 4.33 97 98
4 FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & ARTS 30 4.69 99 99
5 HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN... 40 4.65 99 99
6 BRONX HIGH SCHOOL OF SCIENCE 39 4.90 100 100
7 HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE 26 4.86 100 100
8 BROOKLYN TECHNICAL HIGH SCHOOL 52 4.58 97 97
9 BROOKLYN LATIN SCHOOL, THE 55 4.64 97 97
10 TOWNSEND HARRIS HIGH SCHOOL 39 4.90 100 100
11 QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK C... 49 4.80 100 100
12 STATEN ISLAND TECHNICAL HIGH SCHOOL 35 4.77 100 100
In [104]:
female_perf['Economic Need Index(%)'].describe()
Out[104]:
count    13.000000
mean     38.076923
std      10.103312
min      20.000000
25%      30.000000
50%      39.000000
75%      40.000000
max      55.000000
Name: Economic Need Index(%), dtype: float64
In [105]:
female_perf['Student Achievement Score(/5)'].describe()
Out[105]:
count    13.000000
mean      4.710000
std       0.169755
min       4.330000
25%       4.640000
50%       4.770000
75%       4.830000
max       4.900000
Name: Student Achievement Score(/5), dtype: float64
In [106]:
female_perf['College Readiness Rate(%)'].describe()
Out[106]:
count     13.000000
mean      98.923077
std        1.255756
min       97.000000
25%       98.000000
50%       99.000000
75%      100.000000
max      100.000000
Name: College Readiness Rate(%), dtype: float64
In [107]:
female_perf['Graduation Rate(%)'].describe()
Out[107]:
count     13.000000
mean      99.076923
std        1.115164
min       97.000000
25%       99.000000
50%       99.000000
75%      100.000000
max      100.000000
Name: Graduation Rate(%), dtype: float64

After researching the data on schools where the majority of students are female, and SAT scores are high we can make the following observations:

  • The Economic Need Index is relatively low, only 38% on average require funding
  • Great academic performance demonstrated by high Student Achievement Rate
  • At least 98.9% of students graduate and ready for college.

These metrics indicate that these schools have good chances to get high scores on SAT.

Exploring AP Scores vs. SAT Scores

First step is to calculate the percentage of students in each school that took an AP exam by dividing the AP Test Takers column by the total_enrollment column.

In [108]:
combined['ap_per'] = combined['AP Test Takers '] / combined['total_enrollment']
combined['ap_per']
Out[108]:
0      0.305756
1      0.098985
2      0.031773
3      0.351577
4      0.158091
         ...   
358    0.077558
359    0.297301
360    0.190955
361    0.026616
362    0.073593
Name: ap_per, Length: 363, dtype: float64

Next step is to investigate the relationship between AP Scores and SAT Scores by making a scatter plot.

In [109]:
combined.plot.scatter(x= 'ap_per', y='sat_score')
Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bacc248>

According to the scatter plot the correlation between sat_score and ap_per doesn't seem to be so strong because schools with the higher percentage of AP test takers do not perform better on SAT.

Further Analysis

Exploring Class Size vs. SAT Score

We are going to determine wheter there's a correlation between class size and SAT scores.

In [110]:
class_size_cols = ['AVERAGE CLASS SIZE', 'NUMBER OF STUDENTS / SEATS FILLED' ]
combined.corr()['sat_score'][class_size_cols].plot.barh()
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865bab89c8>

The bar plot shows that both average class size and the number of seats filled correlate positively with SAT scores, but to understand these relationships deeper we would plot each of the columns on a scatter plot.

In [111]:
combined.plot.scatter(x= 'NUMBER OF STUDENTS / SEATS FILLED', y='sat_score')
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865ba97688>
In [112]:
combined.plot.scatter(x= 'AVERAGE CLASS SIZE', y='sat_score')
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x2865ba7b108>

Both scatter plots do not indicate very strong relationship between class size and SAT scores, so we will reseach how small and large classes perform academically and compare.

In [113]:
# Researching schools with small classes and high SAT scores
small_class = combined[combined['AVERAGE CLASS SIZE'] < 20 ]
small_class = combined[combined['sat_score'] > 1700]
small_class['SCHOOL NAME']
Out[113]:
5                         BARD HIGH SCHOOL EARLY COLLEGE
26                         ELEANOR ROOSEVELT HIGH SCHOOL
37                                STUYVESANT HIGH SCHOOL
60                                    BEACON HIGH SCHOOL
61     FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...
79     HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...
151                         BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE
187                       BROOKLYN TECHNICAL HIGH SCHOOL
198                           BROOKLYN LATIN SCHOOL, THE
302                          TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...
356                  STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object
In [114]:
# Researching schools with large classes and high SAT scores
large_class = combined[combined['AVERAGE CLASS SIZE'] > 35 ]
large_class = combined[combined['sat_score'] > 1700]
large_class['SCHOOL NAME']
Out[114]:
5                         BARD HIGH SCHOOL EARLY COLLEGE
26                         ELEANOR ROOSEVELT HIGH SCHOOL
37                                STUYVESANT HIGH SCHOOL
60                                    BEACON HIGH SCHOOL
61     FIORELLO H. LAGUARDIA HIGH SCHOOL OF MUSIC & A...
79     HIGH SCHOOL FOR MATHEMATICS, SCIENCE AND ENGIN...
151                         BRONX HIGH SCHOOL OF SCIENCE
155    HIGH SCHOOL OF AMERICAN STUDIES AT LEHMAN COLLEGE
187                       BROOKLYN TECHNICAL HIGH SCHOOL
198                           BROOKLYN LATIN SCHOOL, THE
302                          TOWNSEND HARRIS HIGH SCHOOL
327    QUEENS HIGH SCHOOL FOR THE SCIENCES AT YORK CO...
356                  STATEN ISLAND TECHNICAL HIGH SCHOOL
Name: SCHOOL NAME, dtype: object

Notice that the aforementioned schools are the exact same schools that have high percentage of females and high SAT scores. We've taken a look at schools with both small and large classes that get high SAT scores, and the results are identical. That means the class size doesn't really affect the performance on SAT.