%matplotlib inline
# from pylab import *
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
data = pd.read_csv('Chicago_Public_Schools_-_Elementary_School_Progress_Report__2013-2014_.csv',index_col='School ID')
# import progress report data, titles are included in the file
#data = pd.read_csv('Chicago_Public_Schools_-_Elementary_School_Progress_Report__2013-2014_.csv').dropna()
data.head(5) # print the first 5 rows
Name of School | Street Address | City | State | ZIP Code | Phone Number | Website | Blue Ribbon Award | CPS Performance Policy Level | CPS Performance Policy Status | Probation Length | My Voice, My School Overall Rating | Student Response Rate | Teacher Response Rate | Involved Family | Supportive Environment | Ambitious Instruction | Effective Leaders | Collaborative Teachers | Safe | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
School ID | |||||||||||||||||||||
400009 | Academy for Global Citizenship Elementary School | 4647 W 47th St | Chicago | IL | 60632 | (773) 582-1100 | http://www.agcchicago.org/ | NaN | LEVEL 1 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 0% | 47.6% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400011 | Alain Locke Charter Elementary Academy | 3141 W Jackson Blvd | Chicago | IL | 60612 | (773) 265-7232 | http://www.alainlocke.org | NaN | LEVEL 1 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 0.9% | 0% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400017 | ASPIRA Charter - Haugan Campus | 3729 W Leland Ave | Chicago | IL | 60625 | (773) 252-0970 | http://www.aspirail.org | NaN | LEVEL 2 | NOT APPLICABLE | NaN | ORGANIZED | 93.2% | 86.8% | NEUTRAL | STRONG | STRONG | NEUTRAL | NEUTRAL | NEUTRAL | ... |
400019 | Bronzeville Lighthouse Charter Elementary School | 8 W Root St | Chicago | IL | 60609 | (773) 535-1460 | http://www.lighthouse-academies.org/BZLCS.htm | NaN | LEVEL 3 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 2.4% | 0% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400020 | Catalyst Charter ES - Howland | 1616 S Spaulding Ave | Chicago | IL | 60623 | (773) 527-7330 | http://www.catalystschools.org | NaN | LEVEL 3 | NOT APPLICABLE | NaN | PARTIALLY ORGANIZED | 65.2% | 84% | VERY WEAK | STRONG | NEUTRAL | VERY WEAK | VERY WEAK | NEUTRAL | ... |
5 rows × 73 columns
print data.describe()
school_zip Blue Ribbon Award prob_length rea_grw_perc_all \ count 483.000000 3.000000 158.000000 442.000000 mean 60630.784679 2012.333333 5.303797 60.237557 std 23.354458 1.154701 3.347608 36.393480 min 60605.000000 2011.000000 1.000000 0.000000 25% 60618.000000 2012.000000 2.000000 24.000000 50% 60628.000000 2013.000000 6.000000 72.500000 75% 60640.500000 2013.000000 7.000000 95.000000 max 60827.000000 2013.000000 18.000000 99.000000 NWEA Reading Growth Percentile Grade 3 \ count 425.000000 mean 53.555294 std 32.516213 min 0.000000 25% 24.000000 50% 55.000000 75% 84.000000 max 99.000000 NWEA Reading Growth Percentile Grade 4 \ count 420.000000 mean 55.657143 std 33.530331 min 0.000000 25% 27.000000 50% 57.000000 75% 89.250000 max 99.000000 NWEA Reading Growth Percentile Grade 5 \ count 420.000000 mean 59.576190 std 32.180449 min 0.000000 25% 33.000000 50% 63.500000 75% 90.000000 max 99.000000 NWEA Reading Growth Percentile Grade 6 \ count 411.000000 mean 58.199513 std 30.868709 min 1.000000 25% 31.000000 50% 60.000000 75% 86.000000 max 99.000000 NWEA Reading Growth Percentile Grade 7 \ count 400.000000 mean 60.972500 std 31.587437 min 0.000000 25% 36.000000 50% 67.000000 75% 92.000000 max 99.000000 NWEA Reading Growth Percentile Grade 8 math_grw_perc_all \ count 396.000000 443.000000 mean 56.659091 70.934537 std 32.107532 33.754876 min 0.000000 0.000000 25% 27.750000 46.500000 50% 59.000000 88.000000 75% 87.000000 99.000000 max 99.000000 99.000000 NWEA Math Growth Percentile Grade 3 \ count 425.000000 mean 65.703529 std 33.301674 min 0.000000 25% 43.000000 50% 78.000000 75% 96.000000 max 99.000000 NWEA Math Growth Percentile Grade 4 \ count 422.000000 mean 67.187204 std 32.211969 min 0.000000 25% 44.000000 50% 79.000000 75% 96.000000 max 99.000000 NWEA Math Growth Percentile Grade 5 \ count 423.000000 mean 62.264775 std 33.903485 min 0.000000 25% 30.500000 50% 73.000000 75% 95.000000 max 99.000000 NWEA Math Growth Percentile Grade 6 \ count 412.000000 mean 61.194175 std 32.043935 min 0.000000 25% 35.000000 50% 65.000000 75% 93.000000 max 99.000000 NWEA Math Growth Percentile Grade 7 \ count 399.000000 mean 63.779449 std 33.353090 min 0.000000 25% 38.000000 50% 73.000000 75% 95.000000 max 99.000000 NWEA Math Growth Percentile Grade 8 rea_att_perc_all \ count 396.000000 447.000000 mean 66.492424 36.874720 std 34.030998 31.120948 min 0.000000 0.000000 25% 36.000000 8.000000 50% 82.000000 30.000000 75% 97.000000 61.000000 max 99.000000 99.000000 NWEA Reading Attainment Percentile Grade 2 \ count 417.000000 mean 40.681055 std 35.366304 min 0.000000 25% 6.000000 50% 30.000000 75% 74.000000 max 99.000000 NWEA Reading Attainment Percentile Grade 3 count 426.000000 ... mean 39.521127 ... std 33.197318 ... min 0.000000 ... 25% 8.000000 ... 50% 31.000000 ... 75% 68.750000 ... max 99.000000 ... [8 rows x 42 columns]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx] C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future score = values[idx]
#rename features so that they're python-friendly
data.rename(columns={'School ID':'school_id'}, inplace=True)
data.rename(columns={'Name of School':'school_name'}, inplace=True)
data.rename(columns={'Street Address':'school_add'}, inplace=True)
data.rename(columns={'City':'school_city'}, inplace=True)
data.rename(columns={'State':'school_state'}, inplace=True)
data.rename(columns={'ZIP Code':'school_zip'}, inplace=True)
data.rename(columns={'Student Response Rate':'student_resp_rate'}, inplace=True)
data.rename(columns={'Teacher Response Rate':'teacher_resp_rate'}, inplace=True)
data.rename(columns={'Probation Length':'prob_length'}, inplace=True)
data.rename(columns={'Involved Family':'inv_family'}, inplace=True)
data.rename(columns={'Supportive Environment':'supp_env'}, inplace=True)
data.rename(columns={'Ambitious Instruction':'amb_inst'}, inplace=True)
data.rename(columns={'Effective Leaders':'eff_lead'}, inplace=True)
data.rename(columns={'Collaborative Teachers':'coll_tea'}, inplace=True)
data.rename(columns={'NWEA Reading Growth Percentile All Grades':'rea_grw_perc_all'}, inplace=True)
data.rename(columns={'NWEA Reading Attainment Percentile All Grades':'rea_att_perc_all'}, inplace=True)
data.rename(columns={'CPS Performance Policy Level':'cps_perf'}, inplace=True)
data.rename(columns={'School Communit':'sch_comm'}, inplace=True)
data.rename(columns={'Parent-Teacher Partnership':'par_tea_partn'}, inplace=True)
data.rename(columns={'Quality of Facilities':'fac_qual'}, inplace=True)
data.rename(columns={'Healthy Schools Certification':'health_cert'}, inplace=True)
data.rename(columns={'Creative Schools Certification':'creative_cert'}, inplace=True)
data.rename(columns={'NWEA Math Growth Percentile All Grades':'math_grw_perc_all'}, inplace=True)
data.rename(columns={'Suspensions Per 100 students 2013':'suspensions'}, inplace=True)
data.rename(columns={'Student Attendance Percentage 2013':'stud_att'}, inplace=True)
data.rename(columns={'Teacher Attendance Percentage 2013':'teach_att'}, inplace=True)
data.rename(columns={'One-Year Drop Out Rate Percentage 2013':'drop_out'}, inplace=True)
#Safe
data.head(5)
school_name | school_add | school_city | school_state | school_zip | Phone Number | Website | Blue Ribbon Award | cps_perf | CPS Performance Policy Status | prob_length | My Voice, My School Overall Rating | student_resp_rate | teacher_resp_rate | inv_family | supp_env | amb_inst | eff_inst | coll_tea | Safe | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
School ID | |||||||||||||||||||||
400009 | Academy for Global Citizenship Elementary School | 4647 W 47th St | Chicago | IL | 60632 | (773) 582-1100 | http://www.agcchicago.org/ | NaN | LEVEL 1 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 0% | 47.6% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400011 | Alain Locke Charter Elementary Academy | 3141 W Jackson Blvd | Chicago | IL | 60612 | (773) 265-7232 | http://www.alainlocke.org | NaN | LEVEL 1 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 0.9% | 0% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400017 | ASPIRA Charter - Haugan Campus | 3729 W Leland Ave | Chicago | IL | 60625 | (773) 252-0970 | http://www.aspirail.org | NaN | LEVEL 2 | NOT APPLICABLE | NaN | ORGANIZED | 93.2% | 86.8% | NEUTRAL | STRONG | STRONG | NEUTRAL | NEUTRAL | NEUTRAL | ... |
400019 | Bronzeville Lighthouse Charter Elementary School | 8 W Root St | Chicago | IL | 60609 | (773) 535-1460 | http://www.lighthouse-academies.org/BZLCS.htm | NaN | LEVEL 3 | NOT APPLICABLE | NaN | NOT ENOUGH DATA | 2.4% | 0% | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | NOT ENOUGH DATA | ... |
400020 | Catalyst Charter ES - Howland | 1616 S Spaulding Ave | Chicago | IL | 60623 | (773) 527-7330 | http://www.catalystschools.org | NaN | LEVEL 3 | NOT APPLICABLE | NaN | PARTIALLY ORGANIZED | 65.2% | 84% | VERY WEAK | STRONG | NEUTRAL | VERY WEAK | VERY WEAK | NEUTRAL | ... |
5 rows × 73 columns
# Return the original value if a string, otherwise return 'Unknown'
def cleanPerf(perf):
if type(perf) is str:
return perf
else:
return 'Unknown'
#create ordinal variable for size
def catPerf(perf):
if 'LEVEL 1' in perf:
return 1
elif 'LEVEL 2' in perf:
return 2
elif 'LEVEL 3' in perf:
return 3
else:
return 0
#apply perf function for categorical feature
data['cps_clean_cat']=data['cps_perf'].apply(cleanPerf)
data['cps_perf_cat'] = data['cps_clean_cat'].apply(catPerf)
print data['cps_perf_cat'].unique()
[1 2 3 0]
data['inv_fam_fixed']=data['inv_family']
data['supp_env_fixed']=data['supp_env']
data['amb_inst_fixed']=data['amb_inst']
data['eff_lead_fixed']=data['eff_lead']
data['coll_tea_fixed']=data['coll_tea']
data['safe_fixed']=data['Safe']
#recode not enough data inv_family supp_env amb_inst eff_inst coll_tea Safe
data.inv_fam_fixed[data.inv_fam_fixed=='NOT ENOUGH DATA'] = 'other'
data.supp_env_fixed[data.supp_env_fixed=='NOT ENOUGH DATA'] = 'other'
data.amb_inst_fixed[data.amb_inst_fixed=='NOT ENOUGH DATA'] = 'other'
data.eff_lead_fixed[data.eff_lead_fixed=='NOT ENOUGH DATA'] = 'other'
data.coll_tea_fixed[data.coll_tea_fixed=='NOT ENOUGH DATA'] = 'other'
data.safe_fixed[data.safe_fixed=='NOT ENOUGH DATA'] = 'other'
#groupby categorical data
print data.groupby('supp_env_fixed').count()
school_name school_add school_city school_state \ supp_env_fixed NEUTRAL 188 188 188 188 STRONG 121 121 121 121 VERY STRONG 47 47 47 47 VERY WEAK 6 6 6 6 WEAK 67 67 67 67 other 54 54 54 54 school_zip Phone Number Website Blue Ribbon Award \ supp_env_fixed NEUTRAL 188 188 188 2 STRONG 121 121 121 1 VERY STRONG 47 47 47 0 VERY WEAK 6 6 6 0 WEAK 67 67 67 0 other 54 49 54 0 cps_perf CPS Performance Policy Status prob_length \ supp_env_fixed NEUTRAL 186 186 70 STRONG 121 121 32 VERY STRONG 47 47 9 VERY WEAK 5 5 4 WEAK 67 67 38 other 49 49 5 My Voice, My School Overall Rating student_resp_rate \ supp_env_fixed NEUTRAL 188 188 STRONG 121 121 VERY STRONG 47 47 VERY WEAK 6 6 WEAK 67 67 other 54 49 teacher_resp_rate inv_family supp_env amb_inst eff_inst \ supp_env_fixed NEUTRAL 188 188 188 188 188 STRONG 121 121 121 121 121 VERY STRONG 47 47 47 47 47 VERY WEAK 6 6 6 6 6 WEAK 67 67 67 67 67 other 49 54 54 54 54 coll_tea Safe supp_env_fixed NEUTRAL 188 188 ... STRONG 121 121 ... VERY STRONG 47 47 ... VERY WEAK 6 6 ... WEAK 67 67 ... other 54 54 ... [6 rows x 86 columns]
#groupby categorical data inv_fam_fixed
#create numerical variable for family involvement
def cat(var_name):
if 'VERY STRONG' in var_name:
return 1
elif 'STRONG' in var_name:
return 2
elif 'NEUTRAL' in var_name:
return 3
elif 'WEAK' in var_name:
return 4
elif 'VERY WEAK' in var_name:
return 5
else:
return 0
data['inv_fam_cat'] = data['inv_family'].apply(cat)
data['supp_env_cat'] = data['supp_env'].apply(cat)
data['amb_inst_cat'] = data['amb_inst'].apply(cat)
data['eff_lead_cat'] = data['eff_lead'].apply(cat)
data['coll_tea_cat'] = data['coll_tea'].apply(cat)
data['safe_cat'] = data['Safe'].apply(cat)
#create dummy variables for family involvement
dummy_inv_fam = pd.get_dummies(data['inv_fam_cat'], prefix='fam')
dummy_supp_env = pd.get_dummies(data['supp_env_cat'], prefix='env')
dummy_amb_inst = pd.get_dummies(data['amb_inst_cat'], prefix='amb_inst')
dummy_eff_lead = pd.get_dummies(data['eff_lead_cat'], prefix='eff_lead')
dummy_coll_tea = pd.get_dummies(data['coll_tea_cat'], prefix='coll_tea')
dummy_inv_fam.columns = ['fam_1', 'fam_2','fam_3','fam_4','fam_5']
dummy_supp_env.columns = ['env_1', 'env_2','env_3','env_4','env_5']
dummy_amb_inst.columns = ['amb_inst_1', 'amb_inst_2','amb_inst_3','amb_inst_4','amb_inst_5']
dummy_eff_lead.columns = ['eff_lead_1', 'eff_lead_2','eff_lead_3','eff_lead_4','eff_lead_5']
dummy_coll_tea.columns = ['coll_tea_1', 'coll_tea_2','coll_tea_3','coll_tea_4','coll_tea_5']
data['fam_1'] = dummy_inv_fam['fam_1']
data['fam_2'] = dummy_inv_fam['fam_2']
data['fam_3'] = dummy_inv_fam['fam_3']
data['fam_4'] = dummy_inv_fam['fam_4']
data['fam_5'] = dummy_inv_fam['fam_5']
data['env_1'] = dummy_supp_env['env_1']
data['env_2'] = dummy_supp_env['env_2']
data['env_3'] = dummy_supp_env['env_3']
data['env_4'] = dummy_supp_env['env_4']
data['env_5'] = dummy_supp_env['env_5']
data['amb_inst_1'] = dummy_amb_inst['amb_inst_1']
data['amb_inst_2'] = dummy_amb_inst['amb_inst_2']
data['amb_inst_3'] = dummy_amb_inst['amb_inst_3']
data['amb_inst_4'] = dummy_amb_inst['amb_inst_4']
data['amb_inst_5'] = dummy_amb_inst['amb_inst_5']
data['eff_lead_1'] = dummy_eff_lead['eff_lead_1']
data['eff_lead_2'] = dummy_eff_lead['eff_lead_2']
data['eff_lead_3'] = dummy_eff_lead['eff_lead_3']
data['eff_lead_4'] = dummy_eff_lead['eff_lead_4']
data['eff_lead_5'] = dummy_eff_lead['eff_lead_5']
data['coll_tea_1'] = dummy_coll_tea['coll_tea_1']
data['coll_tea_2'] = dummy_coll_tea['coll_tea_2']
data['coll_tea_3'] = dummy_coll_tea['coll_tea_3']
data['coll_tea_4'] = dummy_coll_tea['coll_tea_4']
data['coll_tea_5'] = dummy_coll_tea['coll_tea_5']
print data['teacher_resp_rate_st'].unique()
['47.6%' '0%' '86.8%' '84%' '91.3%' '84.6%' '70.8%' '93.8%' '89.7%' '87.5%' '100%' '96.6%' '97.1%' '83.3%' '61.1%' '78.6%' '76.9%' '91.4%' '96%' '74.3%' '81.8%' '52%' '76%' '60.5%' '55%' '93.6%' '53.3%' '94.7%' '80.7%' '82.4%' '77.8%' '74.2%' '82.6%' '52.8%' '90.6%' '70.3%' '56.3%' '82.8%' '80%' '93.1%' '70%' '84.4%' nan '91.8%' '85.3%' '76.5%' '92%' '79.3%' '88.2%' '77.4%' '95.2%' '90.7%' '88.6%' '66.7%' '94.6%' '93.9%' '94.1%' '88.5%' '85%' '61.5%' '82.2%' '89.8%' '91.1%' '73%' '90%' '78.4%' '88.9%' '90.5%' '92.2%' '86.3%' '73.3%' '69.6%' '95.5%' '81.5%' '91.2%' '57.9%' '97.7%' '84.1%' '68.9%' '68.2%' '70.6%' '87%' '95%' '35.3%' '93.3%' '72.4%' '71.9%' '96.9%' '86.7%' '78.1%' '79%' '85.7%' '94.9%' '57.1%' '88%' '80.8%' '81.3%' '86.4%' '72.2%' '97%' '50%' '86.5%' '60%' '69%' '95.7%' '59.2%' '97.6%' '45%' '69.2%' '68%' '56.8%' '69.7%' '65.2%' '72.1%' '65.7%' '97.8%' '35%' '97.9%' '80.6%' '76.3%' '90.9%' '74.1%' '75%' '89.5%' '75.5%' '97.2%' '73.9%' '86%' '87.1%' '92.5%' '84.7%' '72.7%' '89.4%' '87.9%' '96.3%' '92.9%' '90.3%' '47.4%' '82.9%' '68.1%' '58.6%' '64%' '81.4%' '75.4%' '79.2%' '92.3%' '69.4%' '54.3%' '68.8%' '82.1%' '74.5%' '96.4%' '91.7%' '63%' '53.9%' '80.4%' '77.5%' '71%' '63.2%' '98.6%' '81%' '70.2%' '59.3%' '58.1%' '86.1%' '78.3%' '71.1%' '64.3%' '84.2%' '77.2%' '68.4%' '79.1%' '98%' '13.9%' '78.7%' '67.8%' '44.2%' '94.4%' '95.8%' '45.3%' '27%' '84.9%' '64.7%' '58.7%' '84.3%' '63.6%' '76.2%' '71.4%' '34.8%' '84.8%' '42.9%' '82.7%' '67.9%' '57%' '86.2%' '73.7%' '46.2%' '92.6%']
#data['teacher_resp_rate_st']=data['teacher_resp_rate'].strip('%');
#data.teacher_resp_rate_st[data.teacher_resp_rate_st=='%'] = ''
def rem_per(x):
return (x.strip("%"))#doesn't work
data['teacher_resp_rate_st'] = data['teacher_resp_rate']
#data['teacher_resp_rate_st'] = data['teacher_resp_rate_st'].apply(rem_per)
#histogram
data.rea_grw_perc_all.hist()
<matplotlib.axes.AxesSubplot at 0x17488be0>
#scatterplot
# relatively strong relationship with performance: supp_env_cat, collaborative teacher rea_grw_perc_all math_grw_perc_all
plt.scatter(data.cps_perf_cat, data.coll_tea_cat, lw=10, alpha=.1, color='b')
plt.xlabel("school performance")
plt.ylabel("collaborative teachers")
<matplotlib.text.Text at 0x18343400>
#print data['teacher_resp_rate'].unique
data[['teacher_resp_rate']].describe()
teacher_resp_rate | |
---|---|
count | 478 |
unique | 197 |
top | 100% |
freq | 58 |
4 rows × 1 columns
from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split
%pylab inline
# define X; supp_env_cat amb_inst_cat eff_lead_cat coll_tea_cat safe_cat
features=['amb_inst_cat','eff_lead_cat','coll_tea_cat','safe_cat']
X=data[features]
# Define Y/target
Y = (data.cps_perf_cat.values)==1
# Slice off a test set
# Use train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25)
#create classifier
clf=linear_model.LogisticRegression().fit(x_train,y_train)
print clf.score(x_train,y_train)
print clf.score(x_test,y_test)
0.709944751381 0.719008264463
from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
y_pred=clf.predict(X)
if show_accuracy:
print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n"
if show_classification_report:
print "Classification report"
print metrics.classification_report(y,y_pred),"\n"
if show_confusion_matrix:
print "Confusion matrix"
print metrics.confusion_matrix(y,y_pred),"\n"
print measure_performance(x_train, y_train, clf)
Accuracy:0.710 Classification report precision recall f1-score support False 0.74 0.88 0.80 243 True 0.60 0.35 0.44 119 avg / total 0.69 0.71 0.69 362 Confusion matrix [[215 28] [ 77 42]] None
print clf.coef_
[[ 0.76397835 0.06849151 -0.28981508 -0.98393147]]