In [1]:

%matplotlib inline
# from pylab import *

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.naive_bayes import MultinomialNB

In [23]:

data = pd.read_csv('Chicago_Public_Schools_-_Elementary_School_Progress_Report__2013-2014_.csv',index_col='School ID')

In [5]:

# import progress report data, titles are included in the file
#data = pd.read_csv('Chicago_Public_Schools_-_Elementary_School_Progress_Report__2013-2014_.csv').dropna()

In [6]:

data.head(5)  # print the first 5 rows

Out[6]:

	Name of School	Street Address	City	State	ZIP Code	Phone Number	Website	Blue Ribbon Award	CPS Performance Policy Level	CPS Performance Policy Status	Probation Length	My Voice, My School Overall Rating	Student Response Rate	Teacher Response Rate	Involved Family	Supportive Environment	Ambitious Instruction	Effective Leaders	Collaborative Teachers	Safe
School ID
400009	Academy for Global Citizenship Elementary School	4647 W 47th St	Chicago	IL	60632	(773) 582-1100	http://www.agcchicago.org/	NaN	LEVEL 1	NOT APPLICABLE	NaN	NOT ENOUGH DATA	0%	47.6%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400011	Alain Locke Charter Elementary Academy	3141 W Jackson Blvd	Chicago	IL	60612	(773) 265-7232	http://www.alainlocke.org	NaN	LEVEL 1	NOT APPLICABLE	NaN	NOT ENOUGH DATA	0.9%	0%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400017	ASPIRA Charter - Haugan Campus	3729 W Leland Ave	Chicago	IL	60625	(773) 252-0970	http://www.aspirail.org	NaN	LEVEL 2	NOT APPLICABLE	NaN	ORGANIZED	93.2%	86.8%	NEUTRAL	STRONG	STRONG	NEUTRAL	NEUTRAL	NEUTRAL	...
400019	Bronzeville Lighthouse Charter Elementary School	8 W Root St	Chicago	IL	60609	(773) 535-1460	http://www.lighthouse-academies.org/BZLCS.htm	NaN	LEVEL 3	NOT APPLICABLE	NaN	NOT ENOUGH DATA	2.4%	0%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400020	Catalyst Charter ES - Howland	1616 S Spaulding Ave	Chicago	IL	60623	(773) 527-7330	http://www.catalystschools.org	NaN	LEVEL 3	NOT APPLICABLE	NaN	PARTIALLY ORGANIZED	65.2%	84%	VERY WEAK	STRONG	NEUTRAL	VERY WEAK	VERY WEAK	NEUTRAL	...

5 rows × 73 columns

In [13]:

print data.describe()

         school_zip  Blue Ribbon Award  prob_length  rea_grw_perc_all  \
count    483.000000           3.000000   158.000000        442.000000   
mean   60630.784679        2012.333333     5.303797         60.237557   
std       23.354458           1.154701     3.347608         36.393480   
min    60605.000000        2011.000000     1.000000          0.000000   
25%    60618.000000        2012.000000     2.000000         24.000000   
50%    60628.000000        2013.000000     6.000000         72.500000   
75%    60640.500000        2013.000000     7.000000         95.000000   
max    60827.000000        2013.000000    18.000000         99.000000   

       NWEA Reading Growth Percentile Grade 3  \
count                              425.000000   
mean                                53.555294   
std                                 32.516213   
min                                  0.000000   
25%                                 24.000000   
50%                                 55.000000   
75%                                 84.000000   
max                                 99.000000   

       NWEA Reading Growth Percentile Grade 4  \
count                              420.000000   
mean                                55.657143   
std                                 33.530331   
min                                  0.000000   
25%                                 27.000000   
50%                                 57.000000   
75%                                 89.250000   
max                                 99.000000   

       NWEA Reading Growth Percentile Grade 5  \
count                              420.000000   
mean                                59.576190   
std                                 32.180449   
min                                  0.000000   
25%                                 33.000000   
50%                                 63.500000   
75%                                 90.000000   
max                                 99.000000   

       NWEA Reading Growth Percentile Grade 6  \
count                              411.000000   
mean                                58.199513   
std                                 30.868709   
min                                  1.000000   
25%                                 31.000000   
50%                                 60.000000   
75%                                 86.000000   
max                                 99.000000   

       NWEA Reading Growth Percentile Grade 7  \
count                              400.000000   
mean                                60.972500   
std                                 31.587437   
min                                  0.000000   
25%                                 36.000000   
50%                                 67.000000   
75%                                 92.000000   
max                                 99.000000   

       NWEA Reading Growth Percentile Grade 8  math_grw_perc_all  \
count                              396.000000         443.000000   
mean                                56.659091          70.934537   
std                                 32.107532          33.754876   
min                                  0.000000           0.000000   
25%                                 27.750000          46.500000   
50%                                 59.000000          88.000000   
75%                                 87.000000          99.000000   
max                                 99.000000          99.000000   

       NWEA Math Growth Percentile Grade 3  \
count                           425.000000   
mean                             65.703529   
std                              33.301674   
min                               0.000000   
25%                              43.000000   
50%                              78.000000   
75%                              96.000000   
max                              99.000000   

       NWEA Math Growth Percentile Grade 4  \
count                           422.000000   
mean                             67.187204   
std                              32.211969   
min                               0.000000   
25%                              44.000000   
50%                              79.000000   
75%                              96.000000   
max                              99.000000   

       NWEA Math Growth Percentile Grade 5  \
count                           423.000000   
mean                             62.264775   
std                              33.903485   
min                               0.000000   
25%                              30.500000   
50%                              73.000000   
75%                              95.000000   
max                              99.000000   

       NWEA Math Growth Percentile Grade 6  \
count                           412.000000   
mean                             61.194175   
std                              32.043935   
min                               0.000000   
25%                              35.000000   
50%                              65.000000   
75%                              93.000000   
max                              99.000000   

       NWEA Math Growth Percentile Grade 7  \
count                           399.000000   
mean                             63.779449   
std                              33.353090   
min                               0.000000   
25%                              38.000000   
50%                              73.000000   
75%                              95.000000   
max                              99.000000   

       NWEA Math Growth Percentile Grade 8  rea_att_perc_all  \
count                           396.000000        447.000000   
mean                             66.492424         36.874720   
std                              34.030998         31.120948   
min                               0.000000          0.000000   
25%                              36.000000          8.000000   
50%                              82.000000         30.000000   
75%                              97.000000         61.000000   
max                              99.000000         99.000000   

       NWEA Reading Attainment Percentile Grade 2  \
count                                  417.000000   
mean                                    40.681055   
std                                     35.366304   
min                                      0.000000   
25%                                      6.000000   
50%                                     30.000000   
75%                                     74.000000   
max                                     99.000000   

       NWEA Reading Attainment Percentile Grade 3      
count                                  426.000000 ...  
mean                                    39.521127 ...  
std                                     33.197318 ...  
min                                      0.000000 ...  
25%                                      8.000000 ...  
50%                                     31.000000 ...  
75%                                     68.750000 ...  
max                                     99.000000 ...  

[8 rows x 42 columns]

C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]
C:\Users\ADukes\Anaconda\lib\site-packages\pandas\compat\scipy.py:68: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  score = values[idx]

In [24]:

#rename features so that they're python-friendly
data.rename(columns={'School ID':'school_id'}, inplace=True)	
data.rename(columns={'Name of School':'school_name'}, inplace=True)	
data.rename(columns={'Street Address':'school_add'}, inplace=True)	
data.rename(columns={'City':'school_city'}, inplace=True)	
data.rename(columns={'State':'school_state'}, inplace=True)	
data.rename(columns={'ZIP Code':'school_zip'}, inplace=True)	
data.rename(columns={'Student Response Rate':'student_resp_rate'}, inplace=True)	
data.rename(columns={'Teacher Response Rate':'teacher_resp_rate'}, inplace=True)	
data.rename(columns={'Probation Length':'prob_length'}, inplace=True)	
data.rename(columns={'Involved Family':'inv_family'}, inplace=True)	
data.rename(columns={'Supportive Environment':'supp_env'}, inplace=True)	
data.rename(columns={'Ambitious Instruction':'amb_inst'}, inplace=True)	
data.rename(columns={'Effective Leaders':'eff_lead'}, inplace=True)
data.rename(columns={'Collaborative Teachers':'coll_tea'}, inplace=True)	
data.rename(columns={'NWEA Reading Growth Percentile All Grades':'rea_grw_perc_all'}, inplace=True)	
data.rename(columns={'NWEA Reading Attainment Percentile All Grades':'rea_att_perc_all'}, inplace=True)	
data.rename(columns={'CPS Performance Policy Level':'cps_perf'}, inplace=True)	
data.rename(columns={'School Communit':'sch_comm'}, inplace=True)	
data.rename(columns={'Parent-Teacher Partnership':'par_tea_partn'}, inplace=True)	
data.rename(columns={'Quality of Facilities':'fac_qual'}, inplace=True)	
data.rename(columns={'Healthy Schools Certification':'health_cert'}, inplace=True)	
data.rename(columns={'Creative Schools Certification':'creative_cert'}, inplace=True)	
data.rename(columns={'NWEA Math Growth Percentile All Grades':'math_grw_perc_all'}, inplace=True)	
data.rename(columns={'Suspensions Per 100 students 2013':'suspensions'}, inplace=True)	
data.rename(columns={'Student Attendance Percentage 2013':'stud_att'}, inplace=True)	
data.rename(columns={'Teacher Attendance Percentage 2013':'teach_att'}, inplace=True)
data.rename(columns={'One-Year Drop Out Rate Percentage 2013':'drop_out'}, inplace=True)

#Safe

In [16]:

data.head(5)

Out[16]:

	school_name	school_add	school_city	school_state	school_zip	Phone Number	Website	Blue Ribbon Award	cps_perf	CPS Performance Policy Status	prob_length	My Voice, My School Overall Rating	student_resp_rate	teacher_resp_rate	inv_family	supp_env	amb_inst	eff_inst	coll_tea	Safe
School ID
400009	Academy for Global Citizenship Elementary School	4647 W 47th St	Chicago	IL	60632	(773) 582-1100	http://www.agcchicago.org/	NaN	LEVEL 1	NOT APPLICABLE	NaN	NOT ENOUGH DATA	0%	47.6%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400011	Alain Locke Charter Elementary Academy	3141 W Jackson Blvd	Chicago	IL	60612	(773) 265-7232	http://www.alainlocke.org	NaN	LEVEL 1	NOT APPLICABLE	NaN	NOT ENOUGH DATA	0.9%	0%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400017	ASPIRA Charter - Haugan Campus	3729 W Leland Ave	Chicago	IL	60625	(773) 252-0970	http://www.aspirail.org	NaN	LEVEL 2	NOT APPLICABLE	NaN	ORGANIZED	93.2%	86.8%	NEUTRAL	STRONG	STRONG	NEUTRAL	NEUTRAL	NEUTRAL	...
400019	Bronzeville Lighthouse Charter Elementary School	8 W Root St	Chicago	IL	60609	(773) 535-1460	http://www.lighthouse-academies.org/BZLCS.htm	NaN	LEVEL 3	NOT APPLICABLE	NaN	NOT ENOUGH DATA	2.4%	0%	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	NOT ENOUGH DATA	...
400020	Catalyst Charter ES - Howland	1616 S Spaulding Ave	Chicago	IL	60623	(773) 527-7330	http://www.catalystschools.org	NaN	LEVEL 3	NOT APPLICABLE	NaN	PARTIALLY ORGANIZED	65.2%	84%	VERY WEAK	STRONG	NEUTRAL	VERY WEAK	VERY WEAK	NEUTRAL	...

5 rows × 73 columns

In [25]:

# Return the original value if a string, otherwise return 'Unknown'
def cleanPerf(perf):
    if type(perf) is str:
        return perf
    else:
        return 'Unknown'

In [26]:

#create ordinal variable for size
def catPerf(perf):
    if 'LEVEL 1' in perf:
        return 1
    elif 'LEVEL 2' in perf:
        return 2
    elif 'LEVEL 3' in perf:
        return 3
    else:
         return 0

In [27]:

#apply perf function for categorical feature
data['cps_clean_cat']=data['cps_perf'].apply(cleanPerf)

In [28]:

data['cps_perf_cat'] = data['cps_clean_cat'].apply(catPerf)

In [35]:

print data['cps_perf_cat'].unique()

[1 2 3 0]

In [55]:

data['inv_fam_fixed']=data['inv_family']
data['supp_env_fixed']=data['supp_env']
data['amb_inst_fixed']=data['amb_inst']
data['eff_lead_fixed']=data['eff_lead']
data['coll_tea_fixed']=data['coll_tea']
data['safe_fixed']=data['Safe']

In [56]:

#recode not enough data inv_family	supp_env	amb_inst	eff_inst	coll_tea	Safe
data.inv_fam_fixed[data.inv_fam_fixed=='NOT ENOUGH DATA'] = 'other'
data.supp_env_fixed[data.supp_env_fixed=='NOT ENOUGH DATA'] = 'other'
data.amb_inst_fixed[data.amb_inst_fixed=='NOT ENOUGH DATA'] = 'other'
data.eff_lead_fixed[data.eff_lead_fixed=='NOT ENOUGH DATA'] = 'other'
data.coll_tea_fixed[data.coll_tea_fixed=='NOT ENOUGH DATA'] = 'other'
data.safe_fixed[data.safe_fixed=='NOT ENOUGH DATA'] = 'other'

In [54]:

#groupby categorical data
print data.groupby('supp_env_fixed').count()

                school_name  school_add  school_city  school_state  \
supp_env_fixed                                                       
NEUTRAL                 188         188          188           188   
STRONG                  121         121          121           121   
VERY STRONG              47          47           47            47   
VERY WEAK                 6           6            6             6   
WEAK                     67          67           67            67   
other                    54          54           54            54   

                school_zip  Phone Number  Website  Blue Ribbon Award  \
supp_env_fixed                                                         
NEUTRAL                188           188      188                  2   
STRONG                 121           121      121                  1   
VERY STRONG             47            47       47                  0   
VERY WEAK                6             6        6                  0   
WEAK                    67            67       67                  0   
other                   54            49       54                  0   

                cps_perf  CPS Performance Policy Status  prob_length  \
supp_env_fixed                                                         
NEUTRAL              186                            186           70   
STRONG               121                            121           32   
VERY STRONG           47                             47            9   
VERY WEAK              5                              5            4   
WEAK                  67                             67           38   
other                 49                             49            5   

                My Voice, My School Overall Rating  student_resp_rate  \
supp_env_fixed                                                          
NEUTRAL                                        188                188   
STRONG                                         121                121   
VERY STRONG                                     47                 47   
VERY WEAK                                        6                  6   
WEAK                                            67                 67   
other                                           54                 49   

                teacher_resp_rate  inv_family  supp_env  amb_inst  eff_inst  \
supp_env_fixed                                                                
NEUTRAL                       188         188       188       188       188   
STRONG                        121         121       121       121       121   
VERY STRONG                    47          47        47        47        47   
VERY WEAK                       6           6         6         6         6   
WEAK                           67          67        67        67        67   
other                          49          54        54        54        54   

                coll_tea  Safe      
supp_env_fixed                      
NEUTRAL              188   188 ...  
STRONG               121   121 ...  
VERY STRONG           47    47 ...  
VERY WEAK              6     6 ...  
WEAK                  67    67 ...  
other                 54    54 ...  

[6 rows x 86 columns]

In [ ]:

#groupby categorical data inv_fam_fixed

In [32]:

#create numerical variable for family involvement
def cat(var_name):
    if 'VERY STRONG' in var_name:
        return 1
    elif 'STRONG' in var_name:
        return 2
    elif 'NEUTRAL' in var_name:
        return 3
    elif 'WEAK' in var_name:
        return 4
    elif 'VERY WEAK' in var_name:
        return 5
    else:
         return 0

In [57]:

data['inv_fam_cat'] = data['inv_family'].apply(cat)
data['supp_env_cat'] = data['supp_env'].apply(cat)
data['amb_inst_cat'] = data['amb_inst'].apply(cat)
data['eff_lead_cat'] = data['eff_lead'].apply(cat)
data['coll_tea_cat'] = data['coll_tea'].apply(cat)
data['safe_cat'] = data['Safe'].apply(cat)

In [35]:

#create dummy variables for family involvement 
dummy_inv_fam = pd.get_dummies(data['inv_fam_cat'], prefix='fam')
dummy_supp_env = pd.get_dummies(data['supp_env_cat'], prefix='env')
dummy_amb_inst = pd.get_dummies(data['amb_inst_cat'], prefix='amb_inst')
dummy_eff_lead = pd.get_dummies(data['eff_lead_cat'], prefix='eff_lead')
dummy_coll_tea = pd.get_dummies(data['coll_tea_cat'], prefix='coll_tea')

In [37]:

dummy_inv_fam.columns = ['fam_1', 'fam_2','fam_3','fam_4','fam_5']
dummy_supp_env.columns = ['env_1', 'env_2','env_3','env_4','env_5']
dummy_amb_inst.columns = ['amb_inst_1', 'amb_inst_2','amb_inst_3','amb_inst_4','amb_inst_5']
dummy_eff_lead.columns = ['eff_lead_1', 'eff_lead_2','eff_lead_3','eff_lead_4','eff_lead_5']
dummy_coll_tea.columns = ['coll_tea_1', 'coll_tea_2','coll_tea_3','coll_tea_4','coll_tea_5']

In [38]:

data['fam_1'] = dummy_inv_fam['fam_1']
data['fam_2'] = dummy_inv_fam['fam_2']
data['fam_3'] = dummy_inv_fam['fam_3']
data['fam_4'] = dummy_inv_fam['fam_4']
data['fam_5'] = dummy_inv_fam['fam_5']

In [39]:

data['env_1'] = dummy_supp_env['env_1']
data['env_2'] = dummy_supp_env['env_2']
data['env_3'] = dummy_supp_env['env_3']
data['env_4'] = dummy_supp_env['env_4']
data['env_5'] = dummy_supp_env['env_5']

In [40]:

data['amb_inst_1'] = dummy_amb_inst['amb_inst_1']
data['amb_inst_2'] = dummy_amb_inst['amb_inst_2']
data['amb_inst_3'] = dummy_amb_inst['amb_inst_3']
data['amb_inst_4'] = dummy_amb_inst['amb_inst_4']
data['amb_inst_5'] = dummy_amb_inst['amb_inst_5']

In [41]:

data['eff_lead_1'] = dummy_eff_lead['eff_lead_1']
data['eff_lead_2'] = dummy_eff_lead['eff_lead_2']
data['eff_lead_3'] = dummy_eff_lead['eff_lead_3']
data['eff_lead_4'] = dummy_eff_lead['eff_lead_4']
data['eff_lead_5'] = dummy_eff_lead['eff_lead_5']

In [42]:

data['coll_tea_1'] = dummy_coll_tea['coll_tea_1']
data['coll_tea_2'] = dummy_coll_tea['coll_tea_2']
data['coll_tea_3'] = dummy_coll_tea['coll_tea_3']
data['coll_tea_4'] = dummy_coll_tea['coll_tea_4']
data['coll_tea_5'] = dummy_coll_tea['coll_tea_5']

In [105]:

print data['teacher_resp_rate_st'].unique()

['47.6%' '0%' '86.8%' '84%' '91.3%' '84.6%' '70.8%' '93.8%' '89.7%' '87.5%'
 '100%' '96.6%' '97.1%' '83.3%' '61.1%' '78.6%' '76.9%' '91.4%' '96%'
 '74.3%' '81.8%' '52%' '76%' '60.5%' '55%' '93.6%' '53.3%' '94.7%' '80.7%'
 '82.4%' '77.8%' '74.2%' '82.6%' '52.8%' '90.6%' '70.3%' '56.3%' '82.8%'
 '80%' '93.1%' '70%' '84.4%' nan '91.8%' '85.3%' '76.5%' '92%' '79.3%'
 '88.2%' '77.4%' '95.2%' '90.7%' '88.6%' '66.7%' '94.6%' '93.9%' '94.1%'
 '88.5%' '85%' '61.5%' '82.2%' '89.8%' '91.1%' '73%' '90%' '78.4%' '88.9%'
 '90.5%' '92.2%' '86.3%' '73.3%' '69.6%' '95.5%' '81.5%' '91.2%' '57.9%'
 '97.7%' '84.1%' '68.9%' '68.2%' '70.6%' '87%' '95%' '35.3%' '93.3%'
 '72.4%' '71.9%' '96.9%' '86.7%' '78.1%' '79%' '85.7%' '94.9%' '57.1%'
 '88%' '80.8%' '81.3%' '86.4%' '72.2%' '97%' '50%' '86.5%' '60%' '69%'
 '95.7%' '59.2%' '97.6%' '45%' '69.2%' '68%' '56.8%' '69.7%' '65.2%'
 '72.1%' '65.7%' '97.8%' '35%' '97.9%' '80.6%' '76.3%' '90.9%' '74.1%'
 '75%' '89.5%' '75.5%' '97.2%' '73.9%' '86%' '87.1%' '92.5%' '84.7%'
 '72.7%' '89.4%' '87.9%' '96.3%' '92.9%' '90.3%' '47.4%' '82.9%' '68.1%'
 '58.6%' '64%' '81.4%' '75.4%' '79.2%' '92.3%' '69.4%' '54.3%' '68.8%'
 '82.1%' '74.5%' '96.4%' '91.7%' '63%' '53.9%' '80.4%' '77.5%' '71%'
 '63.2%' '98.6%' '81%' '70.2%' '59.3%' '58.1%' '86.1%' '78.3%' '71.1%'
 '64.3%' '84.2%' '77.2%' '68.4%' '79.1%' '98%' '13.9%' '78.7%' '67.8%'
 '44.2%' '94.4%' '95.8%' '45.3%' '27%' '84.9%' '64.7%' '58.7%' '84.3%'
 '63.6%' '76.2%' '71.4%' '34.8%' '84.8%' '42.9%' '82.7%' '67.9%' '57%'
 '86.2%' '73.7%' '46.2%' '92.6%']

In [119]:

#data['teacher_resp_rate_st']=data['teacher_resp_rate'].strip('%');

#data.teacher_resp_rate_st[data.teacher_resp_rate_st=='%'] = ''

def rem_per(x):
    return (x.strip("%"))#doesn't work

In [122]:

data['teacher_resp_rate_st'] = data['teacher_resp_rate']

#data['teacher_resp_rate_st'] = data['teacher_resp_rate_st'].apply(rem_per)

In [162]:

#histogram
data.rea_grw_perc_all.hist()

Out[162]:

<matplotlib.axes.AxesSubplot at 0x17488be0>

In [173]:

#scatterplot
# relatively strong relationship with performance: supp_env_cat, collaborative teacher rea_grw_perc_all math_grw_perc_all

plt.scatter(data.cps_perf_cat, data.coll_tea_cat, lw=10, alpha=.1, color='b')
plt.xlabel("school performance")
plt.ylabel("collaborative teachers")

Out[173]:

<matplotlib.text.Text at 0x18343400>

In [126]:

#print data['teacher_resp_rate'].unique

data[['teacher_resp_rate']].describe()

Out[126]:

	teacher_resp_rate
count	478
unique	197
top	100%
freq	58

4 rows × 1 columns

In [63]:

from sklearn import linear_model, datasets
from sklearn.cross_validation import train_test_split

In [ ]:

%pylab inline

In [175]:

# define X; supp_env_cat amb_inst_cat eff_lead_cat coll_tea_cat safe_cat

features=['amb_inst_cat','eff_lead_cat','coll_tea_cat','safe_cat']

X=data[features]

In [176]:

# Define Y/target
Y = (data.cps_perf_cat.values)==1

In [177]:

# Slice off a test set
# Use train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25)

In [178]:

#create classifier 
clf=linear_model.LogisticRegression().fit(x_train,y_train)
print clf.score(x_train,y_train)

print clf.score(x_test,y_test)

0.709944751381
0.719008264463

In [179]:

from sklearn import metrics
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred=clf.predict(X)   
    if show_accuracy:
        print "Accuracy:{0:.3f}".format(metrics.accuracy_score(y,y_pred)),"\n"
 
    if show_classification_report:
        print "Classification report"
        print metrics.classification_report(y,y_pred),"\n"
        
    if show_confusion_matrix:
        print "Confusion matrix"
        print metrics.confusion_matrix(y,y_pred),"\n"
 
print measure_performance(x_train, y_train, clf)

Accuracy:0.710 

Classification report
             precision    recall  f1-score   support

      False       0.74      0.88      0.80       243
       True       0.60      0.35      0.44       119

avg / total       0.69      0.71      0.69       362


Confusion matrix
[[215  28]
 [ 77  42]] 

None

In [180]:

print clf.coef_

[[ 0.76397835  0.06849151 -0.28981508 -0.98393147]]

In [ ]: