#Kaggle旧金山犯罪类型分类问题,Kaggle地址为https://www.kaggle.com/c/sf-crime
#这是作为讲述朴素贝叶斯分类器的博客示例做的ipython notebook
#博文地址http://blog.csdn.net/han_xiaoyang/article/details/50629608,欢迎提建议
import pandas as pd
import numpy as np
# Load the Kaggle training and test CSV files with pandas, asking the
# reader to parse the 'Dates' column as datetimes.
date_columns = ['Dates']
train = pd.read_csv(
    '/Users/Hanxiaoyang/sf_crime_data/train.csv',
    parse_dates=date_columns,
)
test = pd.read_csv(
    '/Users/Hanxiaoyang/sf_crime_data/test.csv',
    parse_dates=date_columns,
)
# Bare expression: as the last statement of a notebook cell this renders
# the training DataFrame.
train
| | Dates | Category | Descript | DayOfWeek | PdDistrict | Resolution | Address | X | Y |
---|---|---|---|---|---|---|---|---|---|
0 | 2015-05-13 23:53:00 | WARRANTS | WARRANT ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
1 | 2015-05-13 23:53:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | OAK ST / LAGUNA ST | -122.425892 | 37.774599 |
2 | 2015-05-13 23:33:00 | OTHER OFFENSES | TRAFFIC VIOLATION ARREST | Wednesday | NORTHERN | ARREST, BOOKED | VANNESS AV / GREENWICH ST | -122.424363 | 37.800414 |
3 | 2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | 1500 Block of LOMBARD ST | -122.426995 | 37.800873 |
4 | 2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | PARK | NONE | 100 Block of BRODERICK ST | -122.438738 | 37.771541 |
5 | 2015-05-13 23:30:00 | LARCENY/THEFT | GRAND THEFT FROM UNLOCKED AUTO | Wednesday | INGLESIDE | NONE | 0 Block of TEDDY AV | -122.403252 | 37.713431 |
6 | 2015-05-13 23:30:00 | VEHICLE THEFT | STOLEN AUTOMOBILE | Wednesday | INGLESIDE | NONE | AVALON AV / PERU AV | -122.423327 | 37.725138 |
7 | 2015-05-13 23:30:00 | VEHICLE THEFT | STOLEN AUTOMOBILE | Wednesday | BAYVIEW | NONE | KIRKWOOD AV / DONAHUE ST | -122.371274 | 37.727564 |
8 | 2015-05-13 23:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | RICHMOND | NONE | 600 Block of 47TH AV | -122.508194 | 37.776601 |
9 | 2015-05-13 23:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | CENTRAL | NONE | JEFFERSON ST / LEAVENWORTH ST | -122.419088 | 37.807802 |
10 | 2015-05-13 22:58:00 | LARCENY/THEFT | PETTY THEFT FROM LOCKED AUTO | Wednesday | CENTRAL | NONE | JEFFERSON ST / LEAVENWORTH ST | -122.419088 | 37.807802 |
11 | 2015-05-13 22:30:00 | OTHER OFFENSES | MISCELLANEOUS INVESTIGATION | Wednesday | TARAVAL | NONE | 0 Block of ESCOLTA WY | -122.487983 | 37.737667 |
12 | 2015-05-13 22:30:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Wednesday | TENDERLOIN | NONE | TURK ST / JONES ST | -122.412414 | 37.783004 |
13 | 2015-05-13 22:06:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | FILLMORE ST / GEARY BL | -122.432915 | 37.784353 |
14 | 2015-05-13 22:00:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | BAYVIEW | NONE | 200 Block of WILLIAMS AV | -122.397744 | 37.729935 |
15 | 2015-05-13 22:00:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | BAYVIEW | NONE | 0 Block of MENDELL ST | -122.383692 | 37.743189 |
16 | 2015-05-13 22:00:00 | ROBBERY | ROBBERY, ARMED WITH A KNIFE | Wednesday | TENDERLOIN | NONE | EDDY ST / JONES ST | -122.412597 | 37.783932 |
17 | 2015-05-13 21:55:00 | ASSAULT | AGGRAVATED ASSAULT WITH BODILY FORCE | Wednesday | INGLESIDE | NONE | GODEUS ST / MISSION ST | -122.421682 | 37.742822 |
18 | 2015-05-13 21:40:00 | OTHER OFFENSES | TRAFFIC VIOLATION | Wednesday | BAYVIEW | ARREST, BOOKED | MENDELL ST / HUDSON AV | -122.386401 | 37.738983 |
19 | 2015-05-13 21:30:00 | NON-CRIMINAL | FOUND PROPERTY | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
20 | 2015-05-13 21:30:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | INGLESIDE | NONE | 200 Block of EVELYN WY | -122.449389 | 37.742669 |
21 | 2015-05-13 21:17:00 | ROBBERY | ROBBERY, BODILY FORCE | Wednesday | INGLESIDE | NONE | 1600 Block of VALENCIA ST | -122.420272 | 37.747332 |
22 | 2015-05-13 21:11:00 | WARRANTS | WARRANT ARREST | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
23 | 2015-05-13 21:11:00 | NON-CRIMINAL | STAY AWAY OR COURT ORDER, NON-DV RELATED | Wednesday | TENDERLOIN | NONE | 100 Block of JONES ST | -122.412250 | 37.782556 |
24 | 2015-05-13 21:10:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | FILLMORE ST / LOMBARD ST | -122.436049 | 37.799841 |
25 | 2015-05-13 21:00:00 | NON-CRIMINAL | LOST PROPERTY | Wednesday | TENDERLOIN | NONE | 300 Block of OFARRELL ST | -122.410509 | 37.786043 |
26 | 2015-05-13 21:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | NORTHERN | NONE | 2000 Block of BUSH ST | -122.431018 | 37.787388 |
27 | 2015-05-13 21:00:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Wednesday | INGLESIDE | NONE | 500 Block of COLLEGE AV | -122.423656 | 37.732556 |
28 | 2015-05-13 21:00:00 | LARCENY/THEFT | ATTEMPTED THEFT FROM LOCKED VEHICLE | Wednesday | TARAVAL | NONE | 19TH AV / SANTIAGO ST | -122.475773 | 37.744919 |
29 | 2015-05-13 20:56:00 | OTHER OFFENSES | MISCELLANEOUS INVESTIGATION | Wednesday | TARAVAL | NONE | 2000 Block of 41ST AV | -122.499787 | 37.748518 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
878019 | 2003-01-06 02:37:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | SOUTHERN | ARREST, CITED | 6TH ST / MARKET ST | -122.410294 | 37.782231 |
878020 | 2003-01-06 02:32:00 | OTHER OFFENSES | TRAFFIC VIOLATION | Monday | NORTHERN | ARREST, CITED | VAN NESS AV / TURK ST | -122.420642 | 37.781961 |
878021 | 2003-01-06 02:24:00 | VANDALISM | MALICIOUS MISCHIEF | Monday | NORTHERN | NOT PROSECUTED | SANCHEZ ST / 14TH ST | -122.431191 | 37.767595 |
878022 | 2003-01-06 02:16:00 | VEHICLE THEFT | RECOVERED VEHICLE - STOLEN OUTSIDE SF | Monday | MISSION | NONE | 17TH ST / MISSION ST | -122.419516 | 37.763429 |
878023 | 2003-01-06 02:15:00 | LARCENY/THEFT | GRAND THEFT PICKPOCKET | Monday | TENDERLOIN | NONE | 600 Block of ELLIS ST | -122.416894 | 37.784286 |
878024 | 2003-01-06 02:09:00 | OTHER OFFENSES | VIOLATION OF MUNICIPAL POLICE CODE | Monday | PARK | ARREST, CITED | 600 Block of DIVISADERO ST | -122.437781 | 37.775483 |
878025 | 2003-01-06 02:06:00 | OTHER OFFENSES | TRAFFIC VIOLATION | Monday | BAYVIEW | ARREST, BOOKED | NEWHALL ST / GALVEZ AV | -122.387710 | 37.740674 |
878026 | 2003-01-06 02:06:00 | WARRANTS | WARRANT ARREST | Monday | BAYVIEW | ARREST, BOOKED | NEWHALL ST / GALVEZ AV | -122.387710 | 37.740674 |
878027 | 2003-01-06 02:00:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | SOUTHERN | ARREST, BOOKED | 900 Block of MARKET ST | -122.409708 | 37.782828 |
878028 | 2003-01-06 02:00:00 | ASSAULT | AGGRAVATED ASSAULT WITH BODILY FORCE | Monday | SOUTHERN | NONE | 6TH ST / MARKET ST | -122.410294 | 37.782231 |
878029 | 2003-01-06 01:54:00 | OTHER OFFENSES | PROBATION VIOLATION | Monday | TENDERLOIN | ARREST, BOOKED | 1400 Block of GOLDEN GATE AV | -122.434423 | 37.779193 |
878030 | 2003-01-06 01:54:00 | SEX OFFENSES FORCIBLE | FORCIBLE RAPE, BODILY FORCE | Monday | TENDERLOIN | ARREST, BOOKED | 1400 Block of GOLDEN GATE AV | -122.434423 | 37.779193 |
878031 | 2003-01-06 01:50:00 | ASSAULT | BATTERY | Monday | BAYVIEW | NONE | 3RD ST / NEWCOMB AV | -122.390417 | 37.735593 |
878032 | 2003-01-06 01:36:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | NORTHERN | ARREST, CITED | GEARY BL / FRANKLIN ST | -122.423031 | 37.785482 |
878033 | 2003-01-06 01:30:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Monday | RICHMOND | ARREST, CITED | 1000 Block of 22ND AV | -122.391668 | 37.757793 |
878034 | 2003-01-06 01:30:00 | TRESPASS | TRESPASSING | Monday | RICHMOND | ARREST, CITED | 1000 Block of 22ND AV | -122.391668 | 37.757793 |
878035 | 2003-01-06 00:55:00 | ASSAULT | BATTERY | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
878036 | 2003-01-06 00:55:00 | LARCENY/THEFT | PETTY THEFT SHOPLIFTING | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
878037 | 2003-01-06 00:55:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM | Monday | NORTHERN | NONE | 1300 Block of WEBSTER ST | -122.431046 | 37.783030 |
878038 | 2003-01-06 00:42:00 | WARRANTS | ENROUTE TO OUTSIDE JURISDICTION | Monday | TENDERLOIN | ARREST, BOOKED | TAYLOR ST / GEARY ST | -122.411519 | 37.786941 |
878039 | 2003-01-06 00:40:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | NORTHERN | ARREST, CITED | POLK ST / CALIFORNIA ST | -122.420692 | 37.790577 |
878040 | 2003-01-06 00:33:00 | ASSAULT | INFLICT INJURY ON COHABITEE | Monday | MISSION | NONE | 2800 Block of FOLSOM ST | -122.414073 | 37.751685 |
878041 | 2003-01-06 00:31:00 | OTHER OFFENSES | DRIVERS LICENSE, SUSPENDED OR REVOKED | Monday | RICHMOND | ARREST, CITED | CLEMENT ST / 14TH AV | -122.472985 | 37.782552 |
878042 | 2003-01-06 00:20:00 | ASSAULT | ATTEMPTED HOMICIDE WITH A GUN | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
878043 | 2003-01-06 00:20:00 | OTHER OFFENSES | PAROLE VIOLATION | Monday | BAYVIEW | ARREST, BOOKED | 1500 Block of SHAFTER AV | -122.389769 | 37.730564 |
878044 | 2003-01-06 00:15:00 | ROBBERY | ROBBERY ON THE STREET WITH A GUN | Monday | TARAVAL | NONE | FARALLONES ST / CAPITOL AV | -122.459033 | 37.714056 |
878045 | 2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | INGLESIDE | NONE | 600 Block of EDNA ST | -122.447364 | 37.731948 |
878046 | 2003-01-06 00:01:00 | LARCENY/THEFT | GRAND THEFT FROM LOCKED AUTO | Monday | SOUTHERN | NONE | 5TH ST / FOLSOM ST | -122.403390 | 37.780266 |
878047 | 2003-01-06 00:01:00 | VANDALISM | MALICIOUS MISCHIEF, VANDALISM OF VEHICLES | Monday | SOUTHERN | NONE | TOWNSEND ST / 2ND ST | -122.390531 | 37.780607 |
878048 | 2003-01-06 00:01:00 | FORGERY/COUNTERFEITING | CHECKS, FORGERY (FELONY) | Monday | BAYVIEW | NONE | 1800 Block of NEWCOMB AV | -122.394926 | 37.738212 |
878049 rows × 9 columns
import pandas as pd
import numpy as np
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on installations that
# predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

# Encode each crime category string as an integer class label.
leCrime = preprocessing.LabelEncoder()
crime = leCrime.fit_transform(train.Category)

# One-hot encode the day of week, the police district and the hour of day.
days = pd.get_dummies(train.DayOfWeek)
district = pd.get_dummies(train.PdDistrict)
hour = train.Dates.dt.hour
hour = pd.get_dummies(hour)

# Combine the indicator columns and append the encoded target as 'crime'.
trainData = pd.concat([hour, days, district], axis=1)
trainData['crime'] = crime

# Apply the same encoding to the test data.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)
hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)
testData = pd.concat([hour, days, district], axis=1)
# The test dummies are built independently of the training dummies, so
# force the same feature columns in the same order; a category that never
# occurs in the test set becomes an all-zero column.
testData = testData.reindex(
    columns=trainData.columns.drop('crime'),
    fill_value=0,
)

# Bare expression: as the last statement of a notebook cell this renders
# the training feature table.
trainData
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | CENTRAL | INGLESIDE | MISSION | NORTHERN | PARK | RICHMOND | SOUTHERN | TARAVAL | TENDERLOIN | crime |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 37 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 21 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 21 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 16 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 16 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 |
7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 16 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 21 |
12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 35 |
13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 16 |
14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 |
15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 |
16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 25 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 20 |
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 |
22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 37 |
23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 20 |
24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 16 |
25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 20 |
26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 16 |
27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 16 |
29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 21 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
878019 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 21 |
878020 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 21 |
878021 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 35 |
878022 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 36 |
878023 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 16 |
878024 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 21 |
878025 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 |
878026 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 |
878027 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 37 |
878028 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
878029 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 21 |
878030 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 28 |
878031 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
878032 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 21 |
878033 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 35 |
878034 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 34 |
878035 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
878036 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 16 |
878037 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 35 |
878038 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 37 |
878039 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 21 |
878040 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
878041 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 21 |
878042 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
878043 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 |
878044 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 25 |
878045 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 |
878046 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 16 |
878047 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 35 |
878048 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 |
878049 rows × 42 columns
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on installations that
# predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import time

# Use only the day-of-week and police-district indicator columns as
# classifier inputs.
features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']

# Split the labelled data: 3/5 for training, the remaining 2/5 held out
# for validation.
training, validation = train_test_split(trainData, train_size=.60)

# Naive Bayes: time the fit, then score the held-out rows with log loss.
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
predicted = model.predict_proba(validation[features])
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("朴素贝叶斯建模耗时 %f 秒" % (nbCostTime))
print("朴素贝叶斯log损失为 %f" % (log_loss(validation['crime'], predicted)))

# Logistic regression: same protocol; only the fit is timed.
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = model.predict_proba(validation[features])
print("逻辑回归建模耗时 %f 秒" % (lrCostTime))
print("逻辑回归log损失为 %f" % (log_loss(validation['crime'], predicted)))
朴素贝叶斯建模耗时 0.655217 秒
朴素贝叶斯log损失为 2.613483
逻辑回归建模耗时 45.460170 秒
逻辑回归log损失为 2.620500
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on installations that
# predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import time

# Add the hour of the crime as extra features on top of the day-of-week
# and police-district indicators.
features = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday',
 'Wednesday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION',
 'NORTHERN', 'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
# The hour columns are selected by the integer labels 0..23.
hourFea = list(range(24))
features = features + hourFea

# Split the labelled data: 3/5 for training, the remaining 2/5 held out
# for validation.
training, validation = train_test_split(trainData, train_size=.60)

# Naive Bayes: time the fit, then score the held-out rows with log loss.
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart
predicted = model.predict_proba(validation[features])
# Single-argument print(...) is valid on both Python 2 and Python 3.
print("朴素贝叶斯建模耗时 %f 秒" % (nbCostTime))
print("朴素贝叶斯log损失为 %f" % (log_loss(validation['crime'], predicted)))

# Logistic regression: same protocol; only the fit is timed.
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = model.predict_proba(validation[features])
print("逻辑回归建模耗时 %f 秒" % (lrCostTime))
print("逻辑回归log损失为 %f" % (log_loss(validation['crime'], predicted)))
朴素贝叶斯建模耗时 1.350199 秒
朴素贝叶斯log损失为 2.582355
逻辑回归建模耗时 60.785606 秒
逻辑回归log损失为 2.591964