!pip install scikit-learn
!pip install --upgrade pixiedust
Requirement already satisfied: scikit-learn in /opt/conda/envs/Python36/lib/python3.6/site-packages (0.20.3) Requirement already satisfied: numpy>=1.8.2 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from scikit-learn) (1.15.4) Requirement already satisfied: scipy>=0.13.3 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from scikit-learn) (1.2.0) Requirement already up-to-date: pixiedust in /opt/conda/envs/Python36/lib/python3.6/site-packages (1.1.18) Requirement already satisfied, skipping upgrade: markdown in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (3.0.1) Requirement already satisfied, skipping upgrade: requests in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (2.21.0) Requirement already satisfied, skipping upgrade: colour in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (0.1.5) Requirement already satisfied, skipping upgrade: mpld3 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (0.3) Requirement already satisfied, skipping upgrade: astunparse in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (1.6.2) Requirement already satisfied, skipping upgrade: lxml in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (4.3.1) Requirement already satisfied, skipping upgrade: geojson in /opt/conda/envs/Python36/lib/python3.6/site-packages (from pixiedust) (2.4.1) Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from requests->pixiedust) (3.0.4) Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.21.1 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from requests->pixiedust) (1.24.1) Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from requests->pixiedust) (2.8) Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from requests->pixiedust) (2019.11.28) Requirement already satisfied, skipping upgrade: wheel<1.0,>=0.23.0 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from astunparse->pixiedust) (0.32.3) Requirement already satisfied, skipping upgrade: six<2.0,>=1.6.1 in /opt/conda/envs/Python36/lib/python3.6/site-packages (from astunparse->pixiedust) (1.12.0)
import pixiedust
import sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.io import arff
import brunel
from watson_machine_learning_client import WatsonMachineLearningAPIClient
raw_df = pixiedust.sampleData('https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv')
Downloading 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv' from https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv Downloaded 463947 bytes Creating pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'. Please wait... Loading file using 'pandas' Successfully created pandas DataFrame for 'https://raw.githubusercontent.com/apischdo/skillsacademy/master/Denormalized%20claims%20data.csv'
display(raw_df)
%brunel data("raw_df") x(FLAG_FOR_FRAUD_INV) y(CLAIM_AMOUNT)
rel_cols = ['HOUSEHOLD_ID','DRIVER_ID','POLICY_ID','CLAIM_ID','LOSS_EVENT_TIME','CLAIM_INIT_TIME', 'ODOMETER_AT_LOSS','CLAIMS_AT_LOSS_DATE','LOSS_LOCATION_LAT','LOSS_LOCATION_LONG','CLAIM_AMOUNT','FLAG_FOR_FRAUD_INV']
df_claim = raw_df[rel_cols]
df_claim
HOUSEHOLD_ID | DRIVER_ID | POLICY_ID | CLAIM_ID | LOSS_EVENT_TIME | CLAIM_INIT_TIME | ODOMETER_AT_LOSS | CLAIMS_AT_LOSS_DATE | LOSS_LOCATION_LAT | LOSS_LOCATION_LONG | CLAIM_AMOUNT | FLAG_FOR_FRAUD_INV | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CH42335 | XZJ2837 | NW5567882 | A-2017-UU907 | 2017-04-25 00:00:00.0 | 2017-04-28 00:00:00.0 | 157654.9 | 1 | 41.902103 | -87.755624 | 35765.0 | 1 |
1 | IH49805 | VVR6423 | UR4864804 | A-2018-FI481 | 2018-08-26 00:00:00.0 | 2018-08-31 00:00:00.0 | 226154.5 | 1 | 41.963562 | -87.731397 | 1909.0 | 0 |
2 | AF28736 | UQM2512 | RR8595908 | A-2016-ZG694 | 2016-01-07 00:00:00.0 | 2016-01-11 00:00:00.0 | 83968.6 | 2 | 41.736602 | -87.604968 | 25730.0 | 1 |
3 | EF53594 | YDT5591 | RN5640634 | A-2016-NG783 | 2016-12-11 00:00:00.0 | 2016-12-18 00:00:00.0 | 309570.3 | 1 | 41.909925 | -87.731557 | 40880.0 | 1 |
4 | LD32277 | ONM5465 | YY1229530 | A-2017-ZO863 | 2017-06-06 00:00:00.0 | 2017-06-08 00:00:00.0 | 136633.9 | 1 | 41.923750 | -87.789881 | 2130.0 | 0 |
5 | DM94074 | GBU7751 | XP3473763 | A-2018-XB432 | 2018-03-06 00:00:00.0 | 2018-03-19 00:00:00.0 | 326514.1 | 1 | 41.909257 | -87.785057 | 1970.0 | 0 |
6 | MD38210 | CBR4335 | US5444269 | A-2017-XP758 | 2017-08-13 00:00:00.0 | 2017-08-18 00:00:00.0 | 58477.9 | 1 | 41.928395 | -87.796468 | 2290.0 | 0 |
7 | GL77908 | HZF3884 | XR1994270 | A-2017-QY946 | 2017-02-10 00:00:00.0 | 2017-02-18 00:00:00.0 | 176476.9 | 1 | 41.885716 | -87.728376 | 2990.0 | 0 |
8 | BA26199 | CSE9523 | VP6368585 | A-2018-LB818 | 2018-02-16 00:00:00.0 | 2018-02-24 00:00:00.0 | 277812.7 | 1 | 41.895054 | -87.745662 | 1170.0 | 0 |
9 | EA38976 | HFX7408 | YP9758006 | A-2016-QD338 | 2016-12-28 00:00:00.0 | 2017-01-11 00:00:00.0 | 190541.3 | 1 | 41.891976 | -87.614580 | 2090.0 | 0 |
10 | GB64343 | PXE3728 | XY6800348 | A-2017-CK710 | 2017-08-12 00:00:00.0 | 2017-08-25 00:00:00.0 | 290975.1 | 1 | 41.911223 | -87.638656 | 2130.0 | 0 |
11 | JG99629 | OKH5337 | ZK6994471 | A-2018-WF114 | 2018-07-30 00:00:00.0 | 2018-07-30 00:00:00.0 | 159873.2 | 5 | 41.735835 | -87.667866 | 33040.0 | 1 |
12 | FH81231 | RQZ1566 | ZR4462879 | A-2018-VT470 | 2018-02-22 00:00:00.0 | 2018-02-26 00:00:00.0 | 391866.9 | 1 | 41.734164 | -87.551211 | 2400.0 | 0 |
13 | KI98597 | VRX5780 | ZW4263453 | A-2017-SA279 | 2017-02-06 00:00:00.0 | 2017-02-06 00:00:00.0 | 384000.6 | 3 | 41.791852 | -87.801378 | 21330.0 | 1 |
14 | CG61685 | TIA1702 | UW8176531 | A-2018-EJ368 | 2018-07-10 00:00:00.0 | 2018-07-18 00:00:00.0 | 251644.8 | 1 | 42.005931 | -87.680660 | 3427.0 | 0 |
15 | AB12181 | OBD8151 | RW8652538 | A-2017-CT587 | 2017-07-29 00:00:00.0 | 2017-08-10 00:00:00.0 | 300279.3 | 1 | 41.693401 | -87.612322 | 35385.5 | 1 |
16 | BJ63668 | WDC2749 | PQ3189850 | A-2017-CU817 | 2017-12-30 00:00:00.0 | 2018-01-03 00:00:00.0 | 73419.5 | 1 | 41.840145 | -87.661339 | 1570.0 | 0 |
17 | MG83193 | XJB5149 | XN9015815 | A-2018-LF972 | 2018-01-16 00:00:00.0 | 2018-01-28 00:00:00.0 | 118923.9 | 1 | 41.975175 | -87.768344 | 2370.0 | 0 |
18 | IJ01709 | CRP8660 | SV1659058 | A-2017-ML677 | 2017-04-25 00:00:00.0 | 2017-05-07 00:00:00.0 | 182681.3 | 1 | 41.890173 | -87.649992 | 26250.0 | 1 |
19 | BC94182 | PEQ6155 | UQ8163328 | A-2018-WZ492 | 2018-02-10 00:00:00.0 | 2018-02-19 00:00:00.0 | 160429.9 | 3 | 41.840658 | -87.725899 | 25830.0 | 1 |
20 | MF96553 | PPI8400 | OT0287150 | A-2018-DI451 | 2018-01-23 00:00:00.0 | 2018-01-24 00:00:00.0 | 165363.7 | 4 | 41.884705 | -87.667046 | 23483.0 | 1 |
21 | BC90853 | JOA9170 | TW2172802 | A-2016-RW653 | 2016-12-01 00:00:00.0 | 2016-12-10 00:00:00.0 | 240639.2 | 1 | 41.736183 | -87.643660 | 2173.5 | 0 |
22 | AK42388 | KNQ4268 | PP7360289 | A-2017-KO812 | 2017-07-12 00:00:00.0 | 2017-07-12 00:00:00.0 | 388591.3 | 1 | 41.762030 | -87.611696 | 1750.0 | 0 |
23 | HB16909 | SRU2950 | OY6516260 | A-2016-CA740 | 2016-01-06 00:00:00.0 | 2016-01-08 00:00:00.0 | 59498.7 | 1 | 42.006422 | -87.678120 | 20310.0 | 1 |
24 | FA97534 | EHL9301 | OS6708536 | A-2018-PV577 | 2018-10-13 00:00:00.0 | 2018-10-25 00:00:00.0 | 94043.6 | 1 | 41.813602 | -87.704338 | 2810.0 | 0 |
25 | CF57572 | CJY6373 | QV4842191 | A-2017-JI571 | 2017-07-19 00:00:00.0 | 2017-07-19 00:00:00.0 | 69305.4 | 1 | 41.996924 | -87.806945 | 2370.0 | 0 |
26 | ME37855 | NCL2868 | VT9807337 | A-2018-CK215 | 2018-11-04 00:00:00.0 | 2018-11-13 00:00:00.0 | 207764.4 | 1 | 41.952228 | -87.727425 | 1320.0 | 0 |
27 | LA79549 | YWA1319 | PU5439335 | A-2017-FJ217 | 2017-05-31 00:00:00.0 | 2017-06-12 00:00:00.0 | 302710.0 | 5 | 41.976895 | -87.692280 | 30840.0 | 1 |
28 | II25056 | CVI8835 | OJ5466096 | A-2017-YI133 | 2017-03-01 00:00:00.0 | 2017-03-13 00:00:00.0 | 109080.0 | 3 | 41.880781 | -87.730232 | 1230.5 | 0 |
29 | CK11297 | HIR9014 | RR5447110 | A-2018-QW132 | 2018-09-05 00:00:00.0 | 2018-09-11 00:00:00.0 | 69640.2 | 1 | 41.892753 | -87.760709 | 1020.0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
945 | FH69052 | GSN4940 | VR3029092 | A-2018-IZ938 | 2018-08-01 00:00:00.0 | 2018-08-09 00:00:00.0 | 588158.9 | 1 | 41.997861 | -87.688605 | 1160.0 | 0 |
946 | LD06091 | EGF8629 | TQ4763765 | A-2016-DR889 | 2016-09-12 00:00:00.0 | 2016-09-12 00:00:00.0 | 220827.8 | 1 | 41.722449 | -87.585004 | 20550.0 | 1 |
947 | HA14850 | QAO5663 | XV0936944 | A-2018-XV535 | 2018-09-29 00:00:00.0 | 2018-10-02 00:00:00.0 | 146983.7 | 2 | 41.837622 | -87.665887 | 2620.0 | 0 |
948 | LJ86931 | YXF8421 | XO1420265 | A-2017-SS668 | 2017-07-02 00:00:00.0 | 2017-07-11 00:00:00.0 | 149332.0 | 1 | 41.810689 | -87.711095 | 2460.0 | 0 |
949 | AG86132 | VDU5872 | QP0855306 | A-2018-CY495 | 2018-09-23 00:00:00.0 | 2018-09-23 00:00:00.0 | 62842.7 | 2 | 41.922121 | -87.630243 | 1180.0 | 0 |
950 | GE81740 | HFZ7313 | QW3622319 | A-2016-BI407 | 2016-11-11 00:00:00.0 | 2016-11-18 00:00:00.0 | 214021.1 | 1 | 41.753357 | -87.702643 | 2080.0 | 0 |
951 | LK14209 | JDC8664 | US7928033 | A-2016-NI814 | 2016-01-11 00:00:00.0 | 2016-01-18 00:00:00.0 | 172955.9 | 1 | 41.866254 | -87.666524 | 13820.0 | 1 |
952 | EI71996 | QBV9830 | YU2221856 | A-2018-CU529 | 2018-06-11 00:00:00.0 | 2018-06-23 00:00:00.0 | 293880.1 | 1 | 41.858883 | -87.693363 | 29600.0 | 1 |
953 | IF40191 | LFW8264 | YY7862368 | A-2017-YB190 | 2017-03-08 00:00:00.0 | 2017-03-18 00:00:00.0 | 118784.9 | 2 | 41.911261 | -87.679055 | 1000.0 | 0 |
954 | EH17979 | EUT2904 | SP1179929 | A-2017-RM621 | 2017-07-12 00:00:00.0 | 2017-07-20 00:00:00.0 | 227731.0 | 1 | 41.857835 | -87.666287 | 37200.0 | 1 |
955 | HC35643 | PBW7429 | RZ2794532 | A-2018-KQ119 | 2018-10-14 00:00:00.0 | 2018-10-22 00:00:00.0 | 157867.5 | 3 | 41.968902 | -87.660532 | 23980.0 | 1 |
956 | JF75578 | SGO7528 | SV1869698 | A-2016-EC320 | 2016-05-25 00:00:00.0 | 2016-06-01 00:00:00.0 | 454638.7 | 4 | 41.708182 | -87.566535 | 1587.0 | 0 |
957 | DA02226 | IRJ6787 | VP1026284 | A-2017-EW172 | 2017-09-25 00:00:00.0 | 2017-09-28 00:00:00.0 | 237136.2 | 5 | 41.823685 | -87.621747 | 2530.0 | 1 |
958 | AK18614 | BKZ8467 | ZT7214353 | A-2018-CL216 | 2018-01-10 00:00:00.0 | 2018-01-15 00:00:00.0 | 325206.2 | 1 | 41.754660 | -87.741385 | 28050.0 | 1 |
959 | HI45009 | SNZ2467 | PZ8927119 | A-2018-DS775 | 2018-08-28 00:00:00.0 | 2018-08-30 00:00:00.0 | 112831.5 | 1 | 41.746195 | -87.551308 | 1350.0 | 0 |
960 | BB70679 | PDF9673 | ZQ9923600 | A-2018-UG847 | 2018-01-21 00:00:00.0 | 2018-01-29 00:00:00.0 | 137111.1 | 2 | 41.886752 | -87.671247 | 13710.0 | 1 |
961 | HG42702 | TYL6617 | OQ2049136 | A-2017-KO841 | 2017-03-05 00:00:00.0 | 2017-03-06 00:00:00.0 | 172496.5 | 1 | 41.765291 | -87.604544 | 32960.0 | 1 |
962 | KK79654 | LIK6490 | VO4861035 | A-2017-OB945 | 2017-10-18 00:00:00.0 | 2017-11-01 00:00:00.0 | 253215.2 | 1 | 41.675067 | -87.638355 | 2120.0 | 0 |
963 | FC64633 | EXS9829 | NU1726936 | A-2017-LI176 | 2017-05-22 00:00:00.0 | 2017-06-02 00:00:00.0 | 342907.1 | 1 | 41.925275 | -87.667124 | 1430.0 | 0 |
964 | FG85355 | SAC4402 | YN8255193 | A-2016-DP856 | 2016-02-06 00:00:00.0 | 2016-02-10 00:00:00.0 | 120261.6 | 1 | 41.976099 | -87.710976 | 1253.5 | 0 |
965 | EL47810 | FND5049 | NY8366901 | A-2018-ZE522 | 2018-04-03 00:00:00.0 | 2018-04-17 00:00:00.0 | 53558.5 | 5 | 41.957653 | -87.705837 | 2730.0 | 1 |
966 | FF77811 | FLR5428 | PZ8862368 | A-2017-TD385 | 2017-11-14 00:00:00.0 | 2017-11-24 00:00:00.0 | 156352.7 | 1 | 41.775257 | -87.640603 | 2250.0 | 0 |
967 | JC25299 | CDZ4102 | SS8155048 | A-2018-FD517 | 2018-03-11 00:00:00.0 | 2018-03-15 00:00:00.0 | 63924.0 | 1 | 41.997355 | -87.765614 | 2070.0 | 0 |
968 | LI38945 | HTQ6817 | NS6540979 | A-2018-UH153 | 2018-08-24 00:00:00.0 | 2018-09-03 00:00:00.0 | 105133.8 | 1 | 41.946016 | -87.764442 | 33210.0 | 1 |
969 | EE18899 | PUN8916 | NY8220476 | A-2017-BI614 | 2017-06-13 00:00:00.0 | 2017-06-25 00:00:00.0 | 299557.9 | 1 | 41.867125 | -87.647065 | 2440.0 | 0 |
970 | CM11612 | GJI9724 | QY3512250 | A-2016-TJ210 | 2016-11-29 00:00:00.0 | 2016-12-01 00:00:00.0 | 77438.7 | 1 | 41.876984 | -87.742824 | 44666.0 | 1 |
971 | HM55275 | ORD4650 | XR9202123 | A-2016-EU678 | 2016-08-14 00:00:00.0 | 2016-08-21 00:00:00.0 | 219285.9 | 1 | 42.002596 | -87.660932 | 2320.0 | 0 |
972 | BA79937 | BLV8655 | TU0769843 | A-2016-MT971 | 2016-08-22 00:00:00.0 | 2016-09-03 00:00:00.0 | 192320.1 | 1 | 41.778640 | -87.722613 | 39307.0 | 1 |
973 | AL29368 | XBH9030 | UU9571126 | A-2017-ZX388 | 2017-01-24 00:00:00.0 | 2017-02-04 00:00:00.0 | 273023.9 | 1 | 41.770738 | -87.722707 | 1720.0 | 0 |
974 | CK27900 | BZQ1079 | SV2796501 | A-2016-GP477 | 2016-07-05 00:00:00.0 | 2016-07-09 00:00:00.0 | 178004.2 | 1 | 41.768244 | -87.619856 | 1050.0 | 0 |
975 rows × 12 columns
rel_cols_2 = ["DRIVER_ID","DRIVERS_LICENSE_EXPIRY","DATE_AT_CURRENT_ADDRESS"]
df_driver = raw_df[rel_cols_2]
df_driver
DRIVER_ID | DRIVERS_LICENSE_EXPIRY | DATE_AT_CURRENT_ADDRESS | |
---|---|---|---|
0 | XZJ2837 | 2018-08-19 | 1999-04-16 |
1 | VVR6423 | 2018-01-27 | 2011-06-09 |
2 | UQM2512 | 2019-11-19 | 2005-05-21 |
3 | YDT5591 | 2019-05-16 | 2000-03-11 |
4 | ONM5465 | 2020-07-06 | 2012-07-04 |
5 | GBU7751 | 2021-09-26 | 2001-07-15 |
6 | CBR4335 | 2019-05-10 | 2004-04-08 |
7 | HZF3884 | 2019-04-26 | 1999-02-02 |
8 | CSE9523 | 2021-04-15 | 2005-03-07 |
9 | HFX7408 | 2019-08-07 | 2012-03-04 |
10 | PXE3728 | 2019-07-02 | 2017-05-03 |
11 | OKH5337 | 2018-03-29 | 2008-07-06 |
12 | RQZ1566 | 2018-04-01 | 2012-05-15 |
13 | VRX5780 | 2019-07-13 | 2007-01-14 |
14 | TIA1702 | 2020-04-25 | 2009-09-25 |
15 | OBD8151 | 2019-04-13 | 1999-07-04 |
16 | WDC2749 | 2020-07-15 | 2002-05-21 |
17 | XJB5149 | 2018-07-31 | 2012-07-16 |
18 | CRP8660 | 2019-09-28 | 2004-11-26 |
19 | PEQ6155 | 2021-07-22 | 2007-02-23 |
20 | PPI8400 | 2021-11-26 | 2006-09-18 |
21 | JOA9170 | 2020-09-09 | 1999-05-16 |
22 | KNQ4268 | 2021-03-10 | 2008-02-12 |
23 | SRU2950 | 2021-01-22 | 2006-11-01 |
24 | EHL9301 | 2019-10-07 | 2001-03-12 |
25 | CJY6373 | 2021-10-25 | 2000-01-07 |
26 | NCL2868 | 2019-05-16 | 2005-05-05 |
27 | YWA1319 | 2019-04-28 | 2012-11-27 |
28 | CVI8835 | 2018-12-08 | 2015-08-19 |
29 | HIR9014 | 2021-11-10 | 2014-07-18 |
... | ... | ... | ... |
945 | GSN4940 | 2018-01-03 | 2015-07-10 |
946 | EGF8629 | 2020-04-28 | 2016-02-27 |
947 | QAO5663 | 2018-01-30 | 2014-09-02 |
948 | YXF8421 | 2018-06-16 | 2006-09-05 |
949 | VDU5872 | 2018-01-30 | 2007-04-04 |
950 | HFZ7313 | 2018-10-06 | 2014-08-07 |
951 | JDC8664 | 2018-08-11 | 2017-02-04 |
952 | QBV9830 | 2021-11-19 | 2015-06-25 |
953 | LFW8264 | 2021-03-26 | 2006-07-03 |
954 | EUT2904 | 2021-12-10 | 2008-02-23 |
955 | PBW7429 | 2020-09-08 | 2017-05-13 |
956 | SGO7528 | 2020-03-24 | 1999-03-18 |
957 | IRJ6787 | 2021-11-13 | 2013-06-19 |
958 | BKZ8467 | 2021-11-25 | 2004-04-06 |
959 | SNZ2467 | 2021-02-14 | 2003-07-22 |
960 | PDF9673 | 2020-01-01 | 2005-03-27 |
961 | TYL6617 | 2020-08-21 | 2014-04-27 |
962 | LIK6490 | 2020-03-14 | 2010-03-17 |
963 | EXS9829 | 2021-05-08 | 2009-03-10 |
964 | SAC4402 | 2021-03-06 | 2004-05-19 |
965 | FND5049 | 2018-07-25 | 2012-01-07 |
966 | FLR5428 | 2018-04-03 | 2002-09-14 |
967 | CDZ4102 | 2021-06-03 | 2009-02-16 |
968 | HTQ6817 | 2020-03-17 | 2013-07-25 |
969 | PUN8916 | 2019-11-26 | 2013-01-19 |
970 | GJI9724 | 2018-06-23 | 2015-06-21 |
971 | ORD4650 | 2019-08-16 | 2014-05-11 |
972 | BLV8655 | 2019-12-24 | 2014-01-14 |
973 | XBH9030 | 2020-09-17 | 2010-02-11 |
974 | BZQ1079 | 2020-07-22 | 2003-09-12 |
975 rows × 3 columns
rel_cols_3 = ["POLICY_ID","EXPIRY_DATE","START_DATE","LOW_MILEAGE_USE"]
df_policy = raw_df[rel_cols_3]
df_policy
POLICY_ID | EXPIRY_DATE | START_DATE | LOW_MILEAGE_USE | |
---|---|---|---|---|
0 | NW5567882 | 2017-09-19 | 2016-09-19 | 0 |
1 | UR4864804 | 2018-06-14 | 2017-06-14 | 0 |
2 | RR8595908 | 2016-08-23 | 2015-08-24 | 1 |
3 | RN5640634 | 2016-07-24 | 2015-07-25 | 0 |
4 | YY1229530 | 2017-01-07 | 2016-01-08 | 0 |
5 | XP3473763 | 2018-01-15 | 2017-01-15 | 0 |
6 | US5444269 | 2017-07-08 | 2016-07-08 | 1 |
7 | XR1994270 | 2017-08-11 | 2016-08-11 | 0 |
8 | VP6368585 | 2018-10-13 | 2017-10-13 | 0 |
9 | YP9758006 | 2016-06-22 | 2015-06-23 | 0 |
10 | XY6800348 | 2017-04-14 | 2016-04-14 | 0 |
11 | ZK6994471 | 2018-02-22 | 2017-02-22 | 0 |
12 | ZR4462879 | 2018-01-10 | 2017-01-10 | 0 |
13 | ZW4263453 | 2017-06-04 | 2016-06-04 | 0 |
14 | UW8176531 | 2018-04-10 | 2017-04-10 | 0 |
15 | RW8652538 | 2017-11-25 | 2016-11-25 | 0 |
16 | PQ3189850 | 2017-04-06 | 2016-04-06 | 1 |
17 | XN9015815 | 2018-11-21 | 2017-11-21 | 1 |
18 | SV1659058 | 2017-07-23 | 2016-07-23 | 0 |
19 | UQ8163328 | 2018-07-11 | 2017-07-11 | 0 |
20 | OT0287150 | 2018-04-23 | 2017-04-23 | 0 |
21 | TW2172802 | 2016-08-19 | 2015-08-20 | 0 |
22 | PP7360289 | 2017-04-17 | 2016-04-17 | 0 |
23 | OY6516260 | 2016-04-17 | 2015-04-18 | 0 |
24 | OS6708536 | 2018-10-01 | 2017-10-01 | 0 |
25 | QV4842191 | 2017-03-06 | 2016-03-06 | 0 |
26 | VT9807337 | 2018-01-14 | 2017-01-14 | 0 |
27 | PU5439335 | 2017-04-04 | 2016-04-04 | 0 |
28 | OJ5466096 | 2017-05-20 | 2016-05-20 | 0 |
29 | RR5447110 | 2018-07-11 | 2017-07-11 | 0 |
... | ... | ... | ... | ... |
945 | VR3029092 | 2018-06-08 | 2017-06-08 | 0 |
946 | TQ4763765 | 2016-03-08 | 2015-03-09 | 0 |
947 | XV0936944 | 2018-07-21 | 2017-07-21 | 0 |
948 | XO1420265 | 2017-04-03 | 2016-04-03 | 0 |
949 | QP0855306 | 2018-06-22 | 2017-06-22 | 0 |
950 | QW3622319 | 2016-06-08 | 2015-06-09 | 0 |
951 | US7928033 | 2016-06-01 | 2015-06-02 | 0 |
952 | YU2221856 | 2018-08-24 | 2017-08-24 | 0 |
953 | YY7862368 | 2017-04-25 | 2016-04-25 | 0 |
954 | SP1179929 | 2017-08-20 | 2016-08-20 | 0 |
955 | RZ2794532 | 2018-10-03 | 2017-10-03 | 0 |
956 | SV1869698 | 2016-09-08 | 2015-09-09 | 0 |
957 | VP1026284 | 2017-09-22 | 2016-09-22 | 0 |
958 | ZT7214353 | 2018-11-23 | 2017-11-23 | 0 |
959 | PZ8927119 | 2018-03-10 | 2017-03-10 | 1 |
960 | ZQ9923600 | 2018-08-02 | 2017-08-02 | 0 |
961 | OQ2049136 | 2017-07-18 | 2016-07-18 | 0 |
962 | VO4861035 | 2017-10-04 | 2016-10-04 | 0 |
963 | NU1726936 | 2017-02-09 | 2016-02-10 | 0 |
964 | YN8255193 | 2016-03-13 | 2015-03-14 | 0 |
965 | NY8366901 | 2018-05-16 | 2017-05-16 | 0 |
966 | PZ8862368 | 2017-08-09 | 2016-08-09 | 0 |
967 | SS8155048 | 2018-05-24 | 2017-05-24 | 0 |
968 | NS6540979 | 2018-09-22 | 2017-09-22 | 0 |
969 | NY8220476 | 2017-10-25 | 2016-10-25 | 0 |
970 | QY3512250 | 2016-07-11 | 2015-07-12 | 0 |
971 | XR9202123 | 2016-07-20 | 2015-07-21 | 0 |
972 | TU0769843 | 2016-08-20 | 2015-08-21 | 0 |
973 | UU9571126 | 2017-04-21 | 2016-04-21 | 0 |
974 | SV2796501 | 2016-01-10 | 2015-01-10 | 0 |
975 rows × 4 columns
df_f = pd.merge(df_claim, df_driver, on='DRIVER_ID', how='left')
df_f = pd.merge(df_f,df_policy, on = 'POLICY_ID', how = 'inner')
df_f.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 975 entries, 0 to 974 Data columns (total 17 columns): HOUSEHOLD_ID 975 non-null object DRIVER_ID 975 non-null object POLICY_ID 975 non-null object CLAIM_ID 975 non-null object LOSS_EVENT_TIME 975 non-null object CLAIM_INIT_TIME 975 non-null object ODOMETER_AT_LOSS 975 non-null float64 CLAIMS_AT_LOSS_DATE 975 non-null int64 LOSS_LOCATION_LAT 975 non-null float64 LOSS_LOCATION_LONG 975 non-null float64 CLAIM_AMOUNT 975 non-null float64 FLAG_FOR_FRAUD_INV 975 non-null int64 DRIVERS_LICENSE_EXPIRY 975 non-null object DATE_AT_CURRENT_ADDRESS 975 non-null object EXPIRY_DATE 975 non-null object START_DATE 975 non-null object LOW_MILEAGE_USE 975 non-null int64 dtypes: float64(4), int64(3), object(10) memory usage: 137.1+ KB
# Claim within 15 days of policy expiry (date of loss - insurance_policy.expiry)
df_f["EXPIRY_DATE"] = pd.to_datetime(df_f["EXPIRY_DATE"])
df_f["LOSS_EVENT_TIME"] = pd.to_datetime(df_f["LOSS_EVENT_TIME"])
df_f["DAYS_FROM_LOSS"] = df_f["LOSS_EVENT_TIME"] - df_f["EXPIRY_DATE"]
df_f["DAYS_FROM_LOSS"] = abs(df_f.DAYS_FROM_LOSS.dt.days)
df_f.loc[df_f['DAYS_FROM_LOSS'] >= 15, 'SUSPICIOUS_CLAIM_TIME'] = 1
df_f.loc[df_f['DAYS_FROM_LOSS'] < 15, 'SUSPICIOUS_CLAIM_TIME'] = 0
df_f["SUSPICIOUS_CLAIM_TIME"].value_counts()
1.0 915 0.0 60 Name: SUSPICIOUS_CLAIM_TIME, dtype: int64
# Expired drivers license (if date of loss > insurance_driver.drivers_license_expiry)
df_f["DRIVERS_LICENSE_EXPIRY"] = pd.to_datetime(df_f["DRIVERS_LICENSE_EXPIRY"])
df_f["DAYS_FROM_L_EXPIRY"] = df_f["DRIVERS_LICENSE_EXPIRY"] - df_f["LOSS_EVENT_TIME"]
df_f["DAYS_FROM_L_EXPIRY"] = df_f.DAYS_FROM_L_EXPIRY.dt.days
df_f.loc[df_f['DAYS_FROM_L_EXPIRY'] >= 0, 'EXPIRED_LICENSE'] = 0
df_f.loc[df_f['DAYS_FROM_L_EXPIRY'] < 0, 'EXPIRED_LICENSE'] = 1
# Days living at current address (date of loss - insurance_driver.date_at_current_address)
df_f["DATE_AT_CURRENT_ADDRESS"] = pd.to_datetime(df_f["DATE_AT_CURRENT_ADDRESS"])
df_f["DAYS_AT_ADDRESS"] = df_f["LOSS_EVENT_TIME"] - df_f["DATE_AT_CURRENT_ADDRESS"]
df_f["DAYS_AT_ADDRESS"] = abs(df_f.DAYS_AT_ADDRESS.dt.days)
df_f.loc[df_f['DAYS_AT_ADDRESS'] >= 15, 'SUSPICIOUS_LIVING'] = 1
df_f.loc[df_f['DAYS_AT_ADDRESS'] < 15, 'SUSPICIOUS_LIVING'] = 0
#7500/year
df_f["START_DATE"] = pd.to_datetime(df_f["START_DATE"])
#find number of days between policy creation and accident
df_f["LENGTH_OF_POLICY"]=(df_f["LOSS_EVENT_TIME"] - df_f["START_DATE"]).dt.days
#convert to years
df_f["LENGTH_OF_POLICY"]=df_f["LENGTH_OF_POLICY"]/365
#divide Odometer at loss by years
df_f["MILES/YEAR"] = df_f["ODOMETER_AT_LOSS"]/df_f["LENGTH_OF_POLICY"]
df_f["MILES/YEAR"].value_counts()
145432.118234 1 234953.253233 1 55515.574648 1 123525.401515 1 198472.566993 1 41161.635017 1 316044.542945 1 343652.573991 1 357034.135714 1 444934.671171 1 47761.055147 1 106608.700207 1 280168.774737 1 97407.709325 1 83715.343421 1 836054.660714 1 225418.465704 1 119793.802198 1 49174.210227 1 154269.516340 1 87012.862810 1 55960.766821 1 340284.423841 1 55199.213855 1 158134.051821 1 148190.243875 1 128341.984375 1 388218.025735 1 38327.581707 1 311332.643229 1 .. 138485.866667 1 355749.475610 1 206160.525547 1 172133.553061 1 39974.457997 1 129429.385563 1 227492.647193 1 581033.280000 1 213403.812808 1 129338.850177 1 433650.444700 1 428016.279255 1 102579.963602 1 924887.694444 1 258935.458015 1 219977.172330 1 167467.854637 1 110428.874687 1 140508.777778 1 189399.690217 1 54898.636569 1 42573.963602 1 153467.345029 1 901214.887681 1 63433.405908 1 98941.082789 1 100299.475188 1 61050.847328 1 54548.889011 1 170432.357027 1 Name: MILES/YEAR, Length: 975, dtype: int64
# Conflict on whether a policyholder with a low mileage discount experienced a loss with high mileage at the point of loss
df_f.loc[df_f["MILES/YEAR"] <7500, 'LOW_MILEAGE_AT_LOSS'] = 1
df_f.loc[df_f["MILES/YEAR"] >=7500, 'LOW_MILEAGE_AT_LOSS'] = 0
df_f.loc[df_f["LOW_MILEAGE_USE"]==df_f["LOW_MILEAGE_AT_LOSS"], 'SUSPICIOUS_MILEAGE'] = 0
df_f.loc[df_f["LOW_MILEAGE_USE"]!= df_f["LOW_MILEAGE_AT_LOSS"], 'SUSPICIOUS_MILEAGE'] = 1
df_f.loc[df_f["CLAIM_AMOUNT"] <3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 0
df_f.loc[df_f["CLAIM_AMOUNT"] >=3000, 'EXCESSIVE_CLAIM_AMOUNT'] = 1
features = ['FLAG_FOR_FRAUD_INV',
'SUSPICIOUS_MILEAGE',
'EXPIRED_LICENSE',
'SUSPICIOUS_CLAIM_TIME',
'SUSPICIOUS_LIVING',
'EXCESSIVE_CLAIM_AMOUNT']
df_model = df_f[features]
#ensure all relevant features are integers
df_model["SUSPICIOUS_LIVING"] = df_model["SUSPICIOUS_LIVING"].astype(int)
df_model["EXPIRED_LICENSE"] = df_model["EXPIRED_LICENSE"].astype(int)
df_model["SUSPICIOUS_CLAIM_TIME"] = df_model["SUSPICIOUS_CLAIM_TIME"].astype(int)
df_model["SUSPICIOUS_MILEAGE"] = df_model["SUSPICIOUS_MILEAGE"].astype(int)
df_model["EXCESSIVE_CLAIM_AMOUNT"] = df_model["EXCESSIVE_CLAIM_AMOUNT"].astype(int)
df_f.groupby("FLAG_FOR_FRAUD_INV", as_index=False).mean()
FLAG_FOR_FRAUD_INV | ODOMETER_AT_LOSS | CLAIMS_AT_LOSS_DATE | LOSS_LOCATION_LAT | LOSS_LOCATION_LONG | CLAIM_AMOUNT | LOW_MILEAGE_USE | DAYS_FROM_LOSS | SUSPICIOUS_CLAIM_TIME | DAYS_FROM_L_EXPIRY | EXPIRED_LICENSE | DAYS_AT_ADDRESS | SUSPICIOUS_LIVING | LENGTH_OF_POLICY | MILES/YEAR | LOW_MILEAGE_AT_LOSS | SUSPICIOUS_MILEAGE | EXCESSIVE_CLAIM_AMOUNT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 180783.846046 | 1.492091 | 41.857895 | -87.675014 | 2521.147627 | 0.094903 | 117.021090 | 0.933216 | 894.546573 | 0.042179 | 3487.956063 | 0.996485 | 1.070159 | 205170.356701 | 0.0 | 0.094903 | 0.086116 |
1 | 1 | 175504.046305 | 1.758621 | 41.860133 | -87.679617 | 26429.480296 | 0.088670 | 118.160099 | 0.945813 | 917.960591 | 0.036946 | 3252.413793 | 1.000000 | 0.960490 | 230687.578997 | 0.0 | 0.088670 | 0.933498 |
#split data into x and y variables
xVar = df_model[["EXPIRED_LICENSE","SUSPICIOUS_CLAIM_TIME","SUSPICIOUS_LIVING","SUSPICIOUS_MILEAGE","EXCESSIVE_CLAIM_AMOUNT"]]
yVar = df_model["FLAG_FOR_FRAUD_INV"]
xVar.head()
EXPIRED_LICENSE | SUSPICIOUS_CLAIM_TIME | SUSPICIOUS_LIVING | SUSPICIOUS_MILEAGE | EXCESSIVE_CLAIM_AMOUNT | |
---|---|---|---|---|---|
0 | 0 | 1 | 1 | 0 | 1 |
1 | 1 | 1 | 1 | 0 | 0 |
2 | 0 | 1 | 1 | 1 | 1 |
3 | 0 | 1 | 1 | 0 | 1 |
4 | 0 | 1 | 1 | 0 | 0 |
#split into a test/train set
X_train, X_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)
(780, 5) (780,)
(195, 5) (195,)
#train model
clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(X_train, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
oob_score=False, random_state=0, verbose=0, warm_start=False)
#create confusion matrix to gut check model
preds = clf.predict(X_test)
pd.crosstab(y_test, preds, rownames=['Actual Result'], colnames=['Predicted Result'])
(780, 5) (780,) (195, 5) (195,)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-32-e59721bf942e> in <module> 3 print (X_train.shape, y_train.shape) 4 print (X_test.shape, y_test.shape) ----> 5 (780, 5) (780,) 6 (195, 5) (195,) 7 #train model TypeError: 'tuple' object is not callable