import pandas as pd
# Load the transaction records from the CSV into a DataFrame and preview
# the first five rows to check that the columns parsed as expected.
data = pd.read_csv('delaware_anomaly.csv')
data.head()
| | FISCAL_YEAR | FISCAL_PERIOD | DEPT_NAME | DIV_NAME | MERCHANT | CAT_DESCR | TRANS_DT | MERCHANDISE_AMT |
---|---|---|---|---|---|---|---|---|
0 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | DOUBLETREE HOTELS WASHING | LODGING | 6/3/2014 | -5.50 |
1 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 18.31 |
2 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 21.61 |
3 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 22.39 |
4 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 5.44 |
# Count transactions per merchant. The output reports 5802 distinct merchant
# strings, which is why MERCHANT is passed as a high-cardinality feature to
# setup() below instead of being one-hot encoded.
data['MERCHANT'].value_counts()
WB MASON                     293
AMAZON.COM                   211
VZWRLSS*IVR VB               166
AMAZON MKTPLACE PMTS         163
USPS PO 0917600901           157
                            ...
SQ *UDUMAH TAXI GOSQ.COM       1
MEARS LUXURY DIVISION          1
STAPLS0154393874000001         1
STAPLS0176291352000001         1
AMTRAK .COM 0540725569664      1
Name: MERCHANT, Length: 5802, dtype: int64
# Report the (rows, columns) dimensions of the raw dataset.
data.shape
(15156, 8)
# Inspect the inferred column dtypes. Per the output, TRANS_DT is loaded as
# `object` (plain strings), not as a datetime column.
data.dtypes
FISCAL_YEAR          int64
FISCAL_PERIOD        int64
DEPT_NAME           object
DIV_NAME            object
MERCHANT            object
CAT_DESCR           object
TRANS_DT            object
MERCHANDISE_AMT    float64
dtype: object
from pycaret.anomaly import *
# Initialise the PyCaret anomaly-detection experiment on the raw DataFrame.
#   session_id=123            -> fixed session id, echoed back in the summary
#                                (presumably the random seed; confirm in the
#                                PyCaret docs).
#   ignore_features           -> CAT_DESCR is excluded from the feature matrix.
#   high_cardinality_features -> MERCHANT is handled as high-cardinality; the
#                                summary reports the "frequency" method, and
#                                MERCHANT stays a single column in X.
#   numeric_features          -> FISCAL_YEAR and FISCAL_PERIOD are declared
#                                numeric explicitly rather than inferred.
# Per the summary table, the transformed data has shape (15156, 53).
s = setup(data, session_id = 123,
ignore_features = ['CAT_DESCR'], high_cardinality_features= ['MERCHANT'],
numeric_features = ['FISCAL_YEAR', 'FISCAL_PERIOD' ])
| | Description | Value |
---|---|---|
0 | session_id | 123 |
1 | Original Data | (15156, 8) |
2 | Missing Values | False |
3 | Numeric Features | 3 |
4 | Categorical Features | 3 |
5 | Ordinal Features | False |
6 | High Cardinality Features | True |
7 | High Cardinality Method | frequency |
8 | Transformed Data | (15156, 53) |
9 | CPU Jobs | -1 |
10 | Use GPU | False |
11 | Log Experiment | False |
12 | Experiment Name | anomaly-default-name |
13 | USI | 25da |
14 | Imputation Type | simple |
15 | Iterative Imputation Iteration | None |
16 | Numeric Imputer | mean |
17 | Iterative Imputation Numeric Model | None |
18 | Categorical Imputer | mode |
19 | Iterative Imputation Categorical Model | None |
20 | Unknown Categoricals Handling | least_frequent |
21 | Normalize | False |
22 | Normalize Method | None |
23 | Transformation | False |
24 | Transformation Method | None |
25 | PCA | False |
26 | PCA Method | None |
27 | PCA Components | None |
28 | Ignore Low Variance | False |
29 | Combine Rare Levels | False |
30 | Rare Level Threshold | None |
31 | Numeric Binning | False |
32 | Remove Outliers | False |
33 | Outliers Threshold | None |
34 | Remove Multicollinearity | False |
35 | Multicollinearity Threshold | None |
36 | Clustering | False |
37 | Clustering Iteration | None |
38 | Polynomial Features | False |
39 | Polynomial Degree | None |
40 | Trignometry Features | False |
41 | Polynomial Threshold | None |
42 | Group Features | False |
43 | Feature Selection | False |
44 | Features Selection Threshold | None |
45 | Feature Interaction | False |
46 | Feature Ratio | False |
47 | Interaction Threshold | None |
# List the columns of the transformed feature matrix ('X') stored by the
# experiment. The output shows one-hot columns for DEPT_NAME / DIV_NAME and
# month, weekday, month-start and month-end indicators derived from TRANS_DT.
# NOTE(review): DIV_NAME values that differ only by case (e.g. 'TRANSPORTATION'
# vs 'Transportation') end up as separate dummy columns; consider normalising
# the labels before setup() if they denote the same division.
get_config('X').columns
Index(['FISCAL_YEAR', 'FISCAL_PERIOD', 'MERCHANT', 'MERCHANDISE_AMT', 'DEPT_NAME_DEPT OF EDUCATION', 'DIV_NAME_ADVISORY COUNCIL', 'DIV_NAME_Academic Support', 'DIV_NAME_Adult Education and Work Force', 'DIV_NAME_DEPARTMENT OF EDUCATION', 'DIV_NAME_DRIVER TRAINING', 'DIV_NAME_Department of Education', 'DIV_NAME_Driver Training', 'DIV_NAME_E Education Block Grants', 'DIV_NAME_E Transportation', 'DIV_NAME_EDUCATION BLOCK GRANTS', 'DIV_NAME_Education Block Grants', 'DIV_NAME_Educator Support', 'DIV_NAME_OTHER ITEMS', 'DIV_NAME_Office of Early Learning', 'DIV_NAME_Office of the Secretary', 'DIV_NAME_Operations Support', 'DIV_NAME_Other Items', 'DIV_NAME_Professional Standards Board', 'DIV_NAME_Public School Transportation', 'DIV_NAME_SPECIAL NEEDS PROGRAMS', 'DIV_NAME_Special Needs Programs', 'DIV_NAME_State Board of Education', 'DIV_NAME_Student Support', 'DIV_NAME_TRANSPORTATION', 'DIV_NAME_Transportation', 'TRANS_DT_month_1', 'TRANS_DT_month_10', 'TRANS_DT_month_11', 'TRANS_DT_month_12', 'TRANS_DT_month_2', 'TRANS_DT_month_3', 'TRANS_DT_month_4', 'TRANS_DT_month_5', 'TRANS_DT_month_6', 'TRANS_DT_month_7', 'TRANS_DT_month_8', 'TRANS_DT_month_9', 'TRANS_DT_weekday_0', 'TRANS_DT_weekday_1', 'TRANS_DT_weekday_2', 'TRANS_DT_weekday_3', 'TRANS_DT_weekday_4', 'TRANS_DT_weekday_5', 'TRANS_DT_weekday_6', 'TRANS_DT_is_month_end_0', 'TRANS_DT_is_month_end_1', 'TRANS_DT_is_month_start_0', 'TRANS_DT_is_month_start_1'], dtype='object')
# Train the model registered under the 'iforest' id (Isolation Forest in
# PyCaret's model library) on the prepared data.
iforest = create_model('iforest')
# Attach the model's results to the data: per the output below, the returned
# frame holds the original columns plus `Anomaly` and `Anomaly_Score`.
r = assign_model(iforest)
# Preview the first five scored rows.
r.head()
| | FISCAL_YEAR | FISCAL_PERIOD | DEPT_NAME | DIV_NAME | MERCHANT | CAT_DESCR | TRANS_DT | MERCHANDISE_AMT | Anomaly | Anomaly_Score |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | DOUBLETREE HOTELS WASHING | LODGING | 6/3/2014 | -5.50 | 0 | -0.081450 |
1 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 18.31 | 0 | -0.071574 |
2 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 21.61 | 0 | -0.071574 |
3 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 22.39 | 0 | -0.071574 |
4 | 2015 | 1 | DEPT OF EDUCATION | DEPARTMENT OF EDUCATION | HYATT REGENCY NW ORLE F/B | EATING PLACES RESTAURANTS | 6/26/2014 | 5.44 | 0 | -0.071310 |
# Plot the distribution of anomaly scores as a 100-bin histogram on a
# 10x6-inch figure, to see where the scores concentrate.
r['Anomaly_Score'].hist(bins=100, figsize=(10,6))
<AxesSubplot:>