import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from alibi_detect.cd import MMDDrift, FETDrift, CVMDrift, KSDrift
from sklearn.metrics import confusion_matrix
from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection.adwin import ADWIN
Load the data and use a bag-of-words representation to get feature values for each item: how often each word appears in the item name.
t1 = pd.read_excel('https://raw.githubusercontent.com/UNECE/ML_dataset/master/Stats%20Poland%20ECOICOP%20data.xlsx', sheet_name = 'English')
# Custom token pattern: multi-character words, plus percentage/decimal fragments written with a dot or comma
vectorizer = CountVectorizer(token_pattern=r'\w\w+|[1-9]\.[1-9]\%|[1-9]\,[1-9]\%|[1-9]\.[1-9]|[1-9]\,[1-9]|[1-9]\%')
vectorizer.fit(t1['produkt'])
X = pd.DataFrame(vectorizer.transform(t1['produkt']).toarray(), columns=vectorizer.get_feature_names_out())
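As a quick illustration (added here; the example string is made up), the token pattern keeps multi-character words and comma-style percentage fragments as single tokens:
import re
pattern = re.compile(r'\w\w+|[1-9]\.[1-9]\%|[1-9]\,[1-9]\%|[1-9]\.[1-9]|[1-9]\,[1-9]|[1-9]\%')
print(pattern.findall('Mleko 3,2% 1l'))  # ['Mleko', '3,2%', '1l']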
Use PCA to reduce dimensionality, then change some labels based on the PCA outputs to simulate label drift. (A method other than PCA would probably be better eventually.)
pca = PCA(n_components=2)
pca.fit(X)
X2 = pca.transform(X)
X2
array([[ 0.72556198,  0.06901139],
       [ 0.69524086,  0.14352036],
       [-0.24980595,  0.03262967],
       ...,
       [-0.26754292, -0.02725797],
       [-0.26748707, -0.02712377],
       [-0.26748707, -0.02712377]])
pca = PCA(n_components=5)
pca.fit(X)
X5 = pca.transform(X)
X5
array([[ 0.72556184,  0.06892948, -0.37163337, -0.02271674, -0.0702986 ],
       [ 0.69524059,  0.14204164, -0.30490279,  0.12051952, -0.04961939],
       [-0.24980628,  0.03201551, -0.06682947,  0.09084413, -0.07885592],
       ...,
       [-0.2675442 , -0.02849962, -0.0842153 ,  0.00583738,  0.00302388],
       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765],
       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765]])
pca = PCA(n_components=10)
pca.fit(X)
X10 = pca.transform(X)
X10
array([[ 0.72556183,  0.06902345, -0.37144639, ..., -0.07512212,
         0.22133783,  0.52939237],
       [ 0.69524052,  0.14197847, -0.30481788, ...,  0.3714309 ,
         1.19984569, -0.35706358],
       [-0.24980602,  0.03177583, -0.06657378, ...,  0.09136378,
         0.47933746, -0.07830631],
       ...,
       [-0.26754369, -0.02833311, -0.08380358, ..., -0.09032943,
        -0.08066121, -0.02543885],
       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,
        -0.08031969, -0.02511831],
       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,
        -0.08031969, -0.02511831]])
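To judge how many components are worth keeping, one option (a sketch, not in the original) is to inspect the cumulative explained variance of the most recent fit:
# pca currently holds the 10-component fit from above
print(pca.explained_variance_ratio_.cumsum())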
t1['kategoria']
0              Confectionery products
1          Fruit and vegetable juices
2        Artificial sugar substitutes
3          Jams, marmalades and honey
4          Jams, marmalades and honey
                     ...
17094        Mineral or spring waters
17095        Mineral or spring waters
17096        Mineral or spring waters
17097        Mineral or spring waters
17098        Mineral or spring waters
Name: kategoria, Length: 17099, dtype: object
labels = t1['kategoria'].to_numpy()
X10_labeled = np.column_stack((X10, labels))  # PCA features with the label appended as a final column
n = round(X10.shape[0]/100)*-1   # top 1% of items by first principal component
n2 = round(X10.shape[0]/10)*-1   # top 10% (unused below; the 5000-item cut is hardcoded instead)
temp = np.argpartition(X10[:, 0], n)[n:]  # indices of the top 1% of items
Y = t1['kategoria']
category = np.amin(Y[temp])  # alphabetically first category among the selected items
Y2 = Y.copy()
Y2[temp] = category  # relabel the selected items to simulate drift
Y2[temp]
3511     Baby food
16901    Baby food
10261    Baby food
132      Baby food
9621     Baby food
           ...
2908     Baby food
8726     Baby food
10258    Baby food
905      Baby food
809      Baby food
Name: kategoria, Length: 171, dtype: object
temp2 = np.argpartition(X10[:, 0], -5000)[-5000:]  # indices of the top 5000 items
category2 = np.amin(Y[temp2])  # computed but unused; the replacement label is hardcoded below
Y3 = Y.copy()
Y3[temp2] = "Sugar"  # relabel a much larger slice to simulate heavier drift
Y3[temp2]
8203     Sugar
10269    Sugar
16581    Sugar
3119     Sugar
4129     Sugar
         ...
2908     Sugar
8726     Sugar
10258    Sugar
905      Sugar
809      Sugar
Name: kategoria, Length: 5000, dtype: object
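As a sanity check (added here), the fraction of labels altered by each simulated drift:
print("Y2 changed:", (Y != Y2).mean())  # roughly 1% of items
print("Y3 changed:", (Y != Y3).mean())  # up to 5000/17099 of items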
Split the data into train and test sets, then train a classifier and predict on the test data to verify model performance.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.7, random_state=42)
clf = linear_model.SGDClassifier()
clf.fit(Xtrain, Ytrain)
#clf = MultinomialNB()
#clf.fit(Xtrain, Ytrain)
SGDClassifier()
train_predictions = clf.predict(Xtrain)
test_predictions = clf.predict(Xtest)
print("Model training accuracy: ", accuracy_score(Ytrain, train_predictions))
print("Model test accuracy: ", accuracy_score(Ytest, test_predictions))
Model training accuracy:  0.9778594702982706
Model test accuracy:  0.8947368421052632
clf.fit(X, Y)  # refit on the full dataset with the original labels
x_predictions = clf.predict(X)
print("Accuracy: ", accuracy_score(Y2, x_predictions))  # scored against Y2, the 1%-relabelled targets
Accuracy: 0.9599391777296918
print("Accuracy: ", accuracy_score(Y3, x_predictions))
Accuracy: 0.6873501374349377
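For comparison (an added check), scoring against the unmodified labels separates genuine model error from the simulated label drift:
print("Accuracy vs original labels: ", accuracy_score(Y, x_predictions))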
Create a multiclass confusion matrix, then calculate per-label metrics (accuracy, precision, recall, F1) from it for each individual label.
lbls = np.unique(Y3)
cm = confusion_matrix(Y3, x_predictions, labels = lbls)
cm
array([[ 38,   0,   0, ...,   0,   0,   0],
       [  0,  96,   0, ...,   0,   0,   0],
       [  0,   0,  53, ...,   0,   0,   0],
       ...,
       [ 10, 207,   9, ...,  88, 195, 155],
       [  0,   0,   0, ...,   0, 440,   0],
       [  0,   0,   0, ...,   0,   0, 321]], dtype=int64)
cmv = []
metrics = []
l = cm[0].size  # number of labels
for i in range(l):
    tp = cm[i, i]
    fp = sum(cm[:, i]) - tp   # predicted as label i, actually another label
    fn = sum(cm[i, :]) - tp   # actually label i, predicted as another label
    tn = Y3.size - tp - fp - fn
    cmv.append([tp, fp, fn, tn])
    acc = (tp + tn) / (tp + fp + fn + tn)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    f1 = 2 * ((prec * rec) / (prec + rec))
    metrics.append([acc, prec, rec, f1])
cmv = np.asarray(cmv)
metrics = np.asarray(metrics)
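The hand-rolled per-label metrics can be cross-checked against sklearn (a sketch added here; zero_division=0 avoids the NaNs the manual division produces for labels that are never predicted):
from sklearn.metrics import precision_recall_fscore_support
prec_sk, rec_sk, f1_sk, _ = precision_recall_fscore_support(Y3, x_predictions, labels=lbls, zero_division=0)
print(prec_sk[:5])
print(metrics[:5, 1])  # should agree except where a label is never predicted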
# Simulated metric stream: 5000 "healthy" values followed by 1000 degraded ones
sim1 = np.random.uniform(0.9, 1, 5000)
sim2 = np.random.uniform(0.1, 0.2, 1000)
t = np.concatenate((sim1, sim2), axis=0)
What follows is exploratory testing of several drift-detection methods and is quite messy; some parts would need more work to run cleanly.
ddm = DDM(min_num_instances=30, warning_level=2, out_control_level=3)
m = 0  # metric column to monitor below (0 = accuracy in the metrics array)
for i in range(6000):
    ddm.add_element(t[i])
    if ddm.detected_warning_zone():
        print('Warning zone has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))
    if ddm.detected_change():
        print('Change has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))
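DDM is normally fed a binary error stream (1 = misclassified); a minimal sketch of that usage against the drifted labels (the names error_stream and ddm2 are illustrative, not from the original):
error_stream = (x_predictions != Y3.to_numpy()).astype(int)
ddm2 = DDM(min_num_instances=30)
for i, e in enumerate(error_stream):
    ddm2.add_element(e)
    if ddm2.detected_change():
        print('Change detected at index: ' + str(i))
        break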
Y_pred = x_predictions
Ys = np.unique(x_predictions)
Ys = dict(zip(Ys, range(len(Ys))))   # label -> integer code
test = np.vectorize(Ys.get)(Y_pred)  # integer-coded prediction stream for the detectors below
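An equivalent, more idiomatic encoding (an alternative sketch) is pandas.factorize, though it numbers labels by order of appearance rather than alphabetically:
codes, uniques = pd.factorize(x_predictions)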
import random
from river import drift
rng = random.Random(12345)
adwin = drift.ADWIN()
# Simulate a data stream composed by two data distributions
data_stream = rng.choices([0, 1], k=1000)
# Increase the probability of 1's appearing in the next 1000 instances
data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])
# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    _ = adwin.update(val)
    if adwin.drift_detected:
        print(f"Change detected at index {i}, input value: {val}")
Change detected at index 1567, input value: 1
import random
from river import drift
rng = random.Random(42)
ddm = drift.DDM()  # newer river versions expose this as drift.binary.DDM
# Simulate a data stream where the first 1000 instances come from a uniform distribution
# of 1's and 0's
data_stream = rng.choices([0, 1], k=1000)
# Increase the probability of 1's appearing in the next 1000 instances
data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])
print_warning = True
# Update drift detector and verify if change is detected
# Note: the loop below runs on the integer-coded predictions (test), not on data_stream;
# DDM normally expects a binary error stream
for i, x in enumerate(test):
    _ = ddm.update(x)
    if ddm.warning_detected and print_warning:
        print(f"Warning detected at index {i}")
        print_warning = False
    if ddm.drift_detected:
        print(f"Change detected at index {i}")
        print_warning = True
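Since the simulated data_stream built above is never consumed by the loop, running the detector on it directly (a sketch; ddm_sim is an illustrative name) exercises the intended simulation:
ddm_sim = drift.DDM()
for i, x in enumerate(data_stream):
    _ = ddm_sim.update(x)
    if ddm_sim.drift_detected:
        print(f"Change detected at index {i}")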
adwin = ADWIN()
for i in range(l):
    adwin.add_element(metrics[i, m])
    if adwin.detected_change():
        print('Change detected in data: ' + str(metrics[i, m]) + ' - at index: ' + str(i))
from skmultiflow.drift_detection import PageHinkley
ph = PageHinkley()
for i in range(l):
    ph.add_element(metrics[i, m])
    if ph.detected_change():
        print('Change has been detected in data: ' + str(metrics[i, m]) + ' - of index: ' + str(i))
from skmultiflow.data.multilabel_generator import MultilabelGenerator
# Setting up the stream
stream = MultilabelGenerator(n_samples=100, n_features=20, n_targets=10, n_labels=10)
stream.next_sample()
(array([[3., 1., 3., 6., 1., 2., 1., 5., 2., 1., 2., 2., 3., 2., 4., 4., 4.,
         1., 3., 3.]]),
 array([[1, 1, 1, 1, 0, 0, 0, 1, 0, 1]]))
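next_sample also accepts a batch size (an added illustration):
Xb, Yb = stream.next_sample(10)
print(Xb.shape, Yb.shape)  # (10, 20) and (10, 10)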
lossRef = (x_predictions == Y.to_numpy()).astype(int)  # reference window: correct/incorrect vs original labels
loss = (x_predictions == Y3.to_numpy()).astype(int)    # test window: correct/incorrect vs drifted labels
fetDetective = FETDrift(lossRef, p_val=0.05, alternative='less')
losses = {'loss': loss}
label = ['No!', 'Yes!']
for name, lossArr in losses.items():
    print('\n%s' % name)
    preds = fetDetective.predict(lossArr)
    print('Drift? {}'.format(label[preds['data']['is_drift']]))
    print('p-value: {}'.format(preds['data']['p_val'][0]))
loss
Drift? Yes!
p-value: 0.0
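KSDrift is imported above but never used; a minimal sketch of feature-level drift detection on the PCA representation (the 5000-row split is an arbitrary illustration, not the drifted slice):
ks = KSDrift(X10[:5000].astype(np.float32), p_val=0.05)
print(ks.predict(X10[5000:].astype(np.float32))['data']['is_drift'])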
#import matplotlib.pyplot as plt
#from sklearn.metrics import ConfusionMatrixDisplay
#disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#disp.plot()
#plt.show()