import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from alibi_detect.cd import MMDDrift, FETDrift, CVMDrift, KSDrift
from sklearn.metrics import confusion_matrix
from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection.adwin import ADWIN
Load the data and use a bag-of-words representation to get feature values for each item: how often each word appears in the item name.
t1 = pd.read_excel('https://raw.githubusercontent.com/UNECE/ML_dataset/master/Stats%20Poland%20ECOICOP%20data.xlsx', sheet_name = 'English')
# Custom token pattern: multi-character words, plus percentage/decimal fragments written with a dot or comma
vectorizer = CountVectorizer(token_pattern=r'\w\w+|[1-9]\.[1-9]\%|[1-9]\,[1-9]\%|[1-9]\.[1-9]|[1-9]\,[1-9]|[1-9]\%')
vectorizer.fit(t1['produkt'])
X = pd.DataFrame(vectorizer.transform(t1['produkt']).toarray(), columns=vectorizer.get_feature_names_out())
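As a quick illustration (added here; the example string is made up), the token pattern keeps multi-character words and comma-style percentage fragments as single tokens:
import re
pattern = re.compile(r'\w\w+|[1-9]\.[1-9]\%|[1-9]\,[1-9]\%|[1-9]\.[1-9]|[1-9]\,[1-9]|[1-9]\%')
print(pattern.findall('Mleko 3,2% 1l'))  # ['Mleko', '3,2%', '1l']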
Use PCA to reduce dimensionality, then change some labels based on the PCA outputs to simulate label drift. (A method other than PCA would probably be better eventually.)
pca = PCA(n_components=2)
pca.fit(X)
X2 = pca.transform(X)
X2
array([[ 0.72556198,  0.06901139],
       [ 0.69524086,  0.14352036],
       [-0.24980595,  0.03262967],
       ...,
       [-0.26754292, -0.02725797],
       [-0.26748707, -0.02712377],
       [-0.26748707, -0.02712377]])
pca = PCA(n_components=5)
pca.fit(X)
X5 = pca.transform(X)
X5
array([[ 0.72556184,  0.06892948, -0.37163337, -0.02271674, -0.0702986 ],
       [ 0.69524059,  0.14204164, -0.30490279,  0.12051952, -0.04961939],
       [-0.24980628,  0.03201551, -0.06682947,  0.09084413, -0.07885592],
       ...,
       [-0.2675442 , -0.02849962, -0.0842153 ,  0.00583738,  0.00302388],
       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765],
       [-0.26748833, -0.02835428, -0.08371878,  0.00587023,  0.00299765]])
pca = PCA(n_components=10)
pca.fit(X)
X10 = pca.transform(X)
X10
array([[ 0.72556183,  0.06902345, -0.37144639, ..., -0.07512212,
         0.22133783,  0.52939237],
       [ 0.69524052,  0.14197847, -0.30481788, ...,  0.3714309 ,
         1.19984569, -0.35706358],
       [-0.24980602,  0.03177583, -0.06657378, ...,  0.09136378,
         0.47933746, -0.07830631],
       ...,
       [-0.26754369, -0.02833311, -0.08380358, ..., -0.09032943,
        -0.08066121, -0.02543885],
       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,
        -0.08031969, -0.02511831],
       [-0.26748782, -0.0281893 , -0.08331103, ..., -0.08996202,
        -0.08031969, -0.02511831]])
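To judge how many components are worth keeping, one option (a sketch, not in the original) is to inspect the cumulative explained variance of the most recent fit:
# pca currently holds the 10-component fit from above
print(pca.explained_variance_ratio_.cumsum())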
t1['kategoria']
0              Confectionery products
1          Fruit and vegetable juices
2        Artificial sugar substitutes
3          Jams, marmalades and honey
4          Jams, marmalades and honey
                     ...
17094        Mineral or spring waters
17095        Mineral or spring waters
17096        Mineral or spring waters
17097        Mineral or spring waters
17098        Mineral or spring waters
Name: kategoria, Length: 17099, dtype: object
labels = t1['kategoria'].to_numpy()
X10_labeled = np.column_stack((X10, labels))  # PCA features with the label appended as a final column
n = round(X10.shape[0]/100)*-1   # top 1% of items by first principal component
n2 = round(X10.shape[0]/10)*-1   # top 10% (unused below; the 5000-item cut is hardcoded instead)
temp = np.argpartition(X10[:, 0], n)[n:]  # indices of the top 1% of items
Y = t1['kategoria']
category = np.amin(Y[temp])  # alphabetically first category among the selected items
Y2 = Y.copy()
Y2[temp] = category  # relabel the selected items to simulate drift
Y2[temp]
3511     Baby food
16901    Baby food
10261    Baby food
132      Baby food
9621     Baby food
           ...
2908     Baby food
8726     Baby food
10258    Baby food
905      Baby food
809      Baby food
Name: kategoria, Length: 171, dtype: object
temp2 = np.argpartition(X10[:, 0], -5000)[-5000:]  # indices of the top 5000 items
category2 = np.amin(Y[temp2])  # computed but unused; the replacement label is hardcoded below
Y3 = Y.copy()
Y3[temp2] = "Sugar"  # relabel a much larger slice to simulate heavier drift
Y3[temp2]
8203     Sugar
10269    Sugar
16581    Sugar
3119     Sugar
4129     Sugar
         ...
2908     Sugar
8726     Sugar
10258    Sugar
905      Sugar
809      Sugar
Name: kategoria, Length: 5000, dtype: object
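As a sanity check (added here), the fraction of labels altered by each simulated drift:
print("Y2 changed:", (Y != Y2).mean())  # roughly 1% of items
print("Y3 changed:", (Y != Y3).mean())  # up to 5000/17099 of items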
Split the data into train and test sets, then train a classifier and predict on the test data to verify model performance.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=0.7, random_state=42)
clf = linear_model.SGDClassifier()
clf.fit(Xtrain, Ytrain)
#clf = MultinomialNB()
#clf.fit(Xtrain, Ytrain)
SGDClassifier()
train_predictions = clf.predict(Xtrain)
test_predictions = clf.predict(Xtest)
print("Model training accuracy: ", accuracy_score(Ytrain, train_predictions))
print("Model test accuracy: ", accuracy_score(Ytest, test_predictions))
Model training accuracy:  0.9778594702982706
Model test accuracy:  0.8947368421052632
clf.fit(X, Y)  # refit on the full dataset with the original labels
x_predictions = clf.predict(X)
print("Accuracy: ", accuracy_score(Y2, x_predictions))  # scored against Y2, the 1%-relabelled targets
Accuracy: 0.9599391777296918
print("Accuracy: ", accuracy_score(Y3, x_predictions))
Accuracy: 0.6873501374349377
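For comparison (an added check), scoring against the unmodified labels separates genuine model error from the simulated label drift:
print("Accuracy vs original labels: ", accuracy_score(Y, x_predictions))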
Create a multiclass confusion matrix, then calculate per-label metrics (accuracy, precision, recall, F1) from it for each individual label.
lbls = np.unique(Y3)
cm = confusion_matrix(Y3, x_predictions, labels = lbls)
cm
array([[ 38,   0,   0, ...,   0,   0,   0],
       [  0,  96,   0, ...,   0,   0,   0],
       [  0,   0,  53, ...,   0,   0,   0],
       ...,
       [ 10, 207,   9, ...,  88, 195, 155],
       [  0,   0,   0, ...,   0, 440,   0],
       [  0,   0,   0, ...,   0,   0, 321]], dtype=int64)
cmv = []
metrics = []
l = cm[0].size  # number of labels
for i in range(l):
    tp = cm[i, i]
    fp = sum(cm[:, i]) - tp   # predicted as label i, actually another label
    fn = sum(cm[i, :]) - tp   # actually label i, predicted as another label
    tn = Y3.size - tp - fp - fn
    cmv.append([tp, fp, fn, tn])
    acc = (tp + tn) / (tp + fp + fn + tn)
    prec = tp / (tp + fp)
    rec = tp / (tp + fn)
    f1 = 2 * ((prec * rec) / (prec + rec))
    metrics.append([acc, prec, rec, f1])
cmv = np.asarray(cmv)
metrics = np.asarray(metrics)
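The hand-rolled per-label metrics can be cross-checked against sklearn (a sketch added here; zero_division=0 avoids the NaNs the manual division produces for labels that are never predicted):
from sklearn.metrics import precision_recall_fscore_support
prec_sk, rec_sk, f1_sk, _ = precision_recall_fscore_support(Y3, x_predictions, labels=lbls, zero_division=0)
print(prec_sk[:5])
print(metrics[:5, 1])  # should agree except where a label is never predicted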
# Simulated metric stream: 5000 "healthy" values followed by 1000 degraded ones
sim1 = np.random.uniform(0.9, 1, 5000)
sim2 = np.random.uniform(0.1, 0.2, 1000)
t = np.concatenate((sim1, sim2), axis=0)
What follows is exploratory testing of several drift-detection methods and is quite messy; some parts would need more work to run cleanly.
ddm = DDM(min_num_instances=30, warning_level=2, out_control_level=3)
m = 0  # metric column to monitor below (0 = accuracy in the metrics array)
for i in range(6000):
    ddm.add_element(t[i])
    if ddm.detected_warning_zone():
        print('Warning zone has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))
    if ddm.detected_change():
        print('Change has been detected in data: ' + str(t[i]) + ' - of index: ' + str(i))
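DDM is normally fed a binary error stream (1 = misclassified); a minimal sketch of that usage against the drifted labels (the names error_stream and ddm2 are illustrative, not from the original):
error_stream = (x_predictions != Y3.to_numpy()).astype(int)
ddm2 = DDM(min_num_instances=30)
for i, e in enumerate(error_stream):
    ddm2.add_element(e)
    if ddm2.detected_change():
        print('Change detected at index: ' + str(i))
        break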
Y_pred = x_predictions
Ys = np.unique(x_predictions)
Ys = dict(zip(Ys, range(len(Ys))))   # label -> integer code
test = np.vectorize(Ys.get)(Y_pred)  # integer-coded prediction stream for the detectors below
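An equivalent, more idiomatic encoding (an alternative sketch) is pandas.factorize, though it numbers labels by order of appearance rather than alphabetically:
codes, uniques = pd.factorize(x_predictions)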
import random
from river import drift
rng = random.Random(12345)
adwin = drift.ADWIN()
# Simulate a data stream composed by two data distributions
data_stream = rng.choices([0, 1], k=1000)
# Increase the probability of 1's appearing in the next 1000 instances
data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])
# Update drift detector and verify if change is detected
for i, val in enumerate(data_stream):
    _ = adwin.update(val)
    if adwin.drift_detected:
        print(f"Change detected at index {i}, input value: {val}")
Change detected at index 1567, input value: 1
import random
from river import drift
rng = random.Random(42)
ddm = drift.DDM()  # newer river versions expose this as drift.binary.DDM
# Simulate a data stream where the first 1000 instances come from a uniform distribution
# of 1's and 0's
data_stream = rng.choices([0, 1], k=1000)
# Increase the probability of 1's appearing in the next 1000 instances
data_stream = data_stream + rng.choices([0, 1], k=1000, weights=[0.3, 0.7])
print_warning = True
# Update drift detector and verify if change is detected
# Note: the loop below runs on the integer-coded predictions (test), not on data_stream;
# DDM normally expects a binary error stream
for i, x in enumerate(test):
    _ = ddm.update(x)
    if ddm.warning_detected and print_warning:
        print(f"Warning detected at index {i}")
        print_warning = False
    if ddm.drift_detected:
        print(f"Change detected at index {i}")
        print_warning = True
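Since the simulated data_stream built above is never consumed by the loop, running the detector on it directly (a sketch; ddm_sim is an illustrative name) exercises the intended simulation:
ddm_sim = drift.DDM()
for i, x in enumerate(data_stream):
    _ = ddm_sim.update(x)
    if ddm_sim.drift_detected:
        print(f"Change detected at index {i}")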
adwin = ADWIN()
for i in range(l):
    adwin.add_element(metrics[i, m])
    if adwin.detected_change():
        print('Change detected in data: ' + str(metrics[i, m]) + ' - at index: ' + str(i))
from skmultiflow.drift_detection import PageHinkley
ph = PageHinkley()
for i in range(l):
    ph.add_element(metrics[i, m])
    if ph.detected_change():
        print('Change has been detected in data: ' + str(metrics[i, m]) + ' - of index: ' + str(i))
from skmultiflow.data.multilabel_generator import MultilabelGenerator
# Setting up the stream
stream = MultilabelGenerator(n_samples=100, n_features=20, n_targets=10, n_labels=10)
stream.next_sample()
(array([[3., 1., 3., 6., 1., 2., 1., 5., 2., 1., 2., 2., 3., 2., 4., 4., 4.,
         1., 3., 3.]]),
 array([[1, 1, 1, 1, 0, 0, 0, 1, 0, 1]]))
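next_sample also accepts a batch size (an added illustration):
Xb, Yb = stream.next_sample(10)
print(Xb.shape, Yb.shape)  # (10, 20) and (10, 10)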
lossRef = (x_predictions == Y.to_numpy()).astype(int)  # reference window: correct/incorrect vs original labels
loss = (x_predictions == Y3.to_numpy()).astype(int)    # test window: correct/incorrect vs drifted labels
fetDetective = FETDrift(lossRef, p_val=0.05, alternative='less')
losses = {'loss': loss}
label = ['No!', 'Yes!']
for name, lossArr in losses.items():
    print('\n%s' % name)
    preds = fetDetective.predict(lossArr)
    print('Drift? {}'.format(label[preds['data']['is_drift']]))
    print('p-value: {}'.format(preds['data']['p_val'][0]))
loss
Drift? Yes!
p-value: 0.0
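KSDrift is imported above but never used; a minimal sketch of feature-level drift detection on the PCA representation (the 5000-row split is an arbitrary illustration, not the drifted slice):
ks = KSDrift(X10[:5000].astype(np.float32), p_val=0.05)
print(ks.predict(X10[5000:].astype(np.float32))['data']['is_drift'])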
#import matplotlib.pyplot as plt
#from sklearn.metrics import ConfusionMatrixDisplay
#disp = ConfusionMatrixDisplay(confusion_matrix=cm)
#disp.plot()
#plt.show()