from __future__ import print_function

try:
    from sklearn.datasets import make_classification
except ImportError:
    import pip
    pip.main(['install', '--user', 'scikit-learn'])
    from sklearn.datasets import make_classification

import numpy as np

X, Y = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=2)

# adding some static offset to the data
X = X + 1

from modshogun import RealFeatures, MultilabelSOLabels, MultilabelModel


def create_features(X, constant):
    # append a constant column to the feature matrix; with constant = 1 the
    # corresponding weight acts as a learned bias/threshold, with constant = 0
    # it contributes nothing
    features = RealFeatures(np.c_[X, constant * np.ones(X.shape[0])].T)
    return features


def create_labels(Y, n_classes):
    try:
        n_samples = Y.shape[0]
    except AttributeError:
        n_samples = len(Y)
    labels = MultilabelSOLabels(n_samples, n_classes)
    for i, sparse_label in enumerate(Y):
        try:
            sparse_label = sorted(sparse_label)
        except TypeError:
            # a single integer label becomes a one-element label set
            sparse_label = [sparse_label]
        labels.set_sparse_label(i, np.array(sparse_label, dtype=np.int32))
    return labels


def split_data(X, Y, ratio):
    num_samples = X.shape[0]
    train_samples = int(ratio * num_samples)
    return (X[:train_samples], Y[:train_samples],
            X[train_samples:], Y[train_samples:])

X_train, Y_train, X_test, Y_test = split_data(X, Y, 0.9)

feats_0 = create_features(X_train, 0)
feats_1 = create_features(X_train, 1)
labels = create_labels(Y_train, 2)

model = MultilabelModel(feats_0, labels)
model_with_bias = MultilabelModel(feats_1, labels)

from modshogun import StochasticSOSVM, DualLibQPBMSOSVM, StructuredAccuracy, LabelsFactory
from time import time

sgd = StochasticSOSVM(model, labels)
sgd_with_bias = StochasticSOSVM(model_with_bias, labels)

start = time()
sgd.train()
print(">>> Time taken for SGD *without* threshold tuning = %f" % (time() - start))

start = time()
sgd_with_bias.train()
print(">>> Time taken for SGD *with* threshold tuning = %f" % (time() - start))


def evaluate_machine(machine, X_test, Y_test, n_classes, bias):
    if bias:
        feats_test = create_features(X_test, 1)
    else:
        feats_test = create_features(X_test, 0)
    test_labels = create_labels(Y_test, n_classes)

    out_labels = LabelsFactory.to_structured(machine.apply(feats_test))
    evaluator = StructuredAccuracy()
    jaccard_similarity_score = evaluator.evaluate(out_labels, test_labels)

    return jaccard_similarity_score

print(">>> Accuracy of SGD *without* threshold tuning = %f" %
      evaluate_machine(sgd, X_test, Y_test, 2, False))
print(">>> Accuracy of SGD *with* threshold tuning = %f" %
      evaluate_machine(sgd_with_bias, X_test, Y_test, 2, True))

import matplotlib.pyplot as plt
%matplotlib inline


def get_parameters(weights):
    # rewrite w[0]*x + w[1]*y + w[2] = 0 as y = m*x + c
    return -weights[0] / weights[1], -weights[2] / weights[1]


def scatter_plot(X, y):
    zeros_class = np.where(y == 0)
    ones_class = np.where(y == 1)
    plt.scatter(X[zeros_class, 0], X[zeros_class, 1], c='b', label="Negative Class")
    plt.scatter(X[ones_class, 0], X[ones_class, 1], c='r', label="Positive Class")


def plot_hyperplane(machine_0, machine_1, label_0, label_1, title, X, y):
    scatter_plot(X, y)
    x_min, x_max = np.min(X[:, 0]) - 0.5, np.max(X[:, 0]) + 0.5
    y_min, y_max = np.min(X[:, 1]) - 0.5, np.max(X[:, 1]) + 0.5

    xx = np.linspace(x_min, x_max, 1000)
    m_0, c_0 = get_parameters(machine_0.get_w())
    m_1, c_1 = get_parameters(machine_1.get_w())
    yy_0 = m_0 * xx + c_0
    yy_1 = m_1 * xx + c_1

    plt.plot(xx, yy_0, "k--", label=label_0)
    plt.plot(xx, yy_1, "g-", label=label_1)
    plt.xlim((x_min, x_max))
    plt.ylim((y_min, y_max))
    plt.grid()
    plt.legend(loc="best")
    plt.title(title)
    plt.show()

fig = plt.figure(figsize=(10, 10))
plot_hyperplane(sgd, sgd_with_bias,
                "Boundary for machine *without* bias for class 0",
                "Boundary for machine *with* bias for class 0",
                "Binary Classification using SO-SVM with/without threshold tuning",
                X, Y)
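# A minimal NumPy-only sketch of why the constant column appended by
# create_features() behaves like a threshold: with constant = 0 the last
# weight component is multiplied away, while with constant = 1 it becomes a
# free offset of the decision score. The weight vector and point below are
# made up purely for illustration.
w_example = np.array([0.8, -1.2, 0.5])       # hypothetical [w_x, w_y, w_threshold]
point = np.array([1.0, 2.0])

for constant in (0, 1):
    augmented = np.append(point, constant)   # same augmentation as create_features()
    score = np.dot(w_example, augmented)     # threshold only contributes when constant == 1
    print("constant=%d -> decision score=%.2f" % (constant, score))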
from modshogun import SparseMultilabel_obtain_from_generic


def plot_decision_plane(machine, title, X, y, bias):
    plt.figure(figsize=(24, 8))
    plt.suptitle(title)

    plt.subplot(1, 2, 1)
    x_min, x_max = np.min(X[:, 0]) - 0.5, np.max(X[:, 0]) + 0.5
    y_min, y_max = np.min(X[:, 1]) - 0.5, np.max(X[:, 1]) + 0.5

    xx = np.linspace(x_min, x_max, 200)
    yy = np.linspace(y_min, y_max, 200)
    x_mesh, y_mesh = np.meshgrid(xx, yy)

    if bias:
        feats = create_features(np.c_[x_mesh.ravel(), y_mesh.ravel()], 1)
    else:
        feats = create_features(np.c_[x_mesh.ravel(), y_mesh.ravel()], 0)
    out_labels = machine.apply(feats)

    z = []
    for i in range(out_labels.get_num_labels()):
        label = SparseMultilabel_obtain_from_generic(out_labels.get_label(i)).get_data()
        if label.shape[0] == 1:
            # predicted a single class
            z.append(label[0])
        elif label.shape[0] == 2:
            # predicted both classes
            z.append(2)
        elif label.shape[0] == 0:
            # predicted neither class
            z.append(3)

    z = np.array(z)
    z = z.reshape(x_mesh.shape)

    c = plt.pcolor(x_mesh, y_mesh, z, cmap=plt.cm.gist_heat)
    scatter_plot(X, y)
    plt.xlim((x_min, x_max))
    plt.ylim((y_min, y_max))
    plt.colorbar(c)
    plt.title("Decision Surface")
    plt.legend(loc="best")

    plt.subplot(1, 2, 2)
    weights = machine.get_w()
    m_0, c_0 = get_parameters(weights[:3])
    m_1, c_1 = get_parameters(weights[3:])
    yy_0 = m_0 * xx + c_0
    yy_1 = m_1 * xx + c_1

    plt.plot(xx, yy_0, "r--", label="Boundary for class 0")
    plt.plot(xx, yy_1, "g-", label="Boundary for class 1")
    plt.title("Hyperplanes for different classes")
    plt.legend(loc="best")
    plt.xlim((x_min, x_max))
    plt.ylim((y_min, y_max))
    plt.show()

plot_decision_plane(sgd, "Model *without* Threshold Tuning", X, Y, False)
plot_decision_plane(sgd_with_bias, "Model *with* Threshold Tuning", X, Y, True)


def load_data(file_name):
    # parse a multilabel dataset in SVM-light-like format:
    # "label[,label...] index:value index:value ..."
    with open(file_name) as input_file:
        lines = input_file.readlines()

    n_samples = len(lines)
    n_features = len(lines[0].split()) - 1

    Y = []
    X = []
    for line in lines:
        data = line.split()
        Y.append(list(map(int, data[0].split(","))))
        feats = []
        for feat in data[1:]:
            feats.append(float(feat.split(":")[1]))
        X.append(feats)

    X = np.array(X)
    n_classes = max(max(label) for label in Y) + 1

    return X, Y, n_samples, n_features, n_classes


def test_multilabel_data(train_file, test_file):
    X_train, Y_train, n_samples, n_features, n_classes = load_data(train_file)
    X_test, Y_test, n_samples, n_features, n_classes = load_data(test_file)

    # create features and labels
    multilabel_feats_0 = create_features(X_train, 0)
    multilabel_feats_1 = create_features(X_train, 1)
    multilabel_labels = create_labels(Y_train, n_classes)

    # create multilabel models
    multilabel_model = MultilabelModel(multilabel_feats_0, multilabel_labels)
    multilabel_model_with_bias = MultilabelModel(multilabel_feats_1, multilabel_labels)

    # initialize machines for SO-learning
    multilabel_sgd = StochasticSOSVM(multilabel_model, multilabel_labels)
    multilabel_sgd_with_bias = StochasticSOSVM(multilabel_model_with_bias, multilabel_labels)

    start = time()
    multilabel_sgd.train()
    t1 = time() - start

    multilabel_sgd_with_bias.train()
    t2 = time() - start - t1

    return (evaluate_machine(multilabel_sgd, X_test, Y_test, n_classes, False),
            t1,
            evaluate_machine(multilabel_sgd_with_bias, X_test, Y_test, n_classes, True),
            t2)
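# A small self-contained check of the file format load_data() expects: each
# line starts with a comma-separated list of class indices, followed by
# index:value feature pairs. The two sample lines below are made up for
# illustration and are not rows from the yeast/scene datasets.
import tempfile

_sample = "0,2 1:0.5 2:1.25 3:-0.75\n1 1:0.1 2:0.0 3:2.5\n"
with tempfile.NamedTemporaryFile(mode="w", suffix=".svm", delete=False) as _tmp:
    _tmp.write(_sample)

_X, _Y, _n_samples, _n_features, _n_classes = load_data(_tmp.name)
print(_X.shape)       # (2, 3) feature matrix
print(_Y)             # [[0, 2], [1]] label sets
print(_n_classes)     # 3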
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import jaccard_similarity_score
from sklearn.preprocessing import LabelBinarizer


def sklearn_implementation(train_file, test_file):
    label_binarizer = LabelBinarizer()
    X_train, Y_train, n_samples, n_features, n_classes = load_data(train_file)
    X_test, Y_test, n_samples, n_features, n_classes = load_data(test_file)

    clf = OneVsRestClassifier(SVC(kernel='linear'))
    start = time()
    clf.fit(X_train, label_binarizer.fit_transform(Y_train))
    t1 = time() - start

    # transform (not fit_transform) the test labels so they use the class
    # ordering learned from the training labels
    return (jaccard_similarity_score(label_binarizer.transform(Y_test),
                                     clf.predict(X_test)),
            t1)


def print_table(train_file, test_file, caption):
    acc_0, t1, acc_1, t2 = test_multilabel_data(train_file, test_file)
    sk_acc, sk_t1 = sklearn_implementation(train_file, test_file)

    result = '''
    \t\t%s
    Machine\t\t\t\tAccuracy\tTrain-time\n
    SGD *without* threshold tuning \t%f \t%f
    SGD *with* threshold tuning \t%f \t%f
    scikit-learn's implementation \t%f \t%f
    ''' % (caption, acc_0, t1, acc_1, t2, sk_acc, sk_t1)
    print(result)

print_table("../../../data/multilabel/yeast_train.svm",
            "../../../data/multilabel/yeast_test.svm",
            "Yeast dataset")

print_table("../../../data/multilabel/scene_train",
            "../../../data/multilabel/scene_test",
            "Scene dataset")
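# The accuracy values above are stored in a variable named
# jaccard_similarity_score and compared against scikit-learn's metric of the
# same name, which suggests a per-sample Jaccard score averaged over samples:
# |predicted intersect true| / |predicted union true|. The helper below
# (jaccard_example, a made-up name) and its label sets are illustrative only.
def jaccard_example(true_sets, predicted_sets):
    scores = []
    for true, pred in zip(true_sets, predicted_sets):
        true, pred = set(true), set(pred)
        union = true | pred
        scores.append(len(true & pred) / float(len(union)) if union else 1.0)
    return sum(scores) / len(scores)

print(jaccard_example([[0, 2], [1]], [[0], [1, 2]]))  # (1/2 + 1/2) / 2 = 0.5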