import numpy as np
import sklearn
from sklearn.linear_model import Ridge
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg
from sklearn.utils.extmath import safe_sparse_dot
import numpy as np
import sklearn
from sklearn.linear_model import Ridge
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg
from sklearn.utils.extmath import safe_sparse_dot
# For some reason, scikit-learn's Ridge does not allow rescaled
# (sample-weighted) data with the sparse_cg solver -- no idea why.
# TODO: this should probably be tested (the original note breaks off here).
def _rescale_data(X, y, sample_weight):
    """Scale the rows of X and y by the square root of sample_weight.

    sample_weight is multiplied against a ones-vector of length
    X.shape[0], so it may be a scalar or a per-sample array.  Because each
    row is scaled by sqrt(weight), a squared-error fit on the returned
    pair weights every sample's squared residual by its weight.

    Returns the rescaled (X, y) pair.
    """
    row_count = X.shape[0]
    # Broadcast a scalar weight to one entry per sample, then take the root.
    root_weights = np.sqrt(sample_weight * np.ones(row_count))
    # Sparse diagonal matrix: left-multiplying by it scales row i by
    # root_weights[i].
    scaling = sparse.dia_matrix((root_weights, 0),
                                shape=(row_count, row_count))
    return safe_sparse_dot(scaling, X), safe_sparse_dot(scaling, y)
# classifier with instance weights
# current: Ridge Regression w/rescaled data and real labels
def classify(X, y, sample_weight, alpha):
    """Fit a ridge regressor on sample-weighted data and return it.

    The weights are applied by rescaling X and y with _rescale_data before
    fitting, rather than being handed to the estimator.  The model is
    fitted without an intercept, using the 'sparse_cg' solver and the
    given regularisation strength alpha.
    """
    X_scaled, y_scaled = _rescale_data(X, y, sample_weight)
    model = Ridge(alpha=alpha, fit_intercept=False, solver='sparse_cg')
    model.fit(X_scaled, y_scaled)
    return model
# select the highest confidence documents using our model
# X = X[unlabelled_ids]
# classify all docs
# select based on score R(+) , (1-R) (-)
def select_high_confidence_results(X, R, classifier):
    """Return the ids of the documents the model is most confident about.

    NOTE: the selection logic is not implemented yet; an empty list is
    always returned and none of the arguments are read.

    Intended behaviour (from the original notes): apply the classifier to
    all data, then select the top R positive and the bottom (1 - R)
    negative scores.
    """
    high_c_ids = []
    # TODO: apply to all data
    # TODO: select the top R positive, bottom (1-R) negative scores
    return high_c_ids
# self training step
# apply classifier to documents w/labels + guessed_labels
# add R fraction of [+] documents (and 1-R [-]) guessed_labels set
# retrain, with the guessed labels weighted down
def self_train_step(X, y, W, U, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):
    """Fit the weighted model on the labeled plus currently guessed documents.

    Parameters
    ----------
    X, y : indexable by an integer id array (rows of X, entries of y).
    W : total weight spread evenly over the labeled documents.
    U : total weight spread evenly over the guessed documents.
    R, unlabeled_ids : accepted for the planned selection/switching steps;
        not read yet.
    alpha : regularisation strength forwarded to classify.
    labeled_ids, guessed_ids : integer ids (array or list); guessed_ids
        may be empty.

    Returns
    -------
    int
        Always 0 for now; the fitted model is not returned yet.
    """
    # Accept lists as well as arrays.  The explicit integer dtype matters for
    # the empty case: np.asarray([]) would be a float array, which cannot be
    # used as an index.
    labeled_ids = np.asarray(labeled_ids, dtype=np.intp)
    guessed_ids = np.asarray(guessed_ids, dtype=np.intp)

    # apply classifier with sample weights W and U
    # to labeled docs and current guess labels
    current_labels = labeled_ids
    if guessed_ids.shape[0] > 0:
        current_labels = np.union1d(labeled_ids, guessed_ids)
    X_current = X[current_labels]
    y_current = y[current_labels]

    # how do we set the sample_weights ?
    # create weights for all unlabeled, and then select?
    # wasteful but simple
    # does the guess get sample weight W or U ... I think U
    # notice: we normalize by num_guessed, not num_unlabelled
    #
    # Always a float array: inheriting y's dtype would truncate the
    # fractional weights to 0 when y holds integer labels.  Zero-filled so
    # ids in neither set never carry uninitialised values.
    instance_weights = np.zeros_like(y, dtype=float)
    # Guard each normalisation: an empty id set (e.g. no guesses on the
    # first step) would otherwise divide by zero.
    if labeled_ids.shape[0] > 0:
        instance_weights[labeled_ids] = W / float(labeled_ids.shape[0])
    if guessed_ids.shape[0] > 0:
        instance_weights[guessed_ids] = U / float(guessed_ids.shape[0])
    current_weights = instance_weights[current_labels]

    # Kept in a local for the steps below, which are not implemented yet.
    current_model = classify(X_current, y_current, current_weights, alpha)

    # TODO: add the R/(1-R) high confidence (+)/(-) documents to the
    # guessed set
    #
    # TODO: switch based on the current set of guesses total
    # so can switch out labels that were not present earlier
    #
    # TODO: switch
    return 0
# multi-switch algo
# switch [R (+)]/[1-R (-)] labels
# if they make the current fit better
#
def switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R):
    """Placeholder for the label-switching pass; reports zero switches.

    No argument is read and nothing is modified yet.  Returns the number
    of labels switched, which is currently always 0.
    """
    return 0
# metric to decide if we switch the labels or not
# can we use the margin even for Regularized Least Squares?
# the regularizer is the same
#
# if Xw=y, then w=(X^-1)y
# #=> we need the current version of the classifier, with weights set
# some function of the classifier
def switch_metric(classifier):
    """Score used to decide whether labels should be switched.

    NOTE: not implemented yet; the classifier is not inspected and 0 is
    always returned.
    """
    return 0
# run the incremental self training algo
#
def self_train(X, y, labeled_ids, unlabeled_ids, R=0.5, U=1, W=0.001, alpha=1.0):
    """Run the incremental self-training procedure (a single step for now).

    Parameters
    ----------
    X, y : data and labels, forwarded to self_train_step / switch_labels.
    labeled_ids, unlabeled_ids : ids of the labeled / unlabeled documents.
    R : forwarded unchanged to both steps.
    U : full weight for guessed labels; the training step receives it
        scaled by U_step_size * istep.
    W : weight for the labeled documents.
    alpha : regularisation strength.

    Returns
    -------
    None
        The loop over steps and any stopping criterion are not
        implemented yet, and nothing is returned.
    """
    U_step_size = 0.001
    istep = 1
    # or, equivalently, num_steps = 1000
    # U_step_size = 1/num_steps

    # TODO: run initial classifier on X[labeled_ids], y[labeled_ids]

    # No guesses yet.  An integer array rather than a list, so that it has
    # a .shape and can be used directly as an index.
    guessed_ids = np.empty(0, dtype=np.intp)

    # TODO: loop over istep = start to finish
    # U_step_size*U to U in increments U_step_size
    # or: break at maximum U steps
    # or: break at some convergence criteria?

    # set guessed_sample_weights
    U_step = (U_step_size * istep) * U

    # apply current classifier to remaining unlabeled data
    # note: U = U_step
    self_train_step(X, y, W, U_step, R, alpha, labeled_ids, guessed_ids, unlabeled_ids)

    # NOTE(review): this receives the full U rather than U_step -- confirm
    # which one is intended.
    switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R)

    # stop or keep switching / stop?
    # just run all the way to the end?