Notebook

In [ ]:

import numpy as np
import sklearn
from sklearn.linear_model import Ridge

from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg


from sklearn.utils.extmath import safe_sparse_dot

In [2]:

import numpy as np
import sklearn
from sklearn.linear_model import Ridge

from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg


from sklearn.utils.extmath import safe_sparse_dot

In [ ]:

# For some reason, scikit-learn's Ridge does not rescale the data by
# sample_weight for the sparse_cg solver (no idea why), so we rescale
# the data ourselves before fitting.
# TODO: probably should test this rescaling.
def _rescale_data(X, y, sample_weight):
    """Rescale data so as to support sample_weight"""
    n_samples = X.shape[0]
    sample_weight = sample_weight * np.ones(n_samples)
    sample_weight = np.sqrt(sample_weight)
    sw_matrix = sparse.dia_matrix((sample_weight, 0),
                                  shape=(n_samples, n_samples))
    X = safe_sparse_dot(sw_matrix, X)
    y = safe_sparse_dot(sw_matrix, y)
    return X, y

In [3]:

# classifier w/ instance weights
# current:  Ridge Regression w/rescaled data and real labels
def classify(X, y, sample_weight, alpha):
    """Fit a ridge regressor on sample-weighted data and return the model.

    The weights are folded into the data with ``_rescale_data`` first; the
    model itself is a ``Ridge`` with regularisation strength ``alpha``, no
    intercept, and the 'sparse_cg' solver.
    """
    X_weighted, y_weighted = _rescale_data(X, y, sample_weight)
    model = Ridge(solver='sparse_cg', fit_intercept=False, alpha=alpha)
    model.fit(X_weighted, y_weighted)
    return model

In [ ]:

# select the highest confidence documents using our model
# X = X[unlabelled_ids]
#  classify all docs
#  select based on score  R(+) , (1-R) (-)
def select_high_confidence_results(X, R, classifier):
    """Select the ids of the highest-confidence documents (NOT IMPLEMENTED).

    This is still a stub: it ignores all of its arguments and always
    returns an empty list.

    Parameters
    ----------
    X : data to score (currently unused).
    R : fraction parameter from the design notes (currently unused).
    classifier : fitted model (currently unused).

    Returns
    -------
    list
        Always ``[]`` until the selection logic below is written.
    """
    high_c_ids = []

    # TODO: apply to all data

    # TODO: select the top R positive, bottom (1-R) negative scores

    return high_c_ids

In [ ]:

# self training step
#  apply classifier to documents w/labels + guessed_labels
#  add R fraction of [+] documents (and 1-R [-]) to the guessed_labels set
#  retrain, with guessed labels weighted down
def self_train_step(X, y, W, U, R, alpha, labeled_ids, guessed_ids, unlabeled_ids):
    """Fit the weighted classifier on the labeled plus currently guessed docs.

    Only the fitting half of the step is implemented; adding new
    high-confidence guesses and switching labels are still TODO, and the
    fitted model is not yet used or returned.

    Parameters
    ----------
    X, y : full data and labels, indexed by the id arrays below.
    W : total weight spread evenly over the labeled samples.
    U : total weight spread evenly over the guessed samples.
    R : fraction parameter (currently unused).
    alpha : regularisation strength passed to ``classify``.
    labeled_ids, guessed_ids : integer row ids (array or list; may be empty).
    unlabeled_ids : currently unused.

    Returns
    -------
    int
        Always 0 (placeholder).
    """
    # Accept plain lists too, including the empty list used before any
    # guesses exist; an integer dtype keeps empty inputs usable as indices.
    labeled_ids = np.asarray(labeled_ids, dtype=np.intp)
    guessed_ids = np.asarray(guessed_ids, dtype=np.intp)
    num_labeled = labeled_ids.shape[0]
    num_guessed = guessed_ids.shape[0]

    # apply classifier with sample weights W and U
    # to labeled docs and current guess labels
    current_labels = labeled_ids

    if num_guessed > 0:
        current_labels = np.union1d(labeled_ids, guessed_ids)

    X_current = X[current_labels]
    y_current = y[current_labels]

    # how do we set the sample_weights ?
    # create weights for all unlabeled, and then select?
    # wasteful but simple

    # does the guess get sample weight W or U ... I think U
    # notice: we normalize by num_guessed, not num_unlabelled
    #
    # One float weight per sample. Do not inherit y's dtype: integer labels
    # would truncate the fractional weights to 0.
    instance_weights = np.zeros(y.shape[0], dtype=float)
    # Skip empty groups so we never divide by zero.
    if num_labeled > 0:
        instance_weights[labeled_ids] = W / float(num_labeled)
    if num_guessed > 0:
        instance_weights[guessed_ids] = U / float(num_guessed)

    current_weights = instance_weights[current_labels]

    # TODO: current_model is not used yet; the steps below still need writing.
    current_model = classify(X_current, y_current, current_weights, alpha)

    # added the R/(1-R) high confidence (+)/(-) documents to the guessed set

    #
    #
    #

    # switch based on the current set of guesses total
    # so can switch out labels that were not present earlier

    #
    #
    #

    # switch

    return 0

In [3]:

# multi-switch algo 
#  switch [R (+)]/[1-R (-)] labels 
#  if they make the current fit better
#  
def switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R):
    """Multi-switch step (placeholder).

    The switching logic has not been written yet: every argument is
    ignored and the reported number of switched labels is always 0.
    """
    # TODO: implement the actual label switching; nothing is switched yet.
    switched_count = 0
    return switched_count

In [ ]:

# metric to decide if we switch the labels or not
# can we use the margin even for Regularized Least Squares?
# the regularizer is the same 
#
#  if Xw=y, then w=(X^-1)y
#  #=> we need the current version of the classifier, with weights set
# some function of the classifier
def switch_metric(classifier):
    """Metric used to decide whether to switch labels (NOT IMPLEMENTED).

    Placeholder: ignores ``classifier`` and always returns 0.
    """
    # TODO: compute some function of the fitted classifier.
    return 0

In [ ]:

# run the incremental self training algo
#
def self_train(X, y, labeled_ids, unlabeled_ids, R=0.5, U=1, W=0.001, alpha=1.0):
    """Run the incremental self-training algorithm (work in progress).

    Only a single step is performed so far (``istep`` is fixed at 1); the
    loop over increasing ``U_step`` values and the stopping rule are still
    TODO.

    Parameters
    ----------
    X, y : full data and labels.
    labeled_ids, unlabeled_ids : row ids of the labeled / unlabeled samples.
    R, U, W, alpha : forwarded to ``self_train_step`` and ``switch_labels``.

    Returns
    -------
    None
    """
    U_step_size = 0.001
    istep = 1
    # or, equivalently, num_steps = 1000
    #   U_step_size = 1/num_steps

    # run initial classifier
    # TODO: these slices are not used yet; the initial fit is not written.
    X_labeled = X[labeled_ids]
    y_labeled = y[labeled_ids]

    # No guesses yet. Use an empty integer array rather than a list so that
    # it has a .shape and can be used as an index.
    guessed_ids = np.empty(0, dtype=int)

    # loop over istep = start to finish
    #  U_step_size*U to U in increments U_step_size

    #  or: break at maximum U steps
    #  or: break at some convergence criteria?

    # set guessed_sample_weights
    #  U_step = (U_step_size*istep)*U
    U_step = (U_step_size*istep)*U

    # apply current classifier to remaining unlabeled data
    #  note:  U = U_step
    self_train_step(X, y, W, U_step, R, alpha, labeled_ids, guessed_ids, unlabeled_ids)

    # NOTE(review): this passes the full U, not U_step -- confirm which is intended.
    switch_labels(X, y, labeled_ids, guessed_ids, unlabeled_ids, alpha, W, U, R)

    # stop or keep switching / stop?
    #  just run all the way to the end?

In [ ]: