import numpy
import decimal
import scipy.sparse
import scipy.linalg
import scipy.stats
import os
import os.path
import sys
import itertools
import sklearn.model_selection
import sklearn.tree
import sklearn.linear_model
import sklearn.ensemble
from sklearn.externals import joblib

#Trailing slash matters: paths below are built by plain string concatenation
DATA_DIR="/content/"


def get_feature_sets(datasetName):
    anchorURL=[0]
    bodyDoc=[0]
    if datasetName.startswith('MSLR'):
        for i in range(25):
            anchorURL.extend([5*i+2, 5*i+4])
            bodyDoc.extend([5*i+1, 5*i+3, 5*i+5])
        anchorURL.extend([126, 127, 128, 129, 130, 131])
        bodyDoc.extend([132, 133])
    elif datasetName.startswith('MQ200'):
        for i in range(8):
            anchorURL.extend([5*i+2, 5*i+4])
            bodyDoc.extend([5*i+1, 5*i+3, 5*i+5])
        anchorURL.extend([41, 42, 43, 44, 45, 46])
    else:
        print("Settings:get_feature_sets [ERR] Unknown dataset. Use MSLR/MQ200*", flush=True)
        sys.exit(0)
    return anchorURL, bodyDoc


class Datasets:
    def __init__(self):
        #Must call either loadTxt(...)/loadNpz(...) to set all these members
        #before using Datasets objects elsewhere
        self.relevances=None
        self.features=None
        self.docsPerQuery=None
        self.queryMappings=None
        self.name=None
        #For filtered datasets, some docsPerQuery may be masked
        self.mask=None

    ###As a side-effect, loadTxt(...) stores a npz file for
    ###faster subsequent loading via loadNpz(...)
    #file_name: (str) Path to dataset file (.txt format)
    #name: (str) String to identify this Datasets object henceforth
    def loadTxt(self, file_name, name):
        #Internal: Counters to keep track of docID and qID
        previousQueryID=None
        docID=None
        qID=0
        relevanceArray=None

        #QueryMappings: list[int],length=numQueries
        self.queryMappings=[]
        self.name=name
        #DocsPerQuery: list[int],length=numQueries
        self.docsPerQuery=[]
        #Relevances: list[Alpha],length=numQueries; Alpha:= numpy.array[int],length=docsForQuery
        self.relevances=[]
        #Features: list[Alpha],length=numQueries;
        #Alpha:= scipy.sparse.coo_matrix[double],shape=(docsForQuery, numFeatures)
        featureRows=None
        featureCols=None
        featureVals=None
        self.features=[]
        numFeatures=None

        #Now read in data
        with open(file_name, 'r') as f:
            outputFilename=file_name[:-4]
            outputFileDir=outputFilename+'_processed'
            if not os.path.exists(outputFileDir):
                os.makedirs(outputFileDir)

            for line in f:
                tokens=line.split(' ', 2)
                relevance=int(tokens[0])
                queryID=int(tokens[1].split(':', 1)[1])
                #Remove any trailing comments before extracting features
                remainder=tokens[2].split('#', 1)
                featureTokens=remainder[0].strip().split(' ')
                if numFeatures is None:
                    numFeatures=len(featureTokens)+1

                if (previousQueryID is None) or (queryID!=previousQueryID):
                    #Begin processing a new query's documents
                    docID=0
                    if relevanceArray is not None:
                        #Previous query's data should be persisted to file/self.members
                        currentRelevances=numpy.array(relevanceArray, dtype=int, copy=False)
                        self.relevances.append(currentRelevances)
                        numpy.savez_compressed(os.path.join(outputFileDir, str(qID)+'_rel'),
                                relevances=currentRelevances)

                        maxDocs=len(relevanceArray)
                        self.docsPerQuery.append(maxDocs)
                        currentFeatures=scipy.sparse.coo_matrix((featureVals, (featureRows, featureCols)),
                                shape=(maxDocs, numFeatures), dtype=numpy.float64)
                        currentFeatures=currentFeatures.tocsr()
                        self.features.append(currentFeatures)
                        scipy.sparse.save_npz(os.path.join(outputFileDir, str(qID)+'_feat'), currentFeatures)

                        qID+=1
                        self.queryMappings.append(previousQueryID)
                        if len(self.docsPerQuery)%100==0:
                            print(".", end="", flush=True)

                    relevanceArray=[]
                    featureRows=[]
                    featureCols=[]
                    featureVals=[]
                    previousQueryID=queryID
                else:
                    docID+=1

                relevanceArray.append(relevance)
                #Add a feature for the intercept
                featureRows.append(docID)
                featureCols.append(0)
                featureVals.append(0.01)
                for featureToken in featureTokens:
                    featureTokenSplit=featureToken.split(':', 1)
                    featureIndex=int(featureTokenSplit[0])
                    featureValue=float(featureTokenSplit[1])
                    featureRows.append(docID)
                    featureCols.append(featureIndex)
                    featureVals.append(featureValue)

        #Finish processing the final query's data
        currentRelevances=numpy.array(relevanceArray, dtype=int, copy=False)
        self.relevances.append(currentRelevances)
        numpy.savez_compressed(os.path.join(outputFileDir, str(qID)+'_rel'), relevances=currentRelevances)

        maxDocs=len(relevanceArray)
        self.docsPerQuery.append(maxDocs)
        currentFeatures=scipy.sparse.coo_matrix((featureVals, (featureRows, featureCols)),
                shape=(maxDocs, numFeatures), dtype=numpy.float64)
        currentFeatures=currentFeatures.tocsr()
        self.features.append(currentFeatures)
        scipy.sparse.save_npz(os.path.join(outputFileDir, str(qID)+'_feat'), currentFeatures)
        self.queryMappings.append(previousQueryID)

        #Persist meta-data for the dataset for faster loading through loadNpz
        numpy.savez_compressed(outputFilename, docsPerQuery=self.docsPerQuery, name=self.name,
                queryMappings=self.queryMappings)
        print("", flush=True)
        print("Datasets:loadTxt [INFO] Loaded", file_name, "\t NumQueries", len(self.docsPerQuery),
                "\t [Min/Max]DocsPerQuery", min(self.docsPerQuery), max(self.docsPerQuery), flush=True)

    #file_name: (str) Path to dataset file/directory
    def loadNpz(self, file_name):
        with numpy.load(file_name+'.npz') as npFile:
            self.docsPerQuery=npFile['docsPerQuery']
            self.name=str(npFile['name'])
            self.queryMappings=npFile['queryMappings']

        fileDir=file_name+'_processed'
        if os.path.exists(fileDir):
            self.relevances=[]
            self.features=[]
            qID=0
            while os.path.exists(os.path.join(fileDir, str(qID)+'_rel.npz')):
                with numpy.load(os.path.join(fileDir, str(qID)+'_rel.npz')) as currRelFile:
                    self.relevances.append(currRelFile['relevances'])
                self.features.append(scipy.sparse.load_npz(os.path.join(fileDir, str(qID)+'_feat.npz')))
                qID+=1
                if qID%100==0:
                    print(".", end="", flush=True)

        print("", flush=True)
        print("Datasets:loadNpz [INFO] Loaded", file_name, "\t NumQueries", len(self.docsPerQuery),
                "\t [Min/Max]DocsPerQuery", min(self.docsPerQuery), max(self.docsPerQuery),
                "\t [Sum] docsPerQuery", sum(self.docsPerQuery), flush=True)


"""
mq2008Data=Datasets()
mq2008Data.loadTxt(DATA_DIR+'MQ2008.txt', 'MQ2008')
mq2008Data.loadNpz(DATA_DIR+'MQ2008')
del mq2008Data

mq2007Data=Datasets()
mq2007Data.loadTxt(DATA_DIR+'MQ2007.txt', 'MQ2007')
mq2007Data.loadNpz(DATA_DIR+'MQ2007')
del mq2007Data
"""

mslrData=Datasets()
mslrData.loadTxt(DATA_DIR+'MSLR-WEB10K/mslr.txt', 'MSLR10k')
del mslrData

for foldID in range(1, 6):
    for fraction in ['train', 'vali', 'test']:
        mslrData=Datasets()
        mslrData.loadTxt(DATA_DIR+'MSLR-WEB10K/Fold'+str(foldID)+'/'+fraction+'.txt',
                'MSLR10k-'+str(foldID)+'-'+fraction)
        del mslrData

mslrData=Datasets()
mslrData.loadTxt(DATA_DIR+'MSLR/mslr.txt', 'MSLR')
del mslrData

for foldID in range(1, 6):
    for fraction in ['train', 'vali', 'test']:
        mslrData=Datasets()
        mslrData.loadTxt(DATA_DIR+'MSLR/Fold'+str(foldID)+'/'+fraction+'.txt',
                'MSLR-'+str(foldID)+'-'+fraction)
        del mslrData
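
#Illustrative sketch (not part of the original pipeline, never invoked):
#loadTxt expects LETOR/SVMLight-style lines, i.e.
#    <relevance> qid:<queryID> 1:<val> 2:<val> ... #<optional comment>
#The snippet below mimics the per-line tokenization performed above; the
#sample line is made up for illustration.
def _demo_parse_letor_line():
    line="2 qid:10 1:0.03 2:0.5 3:0.1 #docid=GX001"
    tokens=line.split(' ', 2)
    relevance=int(tokens[0])                      # -> 2
    queryID=int(tokens[1].split(':', 1)[1])       # -> 10
    featureTokens=tokens[2].split('#', 1)[0].strip().split(' ')
    features={int(t.split(':', 1)[0]): float(t.split(':', 1)[1]) for t in featureTokens}
    return relevance, queryID, features           # -> (2, 10, {1: 0.03, 2: 0.5, 3: 0.1})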

class Estimator:
    #ranking_size: (int) Size of slate, l
    #logging_policy: (UniformPolicy) Logging policy, \mu
    #target_policy: (Policy) Target policy, \pi
    def __init__(self, ranking_size, logging_policy, target_policy):
        self.rankingSize=ranking_size
        self.name=None
        self.loggingPolicy=logging_policy
        self.targetPolicy=target_policy
        if target_policy.name is None or logging_policy.name is None:
            print("Estimator:init [ERR] Either target or logging policy is not initialized", flush=True)
            sys.exit(0)
        if target_policy.dataset.name != logging_policy.dataset.name:
            print("Estimator:init [ERR] Target and logging policy operate on different datasets", flush=True)
            sys.exit(0)
        ###All sub-classes of Estimator should supply an estimate method
        ###Requires: query, logged_ranking, logged_value
        ###Returns: float indicating estimated value
        self.runningSum=0
        self.runningMean=0.0

    def updateRunningAverage(self, value):
        self.runningSum+=1
        delta=value-self.runningMean
        self.runningMean+=delta/self.runningSum

    def reset(self):
        self.runningSum=0
        self.runningMean=0.0


class OnPolicy(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy, metric):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='OnPolicy'
        self.metric=metric
        #This member is set on-demand by estimateAll(...)
        self.savedValues=None

    def estimateAll(self):
        if self.savedValues is not None:
            return

        self.savedValues=[]
        numQueries=len(self.loggingPolicy.dataset.docsPerQuery)
        for i in range(numQueries):
            newRanking=self.targetPolicy.predict(i, self.rankingSize)
            self.savedValues.append(self.metric.computeMetric(i, newRanking))
            if i%100==0:
                print(".", end="", flush=True)
        print("")
        print("OnPolicy:estimateAll [LOG] Precomputed estimates.", flush=True)

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        currentValue=None
        if self.savedValues is not None:
            currentValue=self.savedValues[query]
        else:
            currentValue=self.metric.computeMetric(query, new_ranking)

        self.updateRunningAverage(currentValue)
        return self.runningMean

    def reset(self):
        Estimator.reset(self)
        self.savedValues=None


class UniformIPS(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='Unif-IPS'

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        exactMatch=numpy.absolute(new_ranking-logged_ranking).sum() == 0
        currentValue=0.0
        if exactMatch:
            numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
            validDocs=logged_ranking.size
            invPropensity=None
            if self.loggingPolicy.allowRepetitions:
                invPropensity=numpy.float_power(numAllowedDocs, validDocs)
            else:
                invPropensity=numpy.prod(range(numAllowedDocs+1-validDocs, numAllowedDocs+1),
                        dtype=numpy.float64)
            currentValue=logged_value*invPropensity

        self.updateRunningAverage(currentValue)
        return self.runningMean
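
#Illustrative sketch (not part of the original pipeline, never invoked):
#UniformIPS weights a matching slate by the inverse probability that a uniform
#logger produced it: n^l with repetitions, n!/(n-l)! without (n=numAllowedDocs,
#l=validDocs). This mirrors the two branches in UniformIPS.estimate above.
def _demo_uniform_inverse_propensity(n=10, l=3):
    withRep=numpy.float_power(n, l)                                  # -> 1000.0
    withoutRep=numpy.prod(range(n+1-l, n+1), dtype=numpy.float64)    # 8*9*10 -> 720.0
    return withRep, withoutRep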

class NonUniformIPS(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='NonUnif-IPS'

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        exactMatch=numpy.absolute(new_ranking-logged_ranking).sum() == 0
        currentValue=0.0
        if exactMatch:
            numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
            underlyingRanking=self.loggingPolicy.policy.predict(query, -1)
            currentDistribution=self.loggingPolicy.multinomials[numAllowedDocs]
            numRankedDocs=logged_ranking.size
            invPropensity=1.0
            denominator=1.0
            for j in range(numRankedDocs):
                underlyingIndex=numpy.flatnonzero(underlyingRanking == logged_ranking[j])[0]
                invPropensity*=(denominator*1.0/currentDistribution[underlyingIndex])
                if not self.loggingPolicy.allowRepetitions:
                    denominator-=currentDistribution[underlyingIndex]
            currentValue=logged_value*invPropensity

        self.updateRunningAverage(currentValue)
        return self.runningMean


class UniformSNIPS(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='Unif-IPS_SN'
        self.runningDenominatorMean=0.0

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        exactMatch=numpy.absolute(new_ranking-logged_ranking).sum() == 0
        currentValue=0.0
        #The importance weight is zero unless the rankings match exactly;
        #initializing here keeps the denominator update below well-defined
        invPropensity=0.0
        if exactMatch:
            numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
            validDocs=logged_ranking.size
            if self.loggingPolicy.allowRepetitions:
                invPropensity=numpy.float_power(numAllowedDocs, validDocs)
            else:
                invPropensity=numpy.prod(range(numAllowedDocs+1-validDocs, numAllowedDocs+1),
                        dtype=numpy.float64)
            currentValue=logged_value*invPropensity

        self.updateRunningAverage(currentValue)

        denominatorDelta=invPropensity-self.runningDenominatorMean
        self.runningDenominatorMean+=denominatorDelta/self.runningSum
        if self.runningDenominatorMean!=0.0:
            return 1.0*self.runningMean/self.runningDenominatorMean
        else:
            return 0.0

    def reset(self):
        Estimator.reset(self)
        self.runningDenominatorMean=0.0


class NonUniformSNIPS(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='NonUnif-IPS_SN'
        self.runningDenominatorMean=0.0

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        exactMatch=numpy.absolute(new_ranking-logged_ranking).sum() == 0
        currentValue=0.0
        #The importance weight is zero unless the rankings match exactly
        invPropensity=0.0
        if exactMatch:
            numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
            underlyingRanking=self.loggingPolicy.policy.predict(query, -1)
            currentDistribution=self.loggingPolicy.multinomials[numAllowedDocs]
            numRankedDocs=logged_ranking.size
            invPropensity=1.0
            denominator=1.0
            for j in range(numRankedDocs):
                underlyingIndex=numpy.flatnonzero(underlyingRanking == logged_ranking[j])[0]
                invPropensity*=(denominator*1.0/currentDistribution[underlyingIndex])
                if not self.loggingPolicy.allowRepetitions:
                    denominator-=currentDistribution[underlyingIndex]
            currentValue=logged_value*invPropensity

        self.updateRunningAverage(currentValue)

        denominatorDelta=invPropensity-self.runningDenominatorMean
        self.runningDenominatorMean+=denominatorDelta/self.runningSum
        if self.runningDenominatorMean!=0.0:
            return 1.0*self.runningMean/self.runningDenominatorMean
        else:
            return 0.0

    def reset(self):
        Estimator.reset(self)
        self.runningDenominatorMean=0.0
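
#Illustrative sketch (not part of the original pipeline, never invoked):
#the self-normalized (SN) variants return (mean of w_i*r_i)/(mean of w_i)
#instead of the plain mean of w_i*r_i, trading a small bias for much lower
#variance when the importance weights w_i are large and mostly zero.
def _demo_snips(weights=(0.0, 720.0, 0.0), rewards=(0.3, 0.8, 0.5)):
    weights=numpy.array(weights, dtype=numpy.float64)
    rewards=numpy.array(rewards, dtype=numpy.float64)
    ips=numpy.mean(weights*rewards)
    snips=numpy.sum(weights*rewards)/numpy.sum(weights)
    return ips, snips    # -> (192.0, 0.8): SNIPS stays on the reward scale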

class UniformPI(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='Unif-PI'

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
        validDocs=logged_ranking.size
        vectorDimension=validDocs*numAllowedDocs

        exploredMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        newMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        for j in range(validDocs):
            if self.loggingPolicy.dataset.mask is None:
                exploredMatrix[j, logged_ranking[j]]=1
                newMatrix[j, new_ranking[j]]=1
            else:
                logIndex=numpy.flatnonzero(self.loggingPolicy.dataset.mask[query] == logged_ranking[j])[0]
                newIndex=numpy.flatnonzero(self.loggingPolicy.dataset.mask[query] == new_ranking[j])[0]
                exploredMatrix[j, logIndex]=1
                newMatrix[j, newIndex]=1

        posRelVector=exploredMatrix.reshape(vectorDimension)
        newSlateVector=newMatrix.reshape(vectorDimension)

        estimatedPhi=numpy.dot(self.loggingPolicy.gammas[numAllowedDocs], posRelVector)
        invPropensity=numpy.dot(estimatedPhi, newSlateVector)

        currentValue=logged_value*invPropensity
        self.updateRunningAverage(currentValue)
        return self.runningMean


class NonUniformPI(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='NonUnif-PI'

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
        underlyingRanking=self.loggingPolicy.policy.predict(query, -1)

        validDocs=logged_ranking.size
        vectorDimension=validDocs*numAllowedDocs

        exploredMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        newMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        for j in range(validDocs):
            logIndex=numpy.flatnonzero(underlyingRanking == logged_ranking[j])[0]
            newIndex=numpy.flatnonzero(underlyingRanking == new_ranking[j])[0]
            exploredMatrix[j, logIndex]=1
            newMatrix[j, newIndex]=1

        posRelVector=exploredMatrix.reshape(vectorDimension)
        newSlateVector=newMatrix.reshape(vectorDimension)

        estimatedPhi=numpy.dot(self.loggingPolicy.gammas[numAllowedDocs], posRelVector)
        invPropensity=numpy.dot(estimatedPhi, newSlateVector)

        currentValue=logged_value*invPropensity
        self.updateRunningAverage(currentValue)
        return self.runningMean


class UniformSNPI(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='Unif-PI_SN'
        self.runningDenominatorMean=0.0

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
        validDocs=logged_ranking.size
        vectorDimension=validDocs*numAllowedDocs

        exploredMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        newMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        for j in range(validDocs):
            if self.loggingPolicy.dataset.mask is None:
                exploredMatrix[j, logged_ranking[j]]=1
                newMatrix[j, new_ranking[j]]=1
            else:
                logIndex=numpy.flatnonzero(self.loggingPolicy.dataset.mask[query] == logged_ranking[j])[0]
                newIndex=numpy.flatnonzero(self.loggingPolicy.dataset.mask[query] == new_ranking[j])[0]
                exploredMatrix[j, logIndex]=1
                newMatrix[j, newIndex]=1

        posRelVector=exploredMatrix.reshape(vectorDimension)
        newSlateVector=newMatrix.reshape(vectorDimension)

        estimatedPhi=numpy.dot(self.loggingPolicy.gammas[numAllowedDocs], posRelVector)
        invPropensity=numpy.dot(estimatedPhi, newSlateVector)

        currentValue=logged_value*invPropensity
        self.updateRunningAverage(currentValue)

        denominatorDelta=invPropensity-self.runningDenominatorMean
        self.runningDenominatorMean+=denominatorDelta/self.runningSum
        if self.runningDenominatorMean!=0.0:
            return 1.0*self.runningMean/self.runningDenominatorMean
        else:
            return 0.0

    def reset(self):
        Estimator.reset(self)
        self.runningDenominatorMean=0.0
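
#Illustrative sketch (not part of the original pipeline, never invoked):
#the pseudoinverse (PI) estimators encode a slate as a flattened l x n
#position-document indicator matrix: entry (j, d)=1 iff document d is shown at
#position j. The toy slate below places docs [2, 0] in a 4-doc candidate set.
def _demo_slate_indicator(slate=(2, 0), numAllowedDocs=4):
    validDocs=len(slate)
    indicator=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
    for j, doc in enumerate(slate):
        indicator[j, doc]=1
    return indicator.reshape(validDocs*numAllowedDocs)
    # -> [0. 0. 1. 0. | 1. 0. 0. 0.]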

class NonUniformSNPI(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='NonUnif-PI_SN'
        self.runningDenominatorMean=0.0

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        numAllowedDocs=self.loggingPolicy.dataset.docsPerQuery[query]
        underlyingRanking=self.loggingPolicy.policy.predict(query, -1)

        validDocs=logged_ranking.size
        vectorDimension=validDocs*numAllowedDocs

        exploredMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        newMatrix=numpy.zeros((validDocs, numAllowedDocs), dtype=numpy.float64)
        for j in range(validDocs):
            logIndex=numpy.flatnonzero(underlyingRanking == logged_ranking[j])[0]
            newIndex=numpy.flatnonzero(underlyingRanking == new_ranking[j])[0]
            exploredMatrix[j, logIndex]=1
            newMatrix[j, newIndex]=1

        posRelVector=exploredMatrix.reshape(vectorDimension)
        newSlateVector=newMatrix.reshape(vectorDimension)

        estimatedPhi=numpy.dot(self.loggingPolicy.gammas[numAllowedDocs], posRelVector)
        invPropensity=numpy.dot(estimatedPhi, newSlateVector)

        currentValue=logged_value*invPropensity
        self.updateRunningAverage(currentValue)

        denominatorDelta=invPropensity-self.runningDenominatorMean
        self.runningDenominatorMean+=denominatorDelta/self.runningSum
        if self.runningDenominatorMean!=0.0:
            return 1.0*self.runningMean/self.runningDenominatorMean
        else:
            return 0.0

    def reset(self):
        Estimator.reset(self)
        self.runningDenominatorMean=0.0


class Direct(Estimator):
    def __init__(self, ranking_size, logging_policy, target_policy, estimator_type):
        Estimator.__init__(self, ranking_size, logging_policy, target_policy)
        self.name='Direct_'+estimator_type
        self.estimatorType=estimator_type
        self.numFeatures=self.loggingPolicy.dataset.features[0].shape[1]
        self.hyperParams={'alpha': (numpy.logspace(-2, 1, num=4, base=10)).tolist()}
        self.treeDepths={'max_depth': list(range(3, 15, 3))}
        if self.estimatorType=='tree':
            self.tree=None
        else:
            self.policyParams=None
        #This member is set on-demand by estimateAll(...)
        self.savedValues=None

    def estimateAll(self, metric=None):
        if self.savedValues is not None:
            return

        self.savedValues=[]
        numQueries=len(self.loggingPolicy.dataset.docsPerQuery)
        for query in range(numQueries):
            newRanking=self.targetPolicy.predict(query, self.rankingSize)
            allFeatures=self.loggingPolicy.dataset.features[query][newRanking,:]
            if newRanking.size < self.rankingSize:
                emptyPad=scipy.sparse.csr_matrix((self.rankingSize-newRanking.size, self.numFeatures),
                        dtype=numpy.float64)
                allFeatures=scipy.sparse.vstack((allFeatures, emptyPad), format="csr", dtype=numpy.float64)
            allFeatures=allFeatures.toarray()
            nRows, nCols=allFeatures.shape
            size=nRows*nCols
            currentFeatures=numpy.reshape(allFeatures, (1, size))

            currentValue=None
            if self.estimatorType=='tree':
                currentValue=self.tree.predict(currentFeatures)[0]
            else:
                currentValue=numpy.dot(currentFeatures, self.policyParams)[0]

            low=None
            high=None
            if metric is not None:
                low=metric.getMin(newRanking.size)
                high=metric.getMax(newRanking.size)
            if low is not None:
                currentValue=max(currentValue, low)
            if high is not None:
                currentValue=min(currentValue, high)
            if currentValue > 1.0 or currentValue < 0.0:
                print("Direct:estimateAll [LOG] estimate %0.3f " % (currentValue), flush=True)

            del allFeatures
            del currentFeatures
            self.savedValues.append(currentValue)
            if query%100==0:
                print(".", end="", flush=True)

        print("")
        print("Direct:estimateAll [LOG] Precomputed estimates.", flush=True)
    def train(self, logged_data):
        numInstances=len(logged_data)
        targets=numpy.zeros(numInstances, order='C', dtype=numpy.float64)
        covariates=scipy.sparse.lil_matrix((numInstances, self.numFeatures*self.rankingSize),
                dtype=numpy.float64)
        print("Starting to create covariates", flush=True)
        for j in range(numInstances):
            currentDatapoint=logged_data.pop()
            targets[j]=currentDatapoint[2]

            currentQuery=currentDatapoint[0]
            currentRanking=currentDatapoint[1]
            allFeatures=self.loggingPolicy.dataset.features[currentQuery][currentRanking,:]
            allFeatures.eliminate_zeros()
            covariates.data[j]=allFeatures.data
            #Shift each document's column indices to its position's feature block
            newIndices=allFeatures.indices
            for k in range(allFeatures.shape[0]):
                newIndices[allFeatures.indptr[k]:allFeatures.indptr[k+1]]+=k*self.numFeatures
            covariates.rows[j]=newIndices
            if j%1000 == 0:
                print(".", end='', flush=True)
            del currentDatapoint
            del allFeatures

        print("Converting covariates", flush=True)
        covariates=covariates.tocsr()
        print("Finished conversion", flush=True)
        if self.estimatorType=='tree':
            treeCV=sklearn.model_selection.GridSearchCV(sklearn.tree.DecisionTreeRegressor(criterion="mse",
                    splitter="random", min_samples_split=4, min_samples_leaf=4, presort=False),
                    param_grid=self.treeDepths, scoring=None, fit_params=None, n_jobs=1,
                    iid=True, cv=3, refit=True, verbose=0, pre_dispatch=1,
                    error_score='raise', return_train_score=False)
            treeCV.fit(covariates, targets)
            self.tree=treeCV.best_estimator_
            print("DirectEstimator:train [INFO] Done. Best depth", treeCV.best_params_['max_depth'], flush=True)
        elif self.estimatorType=='lasso':
            lassoCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Lasso(fit_intercept=False,
                    normalize=False, precompute=False, copy_X=False, max_iter=30000, tol=1e-4,
                    warm_start=False, positive=False, random_state=None, selection='random'),
                    param_grid=self.hyperParams, scoring=None, fit_params=None, n_jobs=1,
                    iid=True, cv=3, refit=True, verbose=0, pre_dispatch=1,
                    error_score='raise', return_train_score=False)
            lassoCV.fit(covariates, targets)
            self.policyParams=lassoCV.best_estimator_.coef_
            print("DirectEstimator:train [INFO] Done. CVAlpha", lassoCV.best_params_['alpha'], flush=True)
        elif self.estimatorType=='ridge':
            ridgeCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Ridge(fit_intercept=False,
                    normalize=False, copy_X=False, max_iter=30000, tol=1e-4, solver='sag',
                    random_state=None),
                    param_grid=self.hyperParams, scoring=None, fit_params=None, n_jobs=1,
                    iid=True, cv=3, refit=True, verbose=0, pre_dispatch=1,
                    error_score='raise', return_train_score=False)
            ridgeCV.fit(covariates, targets)
            self.policyParams=ridgeCV.best_estimator_.coef_
            print("DirectEstimator:train [INFO] Done. CVAlpha", ridgeCV.best_params_['alpha'], flush=True)
        else:
            print("DirectEstimator:train [ERR] %s not supported." % self.estimatorType, flush=True)
            sys.exit(0)

    def estimate(self, query, logged_ranking, new_ranking, logged_value):
        currentValue=None
        if self.savedValues is not None:
            currentValue=self.savedValues[query]
        else:
            allFeatures=self.loggingPolicy.dataset.features[query][new_ranking,:]
            if new_ranking.size < self.rankingSize:
                emptyPad=scipy.sparse.csr_matrix((self.rankingSize-new_ranking.size, self.numFeatures),
                        dtype=numpy.float64)
                allFeatures=scipy.sparse.vstack((allFeatures, emptyPad), format="csr", dtype=numpy.float64)
            allFeatures=allFeatures.toarray()
            nRows, nCols=allFeatures.shape
            size=nRows*nCols
            currentFeatures=numpy.reshape(allFeatures, (1, size))

            if self.estimatorType=='tree':
                currentValue=self.tree.predict(currentFeatures)[0]
            else:
                currentValue=numpy.dot(currentFeatures, self.policyParams)[0]
            del allFeatures
            del currentFeatures

        self.updateRunningAverage(currentValue)
        return self.runningMean

    def reset(self):
        Estimator.reset(self)
        self.savedValues=None
        if self.estimatorType=='tree':
            self.tree=None
        else:
            self.policyParams=None
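
#Illustrative sketch (not part of the original pipeline, never invoked):
#both Direct.train(...) and Direct.estimate(...) flatten a slate into one
#regression row by stacking the l per-position document feature vectors side
#by side, so a slate over d-dimensional documents becomes one l*d-dim input.
def _demo_flatten_slate_features(numFeatures=5, rankingSize=3):
    docFeatures=scipy.sparse.random(rankingSize, numFeatures, density=0.5, format='csr')
    row=docFeatures.toarray().reshape(1, rankingSize*numFeatures)
    return row.shape    # -> (1, 15)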

class GammaCalculator:
    def __init__(self, weights, nSlots):
        assert nSlots>0, "NSLOTS MUST BE POSITIVE"
        self.nDocs = len(weights)
        self.nSlots = nSlots
        self.nTypes = 0
        self.weightToType = {}
        self.typeToWeight = []
        self.typeToDocs = []
        self.nDocsOfType = []
        self.docToType = []
        self.weights = weights
        for i in range(len(weights)):
            weight = weights[i]
            if not weight in self.weightToType:
                self.typeToWeight.append(decimal.Decimal(weight))
                self.typeToDocs.append([])
                self.nDocsOfType.append(0)
                self.weightToType[weight] = self.nTypes
                self.nTypes += 1
            t = self.weightToType[weight]
            self.docToType.append(t)
            self.nDocsOfType[t] += 1
            self.typeToDocs[t].append(i)

        self.table = {}
        empty_prefix = (0,)*self.nTypes
        self.table[empty_prefix, ()] = decimal.Decimal(1)
        self.visited = set()
        self.fill_table(empty_prefix, ())

        #NOTE: the rest of __init__ was garbled in the source text; the loop
        #below is kept as recovered, with the lost condition/body marked.
        self.gamma_types = {}
        for (prefix, anchor) in self.table.keys():
            length = sum(prefix)
            for t in range(self.nTypes):
                if prefix[t] < self.nDocsOfType[t]:  #RHS reconstructed; original lost
                    pass  #(computation of gamma_types lost in extraction)

    def decr(self, prefix, t):
        prefix_mut = list(prefix)
        assert prefix_mut[t] > 0, "DECR PREFIX OUT OF BOUNDS"
        prefix_mut[t] -= 1
        return tuple(prefix_mut)

    def incr(self, prefix, t):
        prefix_mut = list(prefix)
        #(assert bound and the two lines below reconstructed by symmetry with decr)
        assert prefix_mut[t] < self.nDocsOfType[t], "INCR PREFIX OUT OF BOUNDS"
        prefix_mut[t] += 1
        return tuple(prefix_mut)

    #NOTE: a method between incr and fill_table (apparently the tail of a
    #recursive get_prob(self, prefix, anchor, t)) was garbled in the source;
    #its recovered fragment reads (indentation approximate):
    #        ... > 0:
    #            prefix0 = self.decr(prefix, t)
    #            if anchor == () or prefix0[anchor[1]] > 0:
    #                prob += self.get_prob(prefix0, anchor, t)
    #        else:
    #            t = anchor[1]
    #            prefix0 = self.decr(prefix, t)
    #            prob += self.get_prob(prefix0, (), t)
    #        self.table[prefix, anchor] = prob
    #        return prob

    def fill_table(self, prefix, anchor):
        """add more entries to the DP table extending the current prefix.
        here:
        prefix: tuple of type counts
        anchor: specifies (pos, type) where pos ...(docstring and method body
        lost in extraction)...
        """
        raise NotImplementedError("fill_table body was lost in the source text")


#NOTE: a large block is missing from the source at this point: the remainder of
#GammaCalculator, the UniformGamma(...)/NonUniformGamma(...) helpers that
#setupGamma(...) relies on below, and the head of RecursiveSlateEval.


class RecursiveSlateEval:
    #The constructor was lost in extraction. This is a minimal hedged
    #reconstruction inferred from the members eval_slate uses and from the
    #call site in L2RPolicy.predict (scores: numDocs x numPositions array).
    def __init__(self, scores):
        self.scores=scores
        self.m, self.l=scores.shape
        self.sortedIndices=numpy.argsort(scores, axis=0)
        self.upperPos=numpy.amax(scores, axis=0)
        self.bestSoFar=None
        self.bestSlate=None
        self.eval_slate([], 0.0)

    def eval_slate(self, slate_prefix, prefix_value):
        currentPos=len(slate_prefix)
        #(the head of this method was garbled in the source; the terminal test
        #below is reconstructed, the remainder is as recovered)
        if currentPos==self.l:
            if self.bestSoFar is None or prefix_value > self.bestSoFar:
                self.bestSoFar=prefix_value
                self.bestSlate=slate_prefix
            return

        docSet=set(slate_prefix)
        bestFutureVal=0.0
        if currentPos < self.l:
            bestFutureVal=self.upperPos[currentPos:].sum()
        delta=prefix_value+bestFutureVal

        for i in range(self.m):
            currentDoc=self.sortedIndices[-1-i, currentPos]
            if currentDoc in docSet:
                continue
            currentVal=self.scores[currentDoc, currentPos]
            if self.bestSoFar is None or ((currentVal+delta) > self.bestSoFar):
                self.eval_slate(slate_prefix + [currentDoc], prefix_value+currentVal)
            else:
                break


class Policy:
    #dataset: (Datasets) Must be initialized using Datasets.loadTxt(...)/loadNpz(...)
    #allow_repetitions: (bool) If true, the policy predicts rankings with repeated documents
    def __init__(self, dataset, allow_repetitions):
        self.dataset=dataset
        self.allowRepetitions=allow_repetitions
        self.name=None
        ###All sub-classes of Policy should supply a predict method
        ###Requires: (int) query_id; (int) ranking_size.
        ###Returns: list[int],length=min(ranking_size,docsPerQuery[query_id]) ranking
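
#Illustrative sketch (not part of the original pipeline, never invoked):
#eval_slate prunes with an optimistic bound: a partial slate can be abandoned
#once (its value) + (sum of per-position maxima over the remaining slots)
#cannot beat the best complete slate found so far.
def _demo_pruning_bound(scores=None, prefix_value=1.0, currentPos=1, bestSoFar=2.5):
    if scores is None:
        scores=numpy.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.4]])   # 3 docs x 2 positions
    upperPos=numpy.amax(scores, axis=0)                            # [0.9, 0.8]
    optimistic=prefix_value+upperPos[currentPos:].sum()            # 1.0+0.8=1.8
    return optimistic > bestSoFar                                  # False -> prune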

class L2RPolicy(Policy):
    def __init__(self, dataset, ranking_size, model_type, greedy_select, cross_features):
        Policy.__init__(self, dataset, False)
        self.rankingSize=ranking_size
        self.numDocFeatures=dataset.features[0].shape[1]
        self.modelType=model_type
        self.crossFeatures=cross_features
        self.hyperParams=numpy.logspace(0, 2, num=5, base=10).tolist()
        if self.modelType=='tree' or self.modelType=='gbrt':
            self.tree=None
        else:
            self.policyParams=None

        self.greedy=greedy_select
        self.numFeatures=self.numDocFeatures+self.rankingSize
        if self.crossFeatures:
            self.numFeatures+=self.numDocFeatures*self.rankingSize
        print("L2RPolicy:init [INFO] Dataset:", dataset.name, flush=True)

    def createFeature(self, docFeatures, position):
        currFeature=numpy.zeros(self.numFeatures, dtype=numpy.float64)
        currFeature[0:self.numDocFeatures]=docFeatures
        currFeature[self.numDocFeatures+position]=1
        if self.crossFeatures:
            currFeature[self.numDocFeatures+self.rankingSize+position*self.numDocFeatures: \
                    self.numDocFeatures+self.rankingSize+(position+1)*self.numDocFeatures]=docFeatures
        return currFeature.reshape(1, -1)

    def predict(self, query_id, ranking_size):
        allowedDocs=self.dataset.docsPerQuery[query_id]
        validDocs=min(allowedDocs, self.rankingSize)
        allScores=numpy.zeros((allowedDocs, validDocs), dtype=numpy.float64)
        allFeatures=self.dataset.features[query_id].toarray()
        for doc in range(allowedDocs):
            docID=doc
            if self.dataset.mask is not None:
                docID=self.dataset.mask[query_id][doc]
            for pos in range(validDocs):
                currFeature=self.createFeature(allFeatures[docID,:], pos)
                if self.modelType=='tree' or self.modelType=='gbrt':
                    allScores[doc, pos]=self.tree.predict(currFeature)
                else:
                    allScores[doc, pos]=currFeature.dot(self.policyParams)

        tieBreaker=1e-14*numpy.random.random((allowedDocs, validDocs))
        allScores+=tieBreaker
        upperBound=numpy.amax(allScores, axis=0)

        producedRanking=None
        if self.greedy:
            producedRanking=numpy.empty(validDocs, dtype=numpy.int32)
            currentVal=0.0
            for i in range(validDocs):
                maxIndex=numpy.argmax(allScores)
                chosenDoc, chosenPos=numpy.unravel_index(maxIndex, allScores.shape)
                currentVal+=allScores[chosenDoc, chosenPos]
                if self.dataset.mask is None:
                    producedRanking[chosenPos]=chosenDoc
                else:
                    producedRanking[chosenPos]=self.dataset.mask[query_id][chosenDoc]
                allScores[chosenDoc,:]=float('-inf')
                allScores[:,chosenPos]=float('-inf')
            self.debug=upperBound.sum()-currentVal
        else:
            slateScorer=RecursiveSlateEval(allScores)
            if self.dataset.mask is None:
                producedRanking=numpy.array(slateScorer.bestSlate)
            else:
                #Remap filtered document indices to original ids via the per-query mask
                producedRanking=self.dataset.mask[query_id][slateScorer.bestSlate]
            self.debug=upperBound.sum()-slateScorer.bestSoFar
            del slateScorer

        del allFeatures
        del allScores
        return producedRanking

    def train(self, dataset, targets, hyper_params):
        numQueries=len(dataset.docsPerQuery)
        validDocs=numpy.minimum(dataset.docsPerQuery, self.rankingSize)
        queryDocPosTriplets=numpy.dot(dataset.docsPerQuery, validDocs)
        designMatrix=numpy.zeros((queryDocPosTriplets, self.numFeatures), dtype=numpy.float32, order='F')
        regressionTargets=numpy.zeros(queryDocPosTriplets, dtype=numpy.float64, order='C')
        sampleWeights=numpy.zeros(queryDocPosTriplets, dtype=numpy.float32)
        currID=-1
        for i in range(numQueries):
            numAllowedDocs=dataset.docsPerQuery[i]
            currValidDocs=validDocs[i]
            allFeatures=dataset.features[i].toarray()
            for doc in range(numAllowedDocs):
                docID=doc
                if dataset.mask is not None:
                    docID=dataset.mask[i][doc]
                for j in range(currValidDocs):
                    currID+=1
                    designMatrix[currID,:]=self.createFeature(allFeatures[docID,:], j)
                    regressionTargets[currID]=targets[i][j, doc]
                    sampleWeights[currID]=1.0/(numAllowedDocs*currValidDocs)

        for i in targets:
            del i
        del targets
        print("L2RPolicy:train [LOG] Finished creating features and targets ",
                numpy.amin(regressionTargets), numpy.amax(regressionTargets),
                numpy.median(regressionTargets), flush=True)
        print("L2RPolicy:train [LOG] Histogram of targets ", numpy.histogram(regressionTargets), flush=True)

        if self.modelType == 'gbrt':
            tree=sklearn.ensemble.GradientBoostingRegressor(learning_rate=hyper_params['lr'],
                    n_estimators=hyper_params['ensemble'], subsample=hyper_params['subsample'],
                    max_leaf_nodes=hyper_params['leaves'], max_features=1.0, presort=False)
            tree.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.tree=tree
            print("L2RPolicy:train [INFO] %s" % self.modelType, flush=True)
        elif self.modelType == 'ridge':
            ridgeCV=sklearn.linear_model.RidgeCV(alphas=self.hyperParams, fit_intercept=False,
                    normalize=False, cv=3)
            ridgeCV.fit(designMatrix, regressionTargets, sample_weight=sampleWeights)
            self.policyParams=ridgeCV.coef_
            print("L2RPolicy:train [INFO] Done. ", flush=True)
        else:
            print("L2RPolicy:train [ERR] %s not supported." % self.modelType, flush=True)
            sys.exit(0)

        print("L2R:train [INFO] Created %s predictor using dataset %s." % (self.modelType, dataset.name),
                flush=True)
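
#Illustrative sketches (not part of the original pipeline, never invoked).
#createFeature lays out one (document, position) example as
#    [ doc features | one-hot position | optional position x doc cross features ],
#i.e. numDocFeatures + rankingSize (+ numDocFeatures*rankingSize) columns.
def _demo_feature_layout(numDocFeatures=4, rankingSize=3, position=1):
    docFeatures=numpy.arange(1.0, numDocFeatures+1.0)
    currFeature=numpy.zeros(numDocFeatures+rankingSize+numDocFeatures*rankingSize)
    currFeature[0:numDocFeatures]=docFeatures
    currFeature[numDocFeatures+position]=1
    start=numDocFeatures+rankingSize+position*numDocFeatures
    currFeature[start:start+numDocFeatures]=docFeatures
    return currFeature.reshape(1, -1)

#The greedy branch of L2RPolicy.predict repeatedly takes the global argmax of
#the doc x position score matrix and then knocks out that row and column.
def _demo_greedy_assignment(scores=None):
    if scores is None:
        scores=numpy.array([[0.9, 0.1], [0.2, 0.8], [0.5, 0.4]])    # 3 docs x 2 positions
    scores=scores.copy()
    slate=numpy.empty(scores.shape[1], dtype=numpy.int32)
    for _ in range(scores.shape[1]):
        doc, pos=numpy.unravel_index(numpy.argmax(scores), scores.shape)
        slate[pos]=doc
        scores[doc, :]=float('-inf')
        scores[:, pos]=float('-inf')
    return slate    # -> [0, 1]: doc 0 at position 0, doc 1 at position 1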

class DeterministicPolicy(Policy):
    #model_type: (str) Model class to use for scoring documents
    def __init__(self, dataset, model_type, regress_gains=False, weighted_ls=False, hyper_params=None):
        Policy.__init__(self, dataset, False)
        self.modelType=model_type
        self.hyperParams={'alpha': (numpy.logspace(-3, 2, num=6, base=10)).tolist()}
        if hyper_params is not None:
            self.hyperParams=hyper_params
        self.regressGains=regress_gains
        self.weighted=weighted_ls
        self.treeDepths={'max_depth': list(range(3, 21, 3))}

        #Must call train(...) to set all these members
        #before using DeterministicPolicy objects elsewhere
        self.featureList=None
        if self.modelType=='tree':
            self.tree=None
        else:
            self.policyParams=None

        #These members are set by the predictAll(...) method
        self.savedRankingsSize=None
        self.savedRankings=None
        print("DeterministicPolicy:init [INFO] Dataset", dataset.name, flush=True)

    #feature_list: list[int],length=unmaskedFeatures; List of features that should be used for training
    #name: (str) String to help identify this DeterministicPolicy object henceforth
    def train(self, feature_list, name):
        self.featureList=feature_list
        self.name=name+'-'+self.modelType
        modelFile=DATA_DIR+self.dataset.name+'_'+self.name
        if 'alpha' not in self.hyperParams:
            #Expecting hyper-params for GBRT; add those hyper-params to the model file name
            modelFile=modelFile+'ensemble-'+str(self.hyperParams['ensemble'])+\
                    '_lr-'+str(self.hyperParams['lr'])+'_subsample-'+str(self.hyperParams['subsample'])+\
                    '_leaves-'+str(self.hyperParams['leaves'])
        if self.modelType=='tree' or self.modelType=='gbrt':
            modelFile+='.z'
        else:
            modelFile+='.npz'

        self.savedRankingsSize=None
        self.savedRankings=None

        if os.path.exists(modelFile):
            if self.modelType=='tree' or self.modelType=='gbrt':
                self.tree=joblib.load(modelFile)
                print("DeterministicPolicy:train [INFO] Using precomputed policy", modelFile, flush=True)
            else:
                with numpy.load(modelFile) as npFile:
                    self.policyParams=npFile['policyParams']
                print("DeterministicPolicy:train [INFO] Using precomputed policy", modelFile, flush=True)
                print("DeterministicPolicy:train [INFO] PolicyParams", self.policyParams, flush=True)
        else:
            numQueries=len(self.dataset.features)

            allFeatures=None
            allTargets=None
            print("DeterministicPolicy:train [INFO] Constructing features and targets", flush=True)
            if self.dataset.mask is None:
                allFeatures=scipy.sparse.vstack(self.dataset.features, format='csc')
                allTargets=numpy.hstack(self.dataset.relevances)
            else:
                temporaryFeatures=[]
                temporaryTargets=[]
                for currentQuery in range(numQueries):
                    temporaryFeatures.append(self.dataset.features[currentQuery][self.dataset.mask[currentQuery], :])
                    temporaryTargets.append(self.dataset.relevances[currentQuery][self.dataset.mask[currentQuery]])
                allFeatures=scipy.sparse.vstack(temporaryFeatures, format='csc')
                allTargets=numpy.hstack(temporaryTargets)

            if self.regressGains:
                allTargets=numpy.exp2(allTargets)-1.0

            allSampleWeights=None
            fitParams=None
            if self.weighted:
                allSampleWeights=numpy.array(self.dataset.docsPerQuery, dtype=numpy.float64)
                allSampleWeights=numpy.reciprocal(allSampleWeights)
                allSampleWeights=numpy.repeat(allSampleWeights, self.dataset.docsPerQuery)
                fitParams={'sample_weight': allSampleWeights}

            #Restrict features to only the unmasked features
            if self.featureList is not None:
                print("DeterministicPolicy:train [INFO] Masking unused features. Remaining feature size",
                        len(feature_list), flush=True)
                allFeatures=allFeatures[:, self.featureList]

            print("DeterministicPolicy:train [INFO] Beginning training", self.modelType, flush=True)
            if self.modelType=='tree':
                treeCV=sklearn.model_selection.GridSearchCV(sklearn.tree.DecisionTreeRegressor(criterion="mse",
                        splitter="random", min_samples_split=4, min_samples_leaf=4, presort=False),
                        param_grid=self.treeDepths, scoring=None, fit_params=fitParams, n_jobs=-2,
                        iid=True, cv=5, refit=True, verbose=0, pre_dispatch="1*n_jobs",
                        error_score='raise', return_train_score=False)
                treeCV.fit(allFeatures, allTargets)
                self.tree=treeCV.best_estimator_
                print("DeterministicPolicy:train [INFO] Done. Best depth",
                        treeCV.best_params_['max_depth'], flush=True)
Best depth", treeCV.best_params_['max_depth'], flush=True) joblib.dump(self.tree, modelFile, compress=9, protocol=-1) elif self.modelType=='lasso': lassoCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Lasso(fit_intercept=False, normalize=False, precompute=False, copy_X=False, max_iter=3000, tol=1e-4, warm_start=False, positive=False, random_state=None, selection='random'), param_grid=self.hyperParams, scoring=None, fit_params=fitParams, n_jobs=-2, iid=True, cv=5, refit=True, verbose=0, pre_dispatch="1*n_jobs", error_score='raise', return_train_score=False) lassoCV.fit(allFeatures, allTargets) self.policyParams=lassoCV.best_estimator_.coef_ print("DeterministicPolicy:train [INFO] Done. CVAlpha", lassoCV.best_params_['alpha'], flush=True) print("DeterministicPolicy:train [INFO] PolicyParams", self.policyParams,flush=True) numpy.savez_compressed(modelFile, policyParams=self.policyParams) elif self.modelType == 'ridge': ridgeCV=sklearn.model_selection.GridSearchCV(sklearn.linear_model.Ridge(fit_intercept=False, normalize=False, copy_X=False, max_iter=3000, tol=1e-4, random_state=None), param_grid=self.hyperParams, n_jobs=-2, fit_params=fitParams, iid=True, cv=3, refit=True, verbose=0, pre_dispatch='1*n_jobs') ridgeCV.fit(allFeatures, allTargets) self.policyParams=ridgeCV.best_estimator_.coef_ print("DeterministicPolicy:train [INFO] Done. CVAlpha", ridgeCV.best_params_['alpha'], flush=True) elif self.modelType=='gbrt': tree=sklearn.ensemble.GradientBoostingRegressor(learning_rate=self.hyperParams['lr'], n_estimators=self.hyperParams['ensemble'], subsample=self.hyperParams['subsample'], max_leaf_nodes=self.hyperParams['leaves'], max_features=1.0, presort=False) tree.fit(allFeatures, allTargets, sample_weight=allSampleWeights) self.tree=tree print("DeterministicPolicy:train [INFO] Done.", flush=True) joblib.dump(self.tree, modelFile, compress=9, protocol=-1) else: print("DeterministicPolicy:train [ERR] %s not supported." % self.modelType, flush=True) sys.exit(0) #query_id: (int) Query ID in self.dataset #ranking_size: (int) Size of ranking. Returned ranking length is min(ranking_size,docsPerQuery[query_id]) # Use ranking_size=-1 to rank all available documents for query_id def predict(self, query_id, ranking_size): if self.savedRankingsSize is not None and self.savedRankingsSize==ranking_size: return self.savedRankings[query_id] allowedDocs=self.dataset.docsPerQuery[query_id] validDocs=ranking_size if ranking_size <= 0 or validDocs > allowedDocs: validDocs=allowedDocs currentFeatures=None if self.dataset.mask is None: if self.featureList is not None: currentFeatures=self.dataset.features[query_id][:, self.featureList] else: currentFeatures=self.dataset.features[query_id] else: currentFeatures=self.dataset.features[query_id][self.dataset.mask[query_id], :] if self.featureList is not None: currentFeatures=currentFeatures[:, self.featureList] allDocScores=None if self.modelType=='tree': allDocScores=self.tree.predict(currentFeatures) elif self.modelType=='gbrt': allDocScores=self.tree.predict(currentFeatures.toarray()) else: allDocScores=currentFeatures.dot(self.policyParams) tieBreaker=numpy.random.random(allDocScores.size) sortedDocScores=numpy.lexsort((tieBreaker,-allDocScores))[0:validDocs] if self.dataset.mask is None: return sortedDocScores else: return self.dataset.mask[query_id][sortedDocScores] #ranking_size: (int) Size of ranking. 
    #ranking_size: (int) Size of ranking. Returned ranking length is min(ranking_size,docsPerQuery[query_id])
    #       Use ranking_size=-1 to rank all available documents for query_id
    def predictAll(self, ranking_size):
        if self.savedRankingsSize is not None and self.savedRankingsSize==ranking_size:
            return

        numQueries=len(self.dataset.features)
        predictedRankings=[]
        for i in range(numQueries):
            predictedRankings.append(self.predict(i, ranking_size))
            if i%100==0:
                print(".", end="", flush=True)

        self.savedRankingsSize=ranking_size
        self.savedRankings=predictedRankings
        print("", flush=True)
        print("DeterministicPolicy:predictAll [INFO] Generated all predictions for %s using policy: " %
                self.dataset.name, self.name, flush=True)

    #num_allowed_docs: (int) Filters the dataset so that the max docs per query is num_allowed_docs.
    #       Uses policyParams to rank and filter the original document set.
    def filterDataset(self, num_allowed_docs):
        self.savedRankingsSize=None
        self.savedRankings=None

        numQueries=len(self.dataset.docsPerQuery)
        self.dataset.name=self.dataset.name+'-filt('+self.name+'-'+str(num_allowed_docs)+')'

        newMask=[]
        for i in range(numQueries):
            producedRanking=self.predict(i, num_allowed_docs)
            self.dataset.docsPerQuery[i]=numpy.shape(producedRanking)[0]
            newMask.append(producedRanking)
            if i%100==0:
                print(".", end="", flush=True)

        self.dataset.mask=newMask
        print("", flush=True)
        print("DeterministicPolicy:filterDataset [INFO] New Name", self.dataset.name,
                "\t MaxNumDocs", num_allowed_docs, flush=True)
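
#Illustrative sketch (not part of the original pipeline, never invoked):
#after filterDataset(m), dataset.mask[q] holds the (up to m) original document
#ids the filtering policy kept for query q; rankings over the filtered
#document set are expressed in original ids by indexing into this mask.
def _demo_mask_remap():
    mask_q=numpy.array([7, 3, 9])        # kept original doc ids for one query
    localRanking=numpy.array([2, 0])     # ranking over filtered positions
    return mask_q[localRanking]          # -> [9, 7] in original doc ids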
Size", len(self.gammas), flush=True) def predict(self, query_id, ranking_size): allowedDocs=self.dataset.docsPerQuery[query_id] validDocs=ranking_size if ranking_size < 0 or ((not self.allowRepetitions) and (validDocs > allowedDocs)): validDocs=allowedDocs producedRanking=None if self.allowRepetitions: producedRanking=numpy.random.choice(allowedDocs, size=validDocs, replace=True) else: producedRanking=numpy.random.choice(allowedDocs, size=validDocs, replace=False) if self.dataset.mask is None: return producedRanking else: return self.dataset.mask[query_id][producedRanking] class NonUniformPolicy(Policy): def __init__(self, deterministic_policy, dataset, allow_repetitions, decay): Policy.__init__(self, dataset, allow_repetitions) self.decay = decay self.policy = deterministic_policy self.name='NonUnif-' if allow_repetitions: self.name+='Rep' else: self.name+='NoRep' self.name += '(' + deterministic_policy.name + ';' + str(decay) + ')' #These members are set on-demand by setupGamma self.gammas=None self.multinomials=None self.gammaRankingSize=None print("NonUniformPolicy:init [INFO] Dataset: %s AllowRepetitions:" % dataset.name, allow_repetitions, "\t Decay:", decay, flush=True) def setupGamma(self, ranking_size): if self.gammaRankingSize is not None and self.gammaRankingSize==ranking_size: print("NonUniformPolicy:setupGamma [INFO] Gamma has been pre-computed for this ranking_size. Size of Gamma cache:", len(self.gammas), flush=True) return gammaFile=Settings.DATA_DIR+self.dataset.name+'_'+self.name+'_'+str(ranking_size)+'.z' if os.path.exists(gammaFile): self.gammas, self.multinomials=joblib.load(gammaFile) self.gammaRankingSize=ranking_size print("NonUniformPolicy:setupGamma [INFO] Using precomputed gamma", gammaFile, flush=True) else: self.gammas={} self.multinomials={} self.gammaRankingSize=ranking_size candidateSet=set(self.dataset.docsPerQuery) responses=joblib.Parallel(n_jobs=-2, verbose=50)(joblib.delayed(NonUniformGamma)(i, self.decay, ranking_size, self.allowRepetitions) for i in candidateSet) for tup in responses: self.gammas[tup[0]]=tup[2] self.multinomials[tup[0]]=tup[1] joblib.dump((self.gammas, self.multinomials), gammaFile, compress=9, protocol=-1) print("", flush=True) print("NonUniformPolicy:setupGamma [INFO] Finished creating Gamma_pinv cache. 
Size", len(self.gammas), flush=True) self.policy.predictAll(-1) def predict(self, query_id, ranking_size): allowedDocs=self.dataset.docsPerQuery[query_id] underlyingRanking=self.policy.predict(query_id, -1) validDocs=ranking_size if ranking_size < 0 or ((not self.allowRepetitions) and (validDocs > allowedDocs)): validDocs=allowedDocs currentDistribution=self.multinomials[allowedDocs] producedRanking=None if self.allowRepetitions: producedRanking=numpy.random.choice(allowedDocs, size=validDocs, replace=True, p=currentDistribution) else: producedRanking=numpy.random.choice(allowedDocs, size=validDocs, replace=False, p=currentDistribution) return underlyingRanking[producedRanking] M=100 L=10 resetSeed=387 mslrData=Datasets.Datasets() mslrData.loadNpz(Settings.DATA_DIR+'MSLR/mslr') anchorURLFeatures, bodyTitleDocFeatures=Settings.get_feature_sets("MSLR") numpy.random.seed(resetSeed) detLogger=DeterministicPolicy(mslrData, 'tree') detLogger.train(anchorURLFeatures, 'url') detLogger.filterDataset(M) filteredDataset=detLogger.dataset del mslrData del detLogger uniform=UniformPolicy(filteredDataset, False) uniform.setupGamma(L) del uniform numpy.random.seed(resetSeed) loggingPolicyTree=DeterministicPolicy(filteredDataset, 'tree') loggingPolicyTree.train(anchorURLFeatures, 'url') numpy.random.seed(resetSeed) targetPolicyTree=DeterministicPolicy(filteredDataset, 'tree') targetPolicyTree.train(bodyTitleDocFeatures, 'body') numpy.random.seed(resetSeed) loggingPolicyLinear=DeterministicPolicy(filteredDataset, 'lasso') loggingPolicyLinear.train(anchorURLFeatures, 'url') numpy.random.seed(resetSeed) targetPolicyLinear=DeterministicPolicy(filteredDataset, 'lasso') targetPolicyLinear.train(bodyTitleDocFeatures, 'body') numQueries=len(filteredDataset.docsPerQuery) TTtau=[] TToverlap=[] TLtau=[] TLoverlap=[] LTtau=[] LToverlap=[] LLtau=[] LLoverlap=[] LogLogtau=[] LogLogoverlap=[] TargetTargettau=[] TargetTargetoverlap=[] def computeTau(ranking1, ranking2): rank1set=set(ranking1) rank2set=set(ranking2) documents=rank1set | rank2set rankingSize=len(rank1set) newRanking1=numpy.zeros(len(documents), dtype=numpy.int) newRanking2=numpy.zeros(len(documents), dtype=numpy.int) for docID, doc in enumerate(documents): if doc not in rank1set: newRanking1[docID]=rankingSize + 1 newRanking2[docID]=ranking2.index(doc) elif doc not in rank2set: newRanking2[docID]=rankingSize + 1 newRanking1[docID]=ranking1.index(doc) else: newRanking1[docID]=ranking1.index(doc) newRanking2[docID]=ranking2.index(doc) return scipy.stats.kendalltau(newRanking1, newRanking2)[0], 1.0*len(rank1set&rank2set)/rankingSize numpy.random.seed(resetSeed) for currentQuery in range(numQueries): if filteredDataset.docsPerQuery[currentQuery]<4: continue logTreeRanking=loggingPolicyTree.predict(currentQuery, L).tolist() logLinearRanking=loggingPolicyLinear.predict(currentQuery, L).tolist() targetTreeRanking=targetPolicyTree.predict(currentQuery, L).tolist() targetLinearRanking=targetPolicyLinear.predict(currentQuery, L).tolist() tau, overlap=computeTau(logTreeRanking, targetTreeRanking) TTtau.append(tau) TToverlap.append(overlap) tau, overlap=computeTau(logTreeRanking, targetLinearRanking) TLtau.append(tau) TLoverlap.append(overlap) tau, overlap=computeTau(logLinearRanking, targetTreeRanking) LTtau.append(tau) LToverlap.append(overlap) tau, overlap=computeTau(logLinearRanking, targetLinearRanking) LLtau.append(tau) LLoverlap.append(overlap) tau, overlap=computeTau(logLinearRanking, logTreeRanking) LogLogtau.append(tau) LogLogoverlap.append(overlap) tau, 

numpy.random.seed(resetSeed)
for currentQuery in range(numQueries):
    if filteredDataset.docsPerQuery[currentQuery]<4:
        continue

    logTreeRanking=loggingPolicyTree.predict(currentQuery, L).tolist()
    logLinearRanking=loggingPolicyLinear.predict(currentQuery, L).tolist()

    targetTreeRanking=targetPolicyTree.predict(currentQuery, L).tolist()
    targetLinearRanking=targetPolicyLinear.predict(currentQuery, L).tolist()

    tau, overlap=computeTau(logTreeRanking, targetTreeRanking)
    TTtau.append(tau)
    TToverlap.append(overlap)

    tau, overlap=computeTau(logTreeRanking, targetLinearRanking)
    TLtau.append(tau)
    TLoverlap.append(overlap)

    tau, overlap=computeTau(logLinearRanking, targetTreeRanking)
    LTtau.append(tau)
    LToverlap.append(overlap)

    tau, overlap=computeTau(logLinearRanking, targetLinearRanking)
    LLtau.append(tau)
    LLoverlap.append(overlap)

    tau, overlap=computeTau(logLinearRanking, logTreeRanking)
    LogLogtau.append(tau)
    LogLogoverlap.append(overlap)

    tau, overlap=computeTau(targetLinearRanking, targetTreeRanking)
    TargetTargettau.append(tau)
    TargetTargetoverlap.append(overlap)

    if len(TTtau) % 100 == 0:
        print(".", end="", flush=True)

TTtau=numpy.array(TTtau)
TLtau=numpy.array(TLtau)
LTtau=numpy.array(LTtau)
LLtau=numpy.array(LLtau)
LogLogtau=numpy.array(LogLogtau)
TargetTargettau=numpy.array(TargetTargettau)

TToverlap=numpy.array(TToverlap)
TLoverlap=numpy.array(TLoverlap)
LToverlap=numpy.array(LToverlap)
LLoverlap=numpy.array(LLoverlap)
LogLogoverlap=numpy.array(LogLogoverlap)
TargetTargetoverlap=numpy.array(TargetTargetoverlap)

print("", flush=True)
print("TTtau", numpy.amax(TTtau), numpy.amin(TTtau), numpy.mean(TTtau), numpy.std(TTtau),
        numpy.median(TTtau), len(numpy.where(TTtau > 0.99)[0]))
print("TToverlap", numpy.amax(TToverlap), numpy.amin(TToverlap), numpy.mean(TToverlap),
        numpy.std(TToverlap), numpy.median(TToverlap), len(numpy.where(TToverlap > 0.99)[0]))
print("TLtau", numpy.amax(TLtau), numpy.amin(TLtau), numpy.mean(TLtau), numpy.std(TLtau),
        numpy.median(TLtau), len(numpy.where(TLtau > 0.99)[0]))
print("TLoverlap", numpy.amax(TLoverlap), numpy.amin(TLoverlap), numpy.mean(TLoverlap),
        numpy.std(TLoverlap), numpy.median(TLoverlap), len(numpy.where(TLoverlap > 0.99)[0]))
print("LTtau", numpy.amax(LTtau), numpy.amin(LTtau), numpy.mean(LTtau), numpy.std(LTtau),
        numpy.median(LTtau), len(numpy.where(LTtau > 0.99)[0]))
print("LToverlap", numpy.amax(LToverlap), numpy.amin(LToverlap), numpy.mean(LToverlap),
        numpy.std(LToverlap), numpy.median(LToverlap), len(numpy.where(LToverlap > 0.99)[0]))
print("LLtau", numpy.amax(LLtau), numpy.amin(LLtau), numpy.mean(LLtau), numpy.std(LLtau),
        numpy.median(LLtau), len(numpy.where(LLtau > 0.99)[0]))
print("LLoverlap", numpy.amax(LLoverlap), numpy.amin(LLoverlap), numpy.mean(LLoverlap),
        numpy.std(LLoverlap), numpy.median(LLoverlap), len(numpy.where(LLoverlap > 0.99)[0]))
print("LogLogtau", numpy.amax(LogLogtau), numpy.amin(LogLogtau), numpy.mean(LogLogtau),
        numpy.std(LogLogtau), numpy.median(LogLogtau), len(numpy.where(LogLogtau > 0.99)[0]))
print("LogLogoverlap", numpy.amax(LogLogoverlap), numpy.amin(LogLogoverlap), numpy.mean(LogLogoverlap),
        numpy.std(LogLogoverlap), numpy.median(LogLogoverlap), len(numpy.where(LogLogoverlap > 0.99)[0]))
print("TargetTargettau", numpy.amax(TargetTargettau), numpy.amin(TargetTargettau),
        numpy.mean(TargetTargettau), numpy.std(TargetTargettau), numpy.median(TargetTargettau),
        len(numpy.where(TargetTargettau > 0.99)[0]))
print("TargetTargetoverlap", numpy.amax(TargetTargetoverlap), numpy.amin(TargetTargetoverlap),
        numpy.mean(TargetTargetoverlap), numpy.std(TargetTargetoverlap),
        numpy.median(TargetTargetoverlap), len(numpy.where(TargetTargetoverlap > 0.99)[0]))