lines = open('../data/lemons_description.txt')
for line in lines:
    print line.strip()

import pandas as pd
from sklearn import tree
from sklearn.cross_validation import cross_val_score

# Load in the data and create the train and out-of-sample sets,
# dropping any columns with NAs from the live data set.
lemons = pd.read_csv('../data/lemons.csv')
lemons_oos = pd.read_csv('../data/lemons_oos.csv')
print lemons.dtypes
lemons = lemons.dropna(axis=1)

# Generate a list of continuous features from the describe dataframe,
# then remove the two non-features (RefId is an index; IsBadBuy is the prediction target).
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')

# Search over tree depth, keeping the depth with the best mean cross-validated ROC AUC.
best_score = -1
for depth in range(1, 10):
    scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),
                             lemons[features],
                             lemons.IsBadBuy,
                             scoring='roc_auc',
                             cv=5)
    if scores.mean() > best_score:
        best_depth = depth
        best_score = scores.mean()

# Is the best score we have better than each DummyClassifier type?
from sklearn import dummy, metrics

for strat in ['stratified', 'most_frequent', 'uniform']:
    dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)
    print 'did better than %s?' % strat, \
        metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score
# seems so!

# Create a classifier at the best depth found above and predict on the out-of-sample set.
clf = tree.DecisionTreeClassifier(max_depth=best_depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)
y_pred = clf.predict(lemons_oos[features])

# Create a submission
submission = pd.DataFrame({
    'RefId': lemons_oos.RefId,
    'prediction': y_pred,
})
submission.to_csv('submission.csv')

# __future__ imports have to come before any other imports.
from __future__ import division
import numpy as np


class Transformations(object):
    """since these transformations are all related, we'll nest them all under a feature norm class"""

    def mean_at_zero(self, arr):
        return np.array([i - np.mean(arr) for i in arr])

    def norm_to_min_zero(self, arr):
        return np.array([i / max(arr) for i in arr])

    def norm_to_absolute_min_zero(self, arr):
        """should be a range of 0 to 1, where 0 maintains its 0 value"""
        # min-max scaling, matching the expected values in the tests below
        return np.array([(i - min(arr)) / (max(arr) - min(arr)) for i in arr])

    def norm_to_neg_pos(self, arr):
        """should be a range of -1 to 1, where 0 represents the mean"""
        # center on the mean, then divide by the largest absolute deviation
        centered = self.mean_at_zero(arr)
        return centered / max(abs(centered))

    def norm_by_std(self, arr):
        """should be a range where 0 represents the mean"""
        # z-score scaling with the population standard deviation
        return self.mean_at_zero(arr) / np.std(arr)


## tests to make sure we built this correctly:
transformer = Transformations()
a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print transformer.mean_at_zero(a) == np.array([-2, -1, 0, 1, 2])
print transformer.norm_to_min_zero(a) == np.array([0.2, 0.4, 0.6, 0.8, 1.0])
print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])
print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0,
                                              0.7071067811865475, 1.414213562373095])

from sklearn import preprocessing

print a
print preprocessing.scale(a, with_mean=True, with_std=False)
print preprocessing.scale(a, with_mean=True, with_std=True)
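# Optional cross-check (a sketch, not part of the original exercise): the hand-rolled
# norm_to_absolute_min_zero should line up with sklearn's built-in MinMaxScaler, just as
# mean_at_zero and norm_by_std line up with preprocessing.scale above. MinMaxScaler
# expects a 2-d array, hence the reshape.
print preprocessing.MinMaxScaler().fit_transform(a.reshape(-1, 1)).ravel()
print transformer.norm_to_absolute_min_zero(a)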
%matplotlib inline
import scipy as sp
import scipy.optimize  # sp.optimize is not usable until the submodule is imported
import matplotlib.pyplot as plt
import seaborn as sns


class PowerLaw(object):

    def fit(self, x, y, transform=True):
        """
        returns back the amplitude and index of a powerlaw relationship.
        assumes the data is not already log10 transformed.

        return: [amp, index], also stored on the instance
        """
        if transform:
            x = np.log10(x)
            y = np.log10(y)

        # define our (line) fitting function and error function to optimize on
        fitfunc = lambda p, x: p[0] + p[1] * x
        errfunc = lambda p, x, y: (y - fitfunc(p, x))

        # defines a starting point to optimize from.
        p_init = [1.0, -1.0]
        out = sp.optimize.leastsq(errfunc, p_init, args=(x, y), full_output=1)

        result = out[0]
        self.index = result[1]
        self.amp = 10.0 ** result[0]

        return np.array([self.amp, self.index])

    def transform(self, x):
        """returns the x-transformed data"""
        return self.amp * (x ** self.index)


xdata = np.array([0.00010851, 0.00021701, 0.00043403, 0.00086806, 0.00173611, 0.00347222])
ydata = np.array([29.56241016, 29.82245508, 25.33930469, 19.97075977, 12.61276074, 7.12695312])

powerlaw = PowerLaw()
powerlaw.fit(xdata, ydata)
print 'amp:', powerlaw.amp, 'index:', powerlaw.index

sns.set_style('white')
plt.figure()

plt.subplot(2, 1, 1)
plt.plot(xdata, powerlaw.transform(xdata))
plt.plot(xdata, ydata)
plt.text(0.0020, 30, 'Ampli = %5.2f' % powerlaw.amp)
plt.text(0.0020, 25, 'Index = %5.2f' % powerlaw.index)
plt.xlabel('X')
plt.ylabel('Y')

plt.subplot(2, 1, 2)
plt.loglog(xdata, powerlaw.transform(xdata))
plt.plot(xdata, ydata)
plt.xlabel('X (log scale)')
plt.ylabel('Y (log scale)')

print lemons.groupby('Auction').Auction.count()
print lemons.groupby('Auction').IsBadBuy.mean()

# seems like the ADESA auction is notably worse for bad buys (about 36% more)
# it may help to create a new column that specifically refers to "is_adesa"
lemons['auct_adesa'] = lemons.Auction.apply(lambda x: 1 if x == 'ADESA' else 0)
print lemons.groupby('auct_adesa').IsBadBuy.mean()

print plt.hist(lemons.VehicleAge)
print lemons.groupby('VehicleAge').IsBadBuy.mean()

# there seems to be a stronger relationship with bad buys as vehicles get older.
# is there anything we should do here? (one possible follow-up is sketched below)
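# One possible follow-up (a sketch, not part of the original exercise): VehicleAge is
# numeric, so it should already be in `features`, but the new auct_adesa indicator is not.
# Appending it and re-running the same cross-validated tree at best_depth shows whether
# the extra column moves the mean ROC AUC at all. `features_plus` and `scores_plus` are
# names introduced here just for this check.
features_plus = features + ['auct_adesa']
scores_plus = cross_val_score(tree.DecisionTreeClassifier(max_depth=best_depth, random_state=1234),
                              lemons[features_plus],
                              lemons.IsBadBuy,
                              scoring='roc_auc',
                              cv=5)
print 'baseline AUC: %0.4f, with auct_adesa: %0.4f' % (best_score, scores_plus.mean())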