lines = open('../data/lemons_description.txt')
for line in lines:
    print line.strip()

import pandas as pd
from sklearn import tree
from sklearn.cross_validation import cross_val_score

# Load in the data and create the train and out-of-sample sets,
# dropping any columns with NAs from the live data set.
lemons = pd.read_csv('../data/lemons.csv')
lemons_oos = pd.read_csv('../data/lemons_oos.csv')
print lemons.dtypes
lemons = lemons.dropna(axis=1)

# Generate a list of continuous features from the describe dataframe,
# then remove the two non-features (RefId is an index; IsBadBuy is the prediction target).
features = list(lemons.describe().columns)
features.remove('RefId')
features.remove('IsBadBuy')

# Search over tree depth, keeping the depth with the best mean cross-validated ROC AUC.
best_score = -1
for depth in range(1, 10):
    scores = cross_val_score(tree.DecisionTreeClassifier(max_depth=depth, random_state=1234),
                             lemons[features],
                             lemons.IsBadBuy,
                             scoring='roc_auc',
                             cv=5)
    if scores.mean() > best_score:
        best_depth = depth
        best_score = scores.mean()

# Is the best score we have better than each DummyClassifier type?
from sklearn import dummy, metrics

for strat in ['stratified', 'most_frequent', 'uniform']:
    dummyclf = dummy.DummyClassifier(strategy=strat).fit(lemons[features], lemons.IsBadBuy)
    print 'did better than %s?' % strat, \
        metrics.roc_auc_score(lemons.IsBadBuy, dummyclf.predict(lemons[features])) < best_score
# seems so!

# Create a classifier at the best depth found above and predict on the out-of-sample set.
clf = tree.DecisionTreeClassifier(max_depth=best_depth, random_state=1234).fit(lemons[features], lemons.IsBadBuy)
y_pred = clf.predict(lemons_oos[features])

# Create a submission
submission = pd.DataFrame({
    'RefId': lemons_oos.RefId,
    'prediction': y_pred,
})
submission.to_csv('submission.csv')

# __future__ imports have to come before any other imports.
from __future__ import division
import numpy as np


class Transformations(object):
    """since these transformations are all related, we'll nest them all under a feature norm class"""

    def mean_at_zero(self, arr):
        return np.array([i - np.mean(arr) for i in arr])

    def norm_to_min_zero(self, arr):
        return np.array([i / max(arr) for i in arr])

    def norm_to_absolute_min_zero(self, arr):
        """should be a range of 0 to 1, where 0 maintains its 0 value"""
        # min-max scaling, matching the expected values in the tests below
        return np.array([(i - min(arr)) / (max(arr) - min(arr)) for i in arr])

    def norm_to_neg_pos(self, arr):
        """should be a range of -1 to 1, where 0 represents the mean"""
        # center on the mean, then divide by the largest absolute deviation
        centered = self.mean_at_zero(arr)
        return centered / max(abs(centered))

    def norm_by_std(self, arr):
        """should be a range where 0 represents the mean"""
        # z-score scaling with the population standard deviation
        return self.mean_at_zero(arr) / np.std(arr)


## tests to make sure we built this correctly:
transformer = Transformations()
a = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print transformer.mean_at_zero(a) == np.array([-2, -1, 0, 1, 2])
print transformer.norm_to_min_zero(a) == np.array([0.2, 0.4, 0.6, 0.8, 1.0])
print transformer.norm_to_absolute_min_zero(a) == np.array([0.0, 0.25, 0.5, 0.75, 1.0])
print transformer.norm_to_neg_pos(a) == np.array([-1.0, -0.5, 0.0, 0.5, 1.0])
print transformer.norm_by_std(a) == np.array([-1.414213562373095, -0.7071067811865475, 0.0,
                                              0.7071067811865475, 1.414213562373095])

from sklearn import preprocessing

print a
print preprocessing.scale(a, with_mean=True, with_std=False)
print preprocessing.scale(a, with_mean=True, with_std=True)
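# Optional cross-check (a sketch, not part of the original exercise): the hand-rolled
# norm_to_absolute_min_zero should line up with sklearn's built-in MinMaxScaler, just as
# mean_at_zero and norm_by_std line up with preprocessing.scale above. MinMaxScaler
# expects a 2-d array, hence the reshape.
print preprocessing.MinMaxScaler().fit_transform(a.reshape(-1, 1)).ravel()
print transformer.norm_to_absolute_min_zero(a)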
%matplotlib inline
import scipy as sp
import scipy.optimize  # sp.optimize is not usable until the submodule is imported
import matplotlib.pyplot as plt
import seaborn as sns


class PowerLaw(object):

    def fit(self, x, y, transform=True):
        """
        returns back the amplitude and index of a powerlaw relationship.
        assumes the data is not already log10 transformed.

        return: [amp, index], also stored on the instance
        """
        if transform:
            x = np.log10(x)
            y = np.log10(y)

        # define our (line) fitting function and error function to optimize on
        fitfunc = lambda p, x: p[0] + p[1] * x
        errfunc = lambda p, x, y: (y - fitfunc(p, x))

        # defines a starting point to optimize from.
        p_init = [1.0, -1.0]
        out = sp.optimize.leastsq(errfunc, p_init, args=(x, y), full_output=1)

        result = out[0]
        self.index = result[1]
        self.amp = 10.0 ** result[0]

        return np.array([self.amp, self.index])

    def transform(self, x):
        """returns the x-transformed data"""
        return self.amp * (x ** self.index)


xdata = np.array([0.00010851, 0.00021701, 0.00043403, 0.00086806, 0.00173611, 0.00347222])
ydata = np.array([29.56241016, 29.82245508, 25.33930469, 19.97075977, 12.61276074, 7.12695312])

powerlaw = PowerLaw()
powerlaw.fit(xdata, ydata)
print 'amp:', powerlaw.amp, 'index:', powerlaw.index

sns.set_style('white')
plt.figure()

plt.subplot(2, 1, 1)
plt.plot(xdata, powerlaw.transform(xdata))
plt.plot(xdata, ydata)
plt.text(0.0020, 30, 'Ampli = %5.2f' % powerlaw.amp)
plt.text(0.0020, 25, 'Index = %5.2f' % powerlaw.index)
plt.xlabel('X')
plt.ylabel('Y')

plt.subplot(2, 1, 2)
plt.loglog(xdata, powerlaw.transform(xdata))
plt.plot(xdata, ydata)
plt.xlabel('X (log scale)')
plt.ylabel('Y (log scale)')

print lemons.groupby('Auction').Auction.count()
print lemons.groupby('Auction').IsBadBuy.mean()

# seems like the ADESA auction is notably worse for bad buys (about 36% more)
# it may help to create a new column that specifically refers to "is_adesa"
lemons['auct_adesa'] = lemons.Auction.apply(lambda x: 1 if x == 'ADESA' else 0)
print lemons.groupby('auct_adesa').IsBadBuy.mean()

print plt.hist(lemons.VehicleAge)
print lemons.groupby('VehicleAge').IsBadBuy.mean()

# there seems to be a stronger relationship with bad buys as vehicles get older.
# is there anything we should do here? (one possible follow-up is sketched below)
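# One possible follow-up (a sketch, not part of the original exercise): VehicleAge is
# numeric, so it should already be in `features`, but the new auct_adesa indicator is not.
# Appending it and re-running the same cross-validated tree at best_depth shows whether
# the extra column moves the mean ROC AUC at all. `features_plus` and `scores_plus` are
# names introduced here just for this check.
features_plus = features + ['auct_adesa']
scores_plus = cross_val_score(tree.DecisionTreeClassifier(max_depth=best_depth, random_state=1234),
                              lemons[features_plus],
                              lemons.IsBadBuy,
                              scoring='roc_auc',
                              cv=5)
print 'baseline AUC: %0.4f, with auct_adesa: %0.4f' % (best_score, scores_plus.mean())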