%pylab inline
import json, re
from collections import defaultdict
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc
dfs = {}
for name in ['train', 'test']:
    df = pd.read_json('../data/%s.json' % name)
    df['_data'] = name
    dfs[name] = df
# combine train and test data into one df
df = dfs['train'].append(dfs['test'])
df = df.reset_index(drop=True)
# limit to shared columns (plus predictor)
cols = list(dfs['test'].columns) + ['requester_received_pizza']
df = df[cols]
# rename a few columns to be pithier
df.rename(columns={
    'request_title': 'title',
    'request_text_edit_aware': 'body',
    'requester_upvotes_minus_downvotes_at_request': 'karma',
    'requester_number_of_posts_at_request': 'prior_posts',
    'requester_number_of_posts_on_raop_at_request': 'prior_raop_posts',
    'requester_account_age_in_days_at_request': 'requester_age',
    'unix_timestamp_of_request_utc': 'epoch',
    'requester_received_pizza': 'got_pizza',
}, inplace=True)
# convert got pizza indicator to ints
df['got_pizza'] = df['got_pizza'].apply(lambda x: -1 if pd.isnull(x) else int(x))
df.iloc[0]
# clean up text field (lowercase, letters only)
def clean_txt(raw, remove_stop=False):
    # remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    # convert to lower case, split into individual words
    words = letters_only.lower().split()
    # optionally drop English stopwords
    if remove_stop:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # join cleaned words back into a single string
    return " ".join(words)
# combine title and body columns, then clean
df['txt_raw'] = df['title'] + ' ' + df['body']
df['txt_clean'] = df['txt_raw'].apply(clean_txt)
# check that it worked
for col in ['txt_raw', 'txt_clean']:
    print df.iloc[0][col]
    print '--'
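# (illustrative sketch) clean_txt also accepts remove_stop=True to drop English
# stopwords; that variant isn't used for the features below, but assuming the
# NLTK stopwords corpus is downloaded it would look like this on the first row:
print clean_txt(df.iloc[0]['txt_raw'], remove_stop=True)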
# temporal features
dt = pd.to_datetime(df['epoch'], unit='s')
dt = pd.DatetimeIndex(dt)
df['date'] = dt.date
df['day'] = dt.day
df['month'] = dt.month
df['dow'] = dt.dayofweek
df['community_age'] = (dt - min(dt)).days.astype(int)
temporal_cols = [
    'day',
    'month',
    'community_age',
]
print df[['date'] + temporal_cols].head()
# status features
status_cols = [
    'karma',
    'prior_raop_posts',
    'prior_posts',
    'requester_age',
]
print df[status_cols].describe()
# narrative groupings from paper
# source: http://cs.stanford.edu/~althoff/raop-dataset/altruistic_requests_icwsm.pdf
# words are lowercase (and de-duplicated) so they match the cleaned text
# and aren't double-counted
narrative_cats = {
    'money': [
        'money', 'now', 'broke', 'week', 'until', 'time',
        'last', 'day', 'when', 'today', 'tonight', 'paid', 'next',
        'first', 'night', 'after', 'tomorrow', 'month', 'while',
        'account', 'before', 'long', 'friday', 'rent', 'buy',
        'bank', 'still', 'bills', 'ago', 'cash', 'due',
        'soon', 'past', 'never', 'paycheck', 'check', 'spent',
        'years', 'poor', 'till', 'yesterday', 'morning', 'dollars',
        'financial', 'hour', 'bill', 'evening', 'credit',
        'budget', 'loan', 'bucks', 'deposit', 'dollar', 'current',
        'payed',
    ],
    'job': [
        'work', 'job', 'paycheck', 'unemployment', 'interview',
        'fired', 'employment', 'hired', 'hire',
    ],
    'student': [
        'college', 'student', 'school', 'roommate',
        'studying', 'university', 'finals', 'semester',
        'class', 'study', 'project', 'dorm', 'tuition',
    ],
    'family': [
        'family', 'mom', 'wife', 'parents', 'mother', 'husband',
        'dad', 'son', 'daughter', 'father', 'parent',
        'mum',
    ],
    'craving': [
        'friend', 'girlfriend', 'craving', 'birthday',
        'boyfriend', 'celebrate', 'party', 'game', 'games',
        'movie', 'date', 'drunk', 'beer', 'celebrating', 'invited',
        'drinks', 'crave', 'wasted', 'invite',
    ],
}
# list of narrative category names
cat_list = sorted(narrative_cats.keys())
print 'cat list: %s\n' % cat_list
# create word to category mapping
word_to_cats = defaultdict(list)
for cat, words in narrative_cats.iteritems():
    for word in words:
        word_to_cats[word].append(cat)
word_to_cats = dict(word_to_cats)
# check that things are working
print 'checking word to category lookups:'
for word in ['university', 'parent', 'cash']:
    print '%s - categories: %s' % (
        word,
        word_to_cats.get(word, 'NONE')
    )
# loop through cleaned text and count occurrences
# of words in each narrative category
def categorize(words):
    cats = defaultdict(int)
    for word in words.split():
        matches = word_to_cats.get(word)
        if matches:
            for m in matches:
                cats[m] += 1
    return dict(cats)
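# quick spot check on a made-up sentence (illustrative; dict ordering may vary)
print categorize('broke until payday my daughter and i crave pizza')
# expect something like {'money': 2, 'family': 1, 'craving': 1}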
df['txt_cats'] = df['txt_clean'].apply(categorize)
# check that it worked
for i in range(3):
    ex = df.iloc[i]
    print ex['txt_clean']
    print ex['txt_cats']
    print '\n---\n'
# turn data dict into indiv columns (narrative features)
def to_freq(row, cat):
    cats, txt = row['txt_cats'], row['txt_clean']
    if cats.get(cat) > 0:
        return cats.get(cat) * 1.0 / len(txt.split())
    else:
        return 0

for cat in cat_list:
    df['narr_%s' % cat] = df.apply(lambda row: to_freq(row, cat), axis=1)
# assign variable to the list of these new cols
narrative_cols = [c for c in df.columns if c.startswith('narr_')]
# check that it worked
df[['txt_cats'] + narrative_cols].iloc[0]
# add a few more, potentially useful features
# has link
df['hyperlink'] = df['body'].apply(lambda x: 1 if re.search("http|www", x) else 0)
# character length of title + body fields
df['txt_chars'] = df['txt_clean'].apply(lambda x: len(x))
# politeness indicator
df['polite'] = df['txt_clean'].apply(lambda x: 1 if re.search("thank|appreciate|advance", x) else 0)
# reciprocity indicator
df['reciprocity'] = df['txt_clean'].apply(lambda x:
    1 if re.search("repay|pay.+back|pay.+forward|return.+favor", x)
    else 0)
# check their distributions
for col in ['polite', 'hyperlink', 'reciprocity']:
    print '%s: %s' % (
        col,
        df[col].value_counts().to_dict()
    )
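# (optional, illustrative) how these indicators line up with the outcome,
# using training rows only (test rows have got_pizza == -1)
train_rows = df[df['_data'] == 'train']
for col in ['polite', 'hyperlink', 'reciprocity']:
    print col, train_rows.groupby(col)['got_pizza'].mean().to_dict()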
# combine these new cols together
additional_cols = [
    'txt_chars',
    'polite',
    'hyperlink',
    'reciprocity',
]
# combine features (and check that things look good)
x_cols = temporal_cols + status_cols + narrative_cols + additional_cols
print x_cols
df[x_cols].head()
# set up framework to quickly iterate on
# different feature sets and algorithm params
def get_data():
    data = df[df['_data'] == 'train'].copy()
    return data

def prep_data(data, input_cols):
    X = data[input_cols].as_matrix()
    y = data['got_pizza'].astype(int).as_matrix()
    return X, y

def predict(input_cols, model_params=None):
    data = get_data()
    X, y = prep_data(data, input_cols)
    rando = 123
    Xr, Xt, yr, yt = train_test_split(X, y, random_state=rando)
    # copy params so the caller's dict (or the default) isn't mutated
    model_params = dict(model_params or {})
    model_params.update({
        'random_state': rando,
    })
    model = GradientBoostingClassifier(**model_params)
    model = model.fit(Xr, yr)
    ypred = model.predict_proba(Xt)[:, 1]
    fpr, tpr, thresholds = roc_curve(yt, ypred)
    auc_val = auc(fpr, tpr)
    return auc_val
# try out a few different feature sets + model params
# just narrative features
print predict(narrative_cols)
# just temporal features
print predict(temporal_cols)
# all features
print predict(x_cols)
# all features with more n_estimators
print predict(x_cols, {'n_estimators': 1000})
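# (optional sketch) peek at which features a quick fit leans on most;
# reuses the helpers above, and the numbers are indicative only
data = get_data()
X, y = prep_data(data, x_cols)
m = GradientBoostingClassifier(random_state=123).fit(X, y)
for col, imp in sorted(zip(x_cols, m.feature_importances_), key=lambda t: -t[1])[:10]:
    print '%s: %.3f' % (col, imp)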
# model parameter tuning
# (this takes a little while to run)
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.005, 0.01, 0.02],
    'max_depth': [2, 3, 4],
}
# build the full training matrices for the grid search
X_train, y_train = prep_data(get_data(), x_cols)
model = GradientBoostingClassifier(random_state=123)
grid_search = GridSearchCV(model, param_grid, cv=6, verbose=0, scoring='roc_auc')
grid_search.fit(X_train, y_train)
print grid_search.best_score_
print grid_search.best_params_
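# (optional) sanity-check the settings used for the final model below against
# the simple holdout AUC helper from earlier; this will differ from the CV score
print predict(x_cols, {'n_estimators': 500, 'learning_rate': 0.01, 'max_depth': 4})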
# finally, train classifier over entire training set
# with best params from grid search and save predictions
df_train = df[df['_data'] == 'train'].copy()
X_train = df_train[x_cols].as_matrix()
y_train = df_train['got_pizza'].astype(int).as_matrix()
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    random_state=123
)
model = model.fit(X_train, y_train)
df_test = df[df['_data'] == 'test'].copy()
X_test = df_test[x_cols].as_matrix()
ypred = model.predict_proba(X_test)[:, 1]
df_test['requester_received_pizza'] = ypred
final_df = df_test[['request_id', 'requester_received_pizza']]
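# quick sanity check on the submission frame before writing it out (illustrative)
print final_df.shape
print final_df.head()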
final_df.to_csv('../output/predicted.csv', index=False)
print 'boom.'