%matplotlib inline

# standard path wrangling to make notebook reproducible in dev and prod
import sys
from os.path import dirname, abspath
from os import getcwd
try:
    # Running from a file: the project root is the parent of the directory
    # that contains this file.
    root = dirname(dirname(abspath(__file__)))
except NameError:
    # __file__ is not defined (e.g. an interactive/notebook session), so fall
    # back to the parent of the current working directory.
    root = dirname(getcwd())
# Only append when missing so that re-running this cell does not pile up
# duplicate entries on sys.path.
if root not in sys.path:
    sys.path.append(root)
from lib import auto_impala

# imports specific to this notebook
from impala.util import as_pandas
import brewer2mpl
import math
from scipy.stats import chisquare
import pandas as pd
import brewer2mpl

import lib.display_utils

# Load the SQL text for the first-digit query through lib.display_utils.
query = lib.display_utils.sql_query_from_file('examples/first_digits.sql')
# Bare expression; presumably the end of a notebook cell that displayed the
# query text -- it has no effect when run as a plain script.
query

# Execute the query on a cursor obtained from auto_impala() and load the
# result set into a pandas DataFrame named df.
with auto_impala() as cursor:
    cursor.execute(query)
    df = as_pandas(cursor)

# Clean up the dataframe: drop rows containing nulls, then index the rows by
# first digit in ascending order.  Sorting is done on the index with
# sort_index() because DataFrame.sort() was deprecated in pandas 0.17 and
# removed in 0.20, whereas sort_index() is available in old and new releases.
# NOTE(review): assumes one row per first digit, so sorting before or after
# set_index yields the same row order -- confirm against the SQL.
df = df.dropna().set_index('first digit').sort_index()

def benfordp(digit):
    """Return the Benford's-law probability of ``digit`` as a leading digit.

    The value is log10(1 + 1/digit).  A ``digit`` of 0 raises
    ZeroDivisionError because of the division.
    """
    ratio = 1. + (1. / digit)
    return math.log10(ratio)

# Turn raw counts into observed rates by dividing every column by the total of
# the first column.  The total is picked with .iloc[0] (explicit positional
# access); plain [0] on the label-indexed result of df.sum() relies on a
# positional fallback that recent pandas releases deprecate.
scaled_df = df / df.sum().iloc[0]
scaled_df.rename(columns={'count': 'True Rate'}, inplace=True)

# Expected rate for each first digit under Benford's law, aligned on the same
# index as the observed rates.  A list comprehension is used instead of map()
# so that pandas always receives a concrete list (map() is lazy on Python 3).
# NOTE(review): assumes the index holds non-zero numeric digits -- benfordp
# divides by the digit; confirm against the SQL.
benford_series = pd.Series([benfordp(digit) for digit in scaled_df.index],
                           index=scaled_df.index)
benford_df = pd.DataFrame(benford_series, columns=['Benford Rate'])


# Expected and observed rates side by side, one row per first digit.
joined_df = pd.concat([benford_df, scaled_df], axis=1)
joined_df

# Bar colours come from brewer2mpl's Wes Anderson "Moonrise1" palette.
colors = brewer2mpl.wesanderson.Moonrise1.mpl_colors

# Grouped bar chart comparing the columns of joined_df for every first digit.
joined_df.plot(
    kind='bar',
    figsize=(14, 10),
    title="Charge 1st Digits vs Benford's Law",
    color=colors,
)

# Chi-squared goodness-of-fit test of the observed counts against the counts
# expected under Benford's law (Benford rate scaled by the total of the first
# column of df).  The total is read with .iloc[0] because plain [0] on the
# label-indexed result of df.sum() relies on a positional fallback that recent
# pandas releases deprecate.
chi2_score, p_value = chisquare(df['count'], benford_series * df.sum().iloc[0])
p_value