#!/usr/bin/env python
# coding: utf-8

# # U.S. Opiate Prescriptions/Overdoses
# 
# Accidental death by fatal drug overdose is a rising trend in the United States. What can you do to help?
# 
# This dataset contains:
# - summaries of prescription records for 250 common **opioid** and ***non-opioid*** drugs written by 25,000 unique licensed medical professionals in 2014 in the United States for citizens covered under Class D Medicare
# - Metadata about the doctors themselves. 
# - This data here is in a format with 1 row per prescriber and 25,000 unique prescribers down to 25,000 to keep it manageable. 
# - The main data is in `prescriber-info.csv`. 
# - There is also `opioids.csv` that contains the names of all opioid drugs included in the data 
# - There is the file `overdoses.csv` that contains information on opioid related drug overdose fatalities.
# 
# The increase in overdose fatalities is a well-known problem, and the search for possible solutions is an ongoing effort. This dataset is can be used to detect sources of significant quantities of opiate prescriptions. 
# 
# The data consists of the following characteristics for each prescriber
# 
# - NPI – unique National Provider Identifier number
# - Gender - (M/F)
# - State - U.S. State by abbreviation
# - Credentials - set of initials indicative of medical degree
# - Specialty - description of type of medicinal practice
# - A long list of drugs with numeric values indicating the total number of prescriptions written for the year by that individual
# - `Opioid.Prescriber` - a boolean label indicating whether or not that individual prescribed opiate drugs more than 10 times in the year.
# 

# ## Import our packages and data

# In[1]:


import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # so we can see the value of multiple statements at once.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


# In[2]:


opioids = pd.read_csv('opioids.csv')
pinfo = pd.read_csv('prescriber-info.csv')
overdoses = pd.read_csv('overdoses.csv')
opioids.head()
pinfo.head()
overdoses.head()


# In[3]:


overdoses['Deaths'].sum()


# ## Exploration and Initial Preprocessing

# In[4]:


overdoses['Deaths'] = overdoses['Deaths'].str.replace(',', '')
overdoses['Deaths'] = overdoses['Deaths'].astype(int)
overdoses['Population'] = overdoses['Population'].str.replace(',', '')
overdoses['Population'] = overdoses['Population'].astype(int)
overdoses['Deaths/Population'] = (overdoses['Deaths']/overdoses['Population'])
overdoses.head()


# In[5]:


overdoses_plot = overdoses.copy()
overdoses_plot.set_index('State', inplace=True)
overdoses_plot.head()


# In[71]:


from matplotlib import pyplot
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
fig = plt.figure()
ax = overdoses_plot[['Deaths/Population']].plot(kind='bar', title ="Death per state", figsize=(15, 10), fontsize=12)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Deaths/Population", fontsize=12)
plt.show();


# In[7]:


overdoses_top_5 = overdoses_plot[['Deaths/Population']].sort_values('Deaths/Population',ascending=False).iloc[0:5]
overdoses_top_5


# In[8]:


from matplotlib import pyplot
import matplotlib.pyplot as plt
ax = overdoses_top_5[['Deaths/Population']].plot(kind='bar', title ="Death/Population per state", figsize=(10, 5), fontsize=12)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Deaths/Population", fontsize=12)
plt.show();


# In[9]:


pinfo.groupby('Opioid.Prescriber').size() / pinfo.groupby('Opioid.Prescriber').size().sum()


# In[10]:


pd.value_counts(pinfo['Opioid.Prescriber']).plot.bar()


# ## Creating our Classifiers to Predict Opioid Prescribers

# In[11]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split


# In[12]:


overdoses.to_csv('overdosesnew.csv')


# In[13]:


overdoses = pd.read_csv('overdosesnew.csv',index_col=0)
overdoses.head()


# In[14]:


opioids = opioids 
name = opioids['Drug Name']
import re
new_name = name.apply(lambda x:re.sub("\ |-",".",str(x)))
columns = pinfo.columns
abandoned_variables = set(columns).intersection(set(new_name))
kept_variable=[]
for each in columns:
    if each in abandoned_variables:
        pass
    else:
        kept_variable.append(each)


# In[15]:


df = pinfo[kept_variable]


# In[16]:


df.head()


# In[17]:


df = pinfo[kept_variable]
df = df.drop(df.columns[[0,3]], axis=1) 
df.head()


# In[22]:


cols = ['Specialty']
dummies = pd.concat([pd.get_dummies(df[cols][col], drop_first = True, prefix= col) for col in df[cols]], axis=1)
dummies.head()


# In[24]:


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # see the value of multiple statements at once.
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
import numpy as np
from sklearn.metrics import confusion_matrix


# In[25]:


X = dummies
X.head()


# In[26]:


X.head()


# In[27]:


y = df['Opioid.Prescriber']
y.head()


# In[28]:


n_estimators = list(range(20,220,10))
max_depth = list(range(2, 22, 2)) + [None]


target = 'Opioid.Prescriber'

def rfscore2(X,y,test_size,n_estimators,max_depth):

    X_train, X_test, y_train, y_test = train_test_split(X, 
                                                        y, test_size = test_size, random_state=42) 
    rf_params = {
             'n_estimators':n_estimators,
             'max_depth':max_depth}   # parameters for grid search
    rf_gs = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, verbose=1, n_jobs=-1)
    rf_gs.fit(X_train,y_train) # training the random forest with all possible parameters
    print('GridSearch results')
    print('The best parameters on the training data are:\n',rf_gs.best_params_) # printing the best parameters
    max_depth_best = rf_gs.best_params_['max_depth']      # getting the best max_depth
    n_estimators_best = rf_gs.best_params_['n_estimators']  # getting the best n_estimators
    print("best max_depth:",max_depth_best)
    print("best n_estimators:",n_estimators_best)
    best_rf_gs = RandomForestClassifier(max_depth=max_depth_best,n_estimators=n_estimators_best) # instantiate the best model
    best_rf_gs.fit(X_train,y_train)  # fitting the best model
    best_rf_score = best_rf_gs.score(X_test,y_test) 
    print ("best score is:",round(best_rf_score,2))
    print("Is the prediction smaller (S) or larger (L) than the median:\n")
    preds = best_rf_gs.predict(X_test)
    print(['S' if p == 0 else 'L' for p in best_rf_gs.predict(X_test)])
    print("")
    print("What were the probabilities of the each result above:\n")
    print("Probabilities that the number of comments is smaller than the media for each observation are:\n")
    print([('S',round(p[0],2)) if p[0] > p[1] else ('S',round(p[0],2)) for p in best_rf_gs.predict_proba(X_test)])
    print("")
    print("Confusion Matrix:\n")
    print(pd.crosstab(pd.concat([X_test,y_test],axis=1)[target], preds, rownames=['Actual Values'], colnames=['Predicted Values']))
    print('Features and their importance:\n')
    feature_importances = pd.Series(best_rf_gs.feature_importances_, index=X.columns).sort_values().tail(10)
    print(feature_importances)
    print(feature_importances.plot(kind="barh", figsize=(6,6)))
    return


rfscore2(X,y,0.3,n_estimators,max_depth)


# In[29]:


def cv_score(X,y,cv,n_estimators,max_depth):
    rf = RandomForestClassifier(n_estimators=n_estimators_best,
                                max_depth=max_depth_best)
    s = cross_val_score(rf, X, y, cv=cv, n_jobs=-1)
    return("{} Score is :{:0.3} ± {:0.3}".format("Random Forest", s.mean().round(3), s.std().round(3)))


# In[30]:


dict_best = {'max_depth': None, 'n_estimators': 20}
n_estimators_best = dict_best['n_estimators']
max_depth_best = dict_best['max_depth']
cv_score(X,y,5,n_estimators_best,max_depth_best)


# In[69]:


df = pinfo[kept_variable]
df.head()


# In[32]:


c = ['NPI','Gender','State','Credentials','Opioid.Prescriber']
df3 = df.drop(c,axis=1)
df3.head()


# In[33]:


df4 = df3.drop('Specialty',axis=1)
all_ops = df4[df4.columns.tolist()].sum(axis=1)
all_ops.head()


# In[34]:


df6 = df3[['Specialty']]
df5 = pd.concat([all_ops, df6], axis = 1)
df5 = df5.rename(columns={0: 'total_opioids'})
df5.head()


# In[66]:


df5_new = df5[['Specialty','total_opioids']].groupby(df5['total_opioids']).sum()
df5_new.head()


# In[70]:


cols_to_drop = ['Specialty','NPI','Gender','Credentials']
df = df.drop(cols_to_drop,axis=1)
df.head()
#dummies = pd.concat([pd.get_dummies(X_train[cols][col], drop_first = True, prefix= col) for col in X_train[cols]], axis=1)
#train_wo_dummies = X_train.drop(cols,axis=1)


# In[37]:


cols = ['Opioid.Prescriber','State']
aux = df.drop(cols,axis=1)
all_ops = aux[aux.columns.tolist()].sum(axis=1)
all_ops.head()


# In[38]:


df1 = df[['State','Opioid.Prescriber']]
df1.head()


# In[39]:


df2 = pd.concat([all_ops, df1], axis = 1)
#X_train = X_train.rename(columns={0: 'total_opioids'})
df2.head()


# In[40]:


#df2.set_index('State', inplace=True)
df2 = df2.rename(columns={0: 'total_opioids'})
df2.head()


# In[41]:


df2 = df2[['State','total_opioids']]
df2.head()


# In[ ]:


# In[42]:


df2['State'].value_counts().head()


# In[43]:


df2_new = df2[['total_opioids']].groupby(df2['State']).sum()
df2_new.head()


# In[44]:


df2_new.reset_index(level=0, inplace=True)
df2_new.head()


# In[45]:


df2.columns


# In[46]:


ops_top_5 = df2[['total_opioids','State']].sort_values('total_opioids',ascending=False).iloc[0:5]
ops_top_5


# In[47]:


ops_top_5.set_index('State', inplace=True)
ops_top_5.head()


# In[48]:


from matplotlib import pyplot
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
ax = ops_top_5.plot(kind='bar', title ="Total Opioids per state", figsize=(10, 5), fontsize=12)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Total Opioids", fontsize=12)
plt.show();


# In[49]:


income = pd.read_csv('mhincome.csv')
income.head()


# In[50]:


income_top_5 = income[['Income','State']].sort_values('Income',ascending=False).iloc[0:5]
income_top_5


# In[51]:


income_top_5.set_index('State', inplace=True)
income_top_5


# In[52]:


from matplotlib import pyplot
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
ax = income_top_5.plot(kind='bar', title ="Median Household Income", figsize=(10, 5), fontsize=12)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Median Household Income", fontsize=12)
plt.show();


# In[53]:


d = {'State': ['Maine','Vermont','New Hampshire','West Virginia','Florida'], 'Median Age': [44.5,43.1,42.7,42.3,42.1]}
ages = pd.DataFrame(data=d)
ages.head()


# In[54]:


ages.set_index('State', inplace=True)
ages.head()


# In[55]:


from matplotlib import pyplot
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
ax = ages.plot(kind='bar', title ="Median Age", figsize=(10, 5), fontsize=12)
ax.set_xlabel("State", fontsize=12)
ax.set_ylabel("Median Age", fontsize=12)
plt.show();