#!/usr/bin/env python # coding: utf-8 # # U.S. Opiate Prescriptions/Overdoses # # Accidental death by fatal drug overdose is a rising trend in the United States. What can you do to help? # # This dataset contains: # - summaries of prescription records for 250 common **opioid** and ***non-opioid*** drugs written by 25,000 unique licensed medical professionals in 2014 in the United States for citizens covered under Class D Medicare # - Metadata about the doctors themselves. # - This data here is in a format with 1 row per prescriber and 25,000 unique prescribers down to 25,000 to keep it manageable. # - The main data is in `prescriber-info.csv`. # - There is also `opioids.csv` that contains the names of all opioid drugs included in the data # - There is the file `overdoses.csv` that contains information on opioid related drug overdose fatalities. # # The increase in overdose fatalities is a well-known problem, and the search for possible solutions is an ongoing effort. This dataset is can be used to detect sources of significant quantities of opiate prescriptions. # # The data consists of the following characteristics for each prescriber # # - NPI – unique National Provider Identifier number # - Gender - (M/F) # - State - U.S. State by abbreviation # - Credentials - set of initials indicative of medical degree # - Specialty - description of type of medicinal practice # - A long list of drugs with numeric values indicating the total number of prescriptions written for the year by that individual # - `Opioid.Prescriber` - a boolean label indicating whether or not that individual prescribed opiate drugs more than 10 times in the year. # # ## Import our packages and data # In[1]: import pandas as pd from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" # so we can see the value of multiple statements at once. pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) # In[2]: opioids = pd.read_csv('opioids.csv') pinfo = pd.read_csv('prescriber-info.csv') overdoses = pd.read_csv('overdoses.csv') opioids.head() pinfo.head() overdoses.head() # In[3]: overdoses['Deaths'].sum() # ## Exploration and Initial Preprocessing # In[4]: overdoses['Deaths'] = overdoses['Deaths'].str.replace(',', '') overdoses['Deaths'] = overdoses['Deaths'].astype(int) overdoses['Population'] = overdoses['Population'].str.replace(',', '') overdoses['Population'] = overdoses['Population'].astype(int) overdoses['Deaths/Population'] = (overdoses['Deaths']/overdoses['Population']) overdoses.head() # In[5]: overdoses_plot = overdoses.copy() overdoses_plot.set_index('State', inplace=True) overdoses_plot.head() # In[71]: from matplotlib import pyplot import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') fig = plt.figure() ax = overdoses_plot[['Deaths/Population']].plot(kind='bar', title ="Death per state", figsize=(15, 10), fontsize=12) ax.set_xlabel("State", fontsize=12) ax.set_ylabel("Deaths/Population", fontsize=12) plt.show(); # In[7]: overdoses_top_5 = overdoses_plot[['Deaths/Population']].sort_values('Deaths/Population',ascending=False).iloc[0:5] overdoses_top_5 # In[8]: from matplotlib import pyplot import matplotlib.pyplot as plt ax = overdoses_top_5[['Deaths/Population']].plot(kind='bar', title ="Death/Population per state", figsize=(10, 5), fontsize=12) ax.set_xlabel("State", fontsize=12) ax.set_ylabel("Deaths/Population", fontsize=12) plt.show(); # In[9]: pinfo.groupby('Opioid.Prescriber').size() / pinfo.groupby('Opioid.Prescriber').size().sum() # In[10]: pd.value_counts(pinfo['Opioid.Prescriber']).plot.bar() # ## Creating our Classifiers to Predict Opioid Prescribers # In[11]: import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd import numpy as np from sklearn.cross_validation import train_test_split # In[12]: overdoses.to_csv('overdosesnew.csv') # In[13]: overdoses = pd.read_csv('overdosesnew.csv',index_col=0) overdoses.head() # In[14]: opioids = opioids name = opioids['Drug Name'] import re new_name = name.apply(lambda x:re.sub("\ |-",".",str(x))) columns = pinfo.columns abandoned_variables = set(columns).intersection(set(new_name)) kept_variable=[] for each in columns: if each in abandoned_variables: pass else: kept_variable.append(each) # In[15]: df = pinfo[kept_variable] # In[16]: df.head() # In[17]: df = pinfo[kept_variable] df = df.drop(df.columns[[0,3]], axis=1) df.head() # In[22]: cols = ['Specialty'] dummies = pd.concat([pd.get_dummies(df[cols][col], drop_first = True, prefix= col) for col in df[cols]], axis=1) dummies.head() # In[24]: from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" # see the value of multiple statements at once. from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier from sklearn.linear_model import LogisticRegression, LogisticRegressionCV from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer import numpy as np from sklearn.metrics import confusion_matrix # In[25]: X = dummies X.head() # In[26]: X.head() # In[27]: y = df['Opioid.Prescriber'] y.head() # In[28]: n_estimators = list(range(20,220,10)) max_depth = list(range(2, 22, 2)) + [None] target = 'Opioid.Prescriber' def rfscore2(X,y,test_size,n_estimators,max_depth): X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=42) rf_params = { 'n_estimators':n_estimators, 'max_depth':max_depth} # parameters for grid search rf_gs = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, verbose=1, n_jobs=-1) rf_gs.fit(X_train,y_train) # training the random forest with all possible parameters print('GridSearch results') print('The best parameters on the training data are:\n',rf_gs.best_params_) # printing the best parameters max_depth_best = rf_gs.best_params_['max_depth'] # getting the best max_depth n_estimators_best = rf_gs.best_params_['n_estimators'] # getting the best n_estimators print("best max_depth:",max_depth_best) print("best n_estimators:",n_estimators_best) best_rf_gs = RandomForestClassifier(max_depth=max_depth_best,n_estimators=n_estimators_best) # instantiate the best model best_rf_gs.fit(X_train,y_train) # fitting the best model best_rf_score = best_rf_gs.score(X_test,y_test) print ("best score is:",round(best_rf_score,2)) print("Is the prediction smaller (S) or larger (L) than the median:\n") preds = best_rf_gs.predict(X_test) print(['S' if p == 0 else 'L' for p in best_rf_gs.predict(X_test)]) print("") print("What were the probabilities of the each result above:\n") print("Probabilities that the number of comments is smaller than the media for each observation are:\n") print([('S',round(p[0],2)) if p[0] > p[1] else ('S',round(p[0],2)) for p in best_rf_gs.predict_proba(X_test)]) print("") print("Confusion Matrix:\n") print(pd.crosstab(pd.concat([X_test,y_test],axis=1)[target], preds, rownames=['Actual Values'], colnames=['Predicted Values'])) print('Features and their importance:\n') feature_importances = pd.Series(best_rf_gs.feature_importances_, index=X.columns).sort_values().tail(10) print(feature_importances) print(feature_importances.plot(kind="barh", figsize=(6,6))) return rfscore2(X,y,0.3,n_estimators,max_depth) # In[29]: def cv_score(X,y,cv,n_estimators,max_depth): rf = RandomForestClassifier(n_estimators=n_estimators_best, max_depth=max_depth_best) s = cross_val_score(rf, X, y, cv=cv, n_jobs=-1) return("{} Score is :{:0.3} ± {:0.3}".format("Random Forest", s.mean().round(3), s.std().round(3))) # In[30]: dict_best = {'max_depth': None, 'n_estimators': 20} n_estimators_best = dict_best['n_estimators'] max_depth_best = dict_best['max_depth'] cv_score(X,y,5,n_estimators_best,max_depth_best) # In[69]: df = pinfo[kept_variable] df.head() # In[32]: c = ['NPI','Gender','State','Credentials','Opioid.Prescriber'] df3 = df.drop(c,axis=1) df3.head() # In[33]: df4 = df3.drop('Specialty',axis=1) all_ops = df4[df4.columns.tolist()].sum(axis=1) all_ops.head() # In[34]: df6 = df3[['Specialty']] df5 = pd.concat([all_ops, df6], axis = 1) df5 = df5.rename(columns={0: 'total_opioids'}) df5.head() # In[66]: df5_new = df5[['Specialty','total_opioids']].groupby(df5['total_opioids']).sum() df5_new.head() # In[70]: cols_to_drop = ['Specialty','NPI','Gender','Credentials'] df = df.drop(cols_to_drop,axis=1) df.head() #dummies = pd.concat([pd.get_dummies(X_train[cols][col], drop_first = True, prefix= col) for col in X_train[cols]], axis=1) #train_wo_dummies = X_train.drop(cols,axis=1) # In[37]: cols = ['Opioid.Prescriber','State'] aux = df.drop(cols,axis=1) all_ops = aux[aux.columns.tolist()].sum(axis=1) all_ops.head() # In[38]: df1 = df[['State','Opioid.Prescriber']] df1.head() # In[39]: df2 = pd.concat([all_ops, df1], axis = 1) #X_train = X_train.rename(columns={0: 'total_opioids'}) df2.head() # In[40]: #df2.set_index('State', inplace=True) df2 = df2.rename(columns={0: 'total_opioids'}) df2.head() # In[41]: df2 = df2[['State','total_opioids']] df2.head() # In[ ]: # In[42]: df2['State'].value_counts().head() # In[43]: df2_new = df2[['total_opioids']].groupby(df2['State']).sum() df2_new.head() # In[44]: df2_new.reset_index(level=0, inplace=True) df2_new.head() # In[45]: df2.columns # In[46]: ops_top_5 = df2[['total_opioids','State']].sort_values('total_opioids',ascending=False).iloc[0:5] ops_top_5 # In[47]: ops_top_5.set_index('State', inplace=True) ops_top_5.head() # In[48]: from matplotlib import pyplot import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') ax = ops_top_5.plot(kind='bar', title ="Total Opioids per state", figsize=(10, 5), fontsize=12) ax.set_xlabel("State", fontsize=12) ax.set_ylabel("Total Opioids", fontsize=12) plt.show(); # In[49]: income = pd.read_csv('mhincome.csv') income.head() # In[50]: income_top_5 = income[['Income','State']].sort_values('Income',ascending=False).iloc[0:5] income_top_5 # In[51]: income_top_5.set_index('State', inplace=True) income_top_5 # In[52]: from matplotlib import pyplot import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') ax = income_top_5.plot(kind='bar', title ="Median Household Income", figsize=(10, 5), fontsize=12) ax.set_xlabel("State", fontsize=12) ax.set_ylabel("Median Household Income", fontsize=12) plt.show(); # In[53]: d = {'State': ['Maine','Vermont','New Hampshire','West Virginia','Florida'], 'Median Age': [44.5,43.1,42.7,42.3,42.1]} ages = pd.DataFrame(data=d) ages.head() # In[54]: ages.set_index('State', inplace=True) ages.head() # In[55]: from matplotlib import pyplot import matplotlib.pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') ax = ages.plot(kind='bar', title ="Median Age", figsize=(10, 5), fontsize=12) ax.set_xlabel("State", fontsize=12) ax.set_ylabel("Median Age", fontsize=12) plt.show();