# ---------------------------------------------------------------------------
# Environment setup (Colab / IPython).
#
# Shell commands go through get_ipython().system(...), which is what the
# notebook `!cmd` syntax expands to, so this section is valid Python syntax
# while behaving the same inside the notebook.
# ---------------------------------------------------------------------------
run_shell = get_ipython().system
run_shell("pip install plotly &> /dev/null")
# The original installed, uninstalled and then upgraded scikit-learn; a single
# upgrade leaves the environment in the same end state.
run_shell("pip install -U scikit-learn &> /dev/null")
run_shell("mkdir census_package &> /dev/null")
run_shell("pip install geocoder &> /dev/null")
run_shell("pip install squarify &> /dev/null")
run_shell("pip install shap &> /dev/null")  # was "/dev/nul", which wrote a stray file

from datetime import datetime
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as py
import seaborn as sns
import shap
import sklearn
from google.colab import drive
from IPython.display import HTML
from plotly.offline import iplot
from sklearn import datasets
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

get_ipython().run_line_magic("matplotlib", "inline")


# ---------------------------------------------------------------------------
# Pre-processing helpers
# ---------------------------------------------------------------------------

# Multipliers that convert the "Size" suffix into bytes.
_SIZE_MULTIPLIERS = {"M": 10**6, "k": 10**3}


def get_number_from_string(my_string):
    """Convert an install count such as ``"80,000+"`` to an ``int``.

    Thousands separators and the trailing ``+`` are stripped from string
    input.  Non-string input (e.g. an already numeric value) is passed
    straight to ``int``; the original raised ``UnboundLocalError`` there.

    Raises:
        ValueError: if the cleaned value is not a valid integer.
    """
    if isinstance(my_string, str):
        my_string = my_string.replace(",", "").replace("+", "")
    return int(my_string)


def handle_size(str):
    """Convert a "Size" value such as ``"19M"`` or ``"201k"`` to bytes.

    Returns ``numpy.nan`` for anything without an ``M``/``k`` suffix (for
    example ``"Varies with device"`` or a missing value) so the caller can
    impute it.  The original fell through to an implicit ``None`` for such
    strings and raised on non-string input.

    The parameter keeps its original name for backward compatibility even
    though it shadows the builtin.
    """
    size_text = str
    try:
        multiplier = _SIZE_MULTIPLIERS[size_text[-1]]
    except (KeyError, IndexError, TypeError):
        return np.nan
    return float(size_text[:-1]) * multiplier


def handle_last_updated(str):
    """Return the number of days between 2010-01-01 and a "Last Updated" date.

    ``str`` must be formatted like ``"January 7, 2018"`` (``"%B %d, %Y"``).
    The parameter keeps its original name for backward compatibility.

    Raises:
        ValueError: if the text does not match the expected format.
    """
    updated_on = datetime.strptime(str, "%B %d, %Y")
    return (updated_on - datetime(2010, 1, 1)).days


# ---------------------------------------------------------------------------
# Load and clean the Google Play Store data set
# ---------------------------------------------------------------------------
run_shell(
    "wget https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/"
    "master/google-play-store-apps/googleplaystore.csv"
)
df1 = pd.read_csv("/content/googleplaystore.csv")
df1.info()
df1.dropna(inplace=True)
df1.info()

# Drop rows whose "Installs" field holds the text "Free" (not an install
# count).  This now happens *before* any numeric conversion, because such a
# row cannot be parsed by the Price/Installs conversions below.
df1 = df1.loc[df1["Installs"] != "Free"].copy()

# Integer-encode categories in order of first appearance.
categoryVal = df1["Category"].unique()
category_dict = {name: code for code, name in enumerate(categoryVal)}
df1["Category_c"] = df1["Category"].map(category_dict).astype(int)

# Integer-encode genres in order of first appearance.
GenresL = df1.Genres.unique()
GenresDict = {name: code for code, name in enumerate(GenresL)}
df1["Genres_c"] = df1["Genres"].map(GenresDict).astype(int)

# "$4.99" -> 4.99
df1["Price"] = (
    df1["Price"].astype(str).str.replace("$", "", regex=False).astype(float)
)

# Convert the remaining text columns to numbers.
df1["Installs"] = df1["Installs"].apply(get_number_from_string)
df1["Size"] = df1["Size"].apply(handle_size)
df1["Last Updated"] = df1["Last Updated"].apply(handle_last_updated)
df1["Reviews"] = df1["Reviews"].astype(int)

# Sizes that could not be parsed are NaN at this point; impute the median.
print(df1.isnull().sum())
df1["Size"] = df1.Size.fillna(df1.Size.median())
print(df1.isnull().sum())

# ---------------------------------------------------------------------------
# Quick summary of the cleaned table
# ---------------------------------------------------------------------------
separator = "*" * 100
print("The data table size is:", df1.shape)
print(separator)
print("The columns name are:", df1.columns)
print(separator)
print("The distibution values is:", df1["Type"].value_counts())
print(separator)
print("The average rating score of all apps is: ", df1["Rating"].mean())
print(separator)
print("The min rating score of all apps is: ", df1["Rating"].min())
print(separator)
print("The max rating score of all apps is: ", df1["Rating"].max())
print(separator)
print("Printing the first 5 rows of the table: ")
print(df1.head(n=5))
print("Printing the last 5 rows of the table: ")
print(df1.tail(n=5))

# --- Pair plot of the main numeric features, coloured by app type -----------
# Filter the rows ONCE and take every column from the same rows.  The
# original filtered Installs and Reviews independently and then zipped the
# columns together, which silently paired values from different apps.
pair_rows = df1[(df1["Installs"] != 0) & (df1["Reviews"] != 0)]
pair_df = pd.DataFrame(
    {
        "Rating": pair_rows["Rating"],
        "Size": pair_rows["Size"],
        # Both counts use log10 (the original mixed ln and log10).
        "Installs": np.log10(pair_rows["Installs"]),
        "Reviews": np.log10(pair_rows["Reviews"]),
        "Type": pair_rows["Type"],
        "Price": pair_rows["Price"],
    }
)
p = sns.pairplot(pair_df, hue="Type", palette="Set2")
plt.tight_layout()
plt.show()

# --- Paid vs free apps (pie) -------------------------------------------------
column = "Type"
# rename_axis + reset_index(name=...) yields the columns [column, "count"] on
# both pandas 1.x and 2.x; renaming the default column names does not.
grouped = df1[column].value_counts().rename_axis(column).reset_index(name="count")
print(grouped)
trace = go.Pie(labels=grouped[column], values=grouped["count"], pull=[0.05, 0])
layout = {"title": "The Distribution of paid and not paid apps in the app store"}
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# --- Share of each content-rating group (bar) --------------------------------
vc = (
    df1["Content Rating"]
    .value_counts()
    .rename_axis("type")
    .reset_index(name="count")
)
vc["percent"] = 100 * vc["count"] / vc["count"].sum()
vc = vc.sort_values("percent")
print(vc)
trace = go.Bar(
    x=vc["type"], y=vc["percent"], name="Group", marker=dict(color="#6ad49b")
)
layout = {"title": "The size of each ranking group", "xaxis": {"title": "Group name"}}
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# --- Number of apps per category (pie) ---------------------------------------
number_of_apps_in_category = (
    df1["Category"].value_counts().sort_values(ascending=True)
)
data = [
    go.Pie(
        labels=number_of_apps_in_category.index,
        values=number_of_apps_in_category.values,
        hoverinfo="label+value",
    )
]
plotly.offline.iplot(data, filename="active_category")

# --- Rating frequencies (treemap) --------------------------------------------
import squarify  # treemap layout

plt.figure(figsize=(20, 8))
rating_counts = df1["Rating"].value_counts()
labels = rating_counts.index.tolist()
colors = [plt.cm.Spectral(i / float(len(labels))) for i in range(len(labels))]
squarify.plot(sizes=rating_counts, label=labels, color=colors, alpha=0.8)
plt.show()

# --- Rating distribution (histogram) -----------------------------------------
data = [go.Histogram(x=df1.Rating, xbins={"start": 1, "size": 0.1, "end": 5})]
print("Average app rating = ", np.mean(df1["Rating"]))
plotly.offline.iplot(data, filename="overall_rating_distribution")

# Apps that belong to categories with at least 50 entries.
groups = df1.groupby("Category").filter(lambda x: len(x) >= 50).reset_index()

# --- Rating vs size ----------------------------------------------------------
sns.set_style("darkgrid")
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
ax = sns.jointplot(x=df1["Size"], y=df1["Rating"])

# Categories with more than 20 "bulky" apps.  "Size" is stored in bytes, so
# the 40 MB threshold is 40 * 10**6 (the original compared against 40 bytes,
# which matched practically every app).
subset_df = df1[df1.Size > 40 * 10**6]
groups_temp = subset_df.groupby("Category").filter(lambda x: len(x) > 20)
print(groups_temp["Category"].value_counts().head(n=8))

# --- Rating vs price for paid apps -------------------------------------------
paid_apps = df1[df1.Price > 0]
p = sns.jointplot(x="Price", y="Rating", data=paid_apps)

# --- Pricing trend across selected categories --------------------------------
subset_df = df1[
    df1.Category.isin(
        [
            "GAME",
            "FAMILY",
            "PHOTOGRAPHY",
            "MEDICAL",
            "TOOLS",
            "FINANCE",
            "LIFESTYLE",
            "BUSINESS",
        ]
    )
]
sns.set_style("darkgrid")
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
p = sns.stripplot(
    x="Price", y="Category", data=subset_df, jitter=True, linewidth=1, ax=ax
)
title = ax.set_title("App pricing trend across categories")

# Which categories contain the extremely expensive apps?
print(subset_df.loc[subset_df["Price"] > 250, "Category"].value_counts())

fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
subset_df_price = subset_df[subset_df.Price < 100]
p = sns.stripplot(
    x="Price", y="Category", data=subset_df_price, jitter=True, linewidth=1, ax=ax
)
title = ax.set_title(
    "App pricing trend across categories - after filtering for junk apps"
)

print(df1.Type.value_counts())

# Number of apps per (Category, Type) pair; feeds the paid/free donut chart.
new_df = df1.groupby(["Category", "Type"]).agg({"App": "count"}).reset_index()
# --- Paid/free split inside four major categories (nested donut) ------------
outer_group_names = ["GAME", "FAMILY", "MEDICAL", "TOOLS"]
outer_group_values = [
    int((df1.Category == category).sum()) for category in outer_group_names
]
blues, reds, greens, purples = (
    plt.cm.Blues,
    plt.cm.Reds,
    plt.cm.Greens,
    plt.cm.Purples,
)
inner_group_names = ["Paid", "Free"] * 4

# One value per (category, type) pair, in the same order as the labels above.
# A pair with no apps contributes 0 (the original got there through a bare
# `except:` that would also have hidden unrelated errors).
inner_group_values = []
for category in outer_group_names:
    category_rows = new_df[new_df.Category == category]
    for app_type in ("Paid", "Free"):
        matches = category_rows.App[category_rows.Type == app_type].values
        inner_group_values.append(int(matches[0]) if len(matches) else 0)

explode = (0.025, 0.025, 0.025, 0.025)

# First ring (outside): the categories.
fig, ax = plt.subplots(figsize=(10, 10))
ax.axis("equal")
mypie, texts, _ = ax.pie(
    outer_group_values,
    radius=1.2,
    labels=outer_group_names,
    autopct="%1.1f%%",
    pctdistance=1.1,
    labeldistance=0.75,
    explode=explode,
    colors=[blues(0.6), reds(0.6), greens(0.6), purples(0.6)],
    textprops={"fontsize": 16},
)
plt.setp(mypie, width=0.5, edgecolor="black")

# Second ring (inside): paid/free within each category.
mypie2, _ = ax.pie(
    inner_group_values,
    radius=1.2 - 0.5,
    labels=inner_group_names,
    labeldistance=0.7,
    textprops={"fontsize": 12},
    colors=[
        blues(0.4),
        blues(0.2),
        reds(0.4),
        reds(0.2),
        greens(0.4),
        greens(0.2),
        purples(0.4),
        purples(0.2),
    ],
)
plt.setp(mypie2, width=0.5, edgecolor="black")
plt.margins(0, 0)
plt.title(
    "The inner distribution of paid and free apps among Game,Tools,Medical and Family"
)
plt.tight_layout()
plt.show()

# --- Downloads of paid vs free apps (box plot, log scale) --------------------
# log10 is undefined for 0 installs, so only apps with at least one install
# are plotted.
installed = df1[df1.Installs > 0]
trace0 = go.Box(
    y=np.log10(installed["Installs"][installed.Type == "Paid"]),
    name="Paid",
    marker=dict(color="rgb(214, 12, 140)"),
    boxpoints="all",
)
trace1 = go.Box(
    y=np.log10(installed["Installs"][installed.Type == "Free"]),
    name="Free",
    marker=dict(color="rgb(0, 128, 128)"),
    boxpoints="all",
)
layout = go.Layout(
    title="Number of downloads of paid apps Vs free apps",
    yaxis={"title": "Number of downloads (log-scaled)"},
)
data = [trace0, trace1]
plotly.offline.iplot({"data": data, "layout": layout})

# --- Correlation heat map ----------------------------------------------------
# Restrict to numeric columns explicitly: pandas >= 2.0 raises when corr() is
# given text columns, older versions dropped them silently.
corrmat = df1.select_dtypes(include="number").corr()
p = sns.heatmap(
    corrmat, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True)
)
plt.show()

# --- Reviews vs installs (log-log regression) --------------------------------
df_copy = df1[(df1.Reviews > 10) & (df1.Installs > 0)].copy()
# Derive the log columns from the filtered frame itself (the original took
# them from df1 and relied on index alignment).
df_copy["Installs"] = np.log10(df_copy["Installs"])
df_copy["Reviews"] = np.log10(df_copy["Reviews"])
sns.lmplot(x="Reviews", y="Installs", data=df_copy)
ax = plt.gca()
_ = ax.set_title("Number of Reviews Vs Number of Downloads (Log scaled)")


# --- Encode the remaining categorical columns --------------------------------
def type_cat(types):
    """Map the app type to a binary flag: 0 for ``'Free'``, 1 for anything else."""
    return 0 if types == "Free" else 1


df1["Type"] = df1["Type"].map(type_cat)

# Integer-encode content ratings in order of first appearance.
RatingL = df1["Content Rating"].unique()
RatingDict = {name: code for code, name in enumerate(RatingL)}
df1["Content Rating"] = df1["Content Rating"].map(RatingDict).astype(int)
df1.info()

# Drop the columns that are not used as model features.
df1.drop(
    labels=["Last Updated", "Current Ver", "Android Ver", "App"],
    axis=1,
    inplace=True,
)
print(df1.head())

# One-hot ("dummy") encoding of Category, as an alternative to Category_c.
df2 = pd.get_dummies(df1, columns=["Category"])
print(df2.head())

# --- Dimensionality reduction (PCA) ------------------------------------------
from sklearn.decomposition import PCA

X = df1.drop(labels=["Category", "Rating", "Genres"], axis=1)
y = df1.Rating
print("The number of columns (features) before Dimension Reduction is: ", X.shape[1])

# NOTE(review): the features are not standardised before PCA, so columns with
# large magnitudes (Installs, Size in bytes) presumably dominate the leading
# components - confirm whether scaling is wanted here.
pca = PCA().fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
print(pca.explained_variance_ratio_)
plt.xlabel("number of components")
plt.ylabel("cumulative explained variance")

pca = PCA(n_components=3)
x_res = pca.fit_transform(X)
fig = plt.figure()
ax = fig.add_subplot(projection="3d")
ax.set_title("Data representation with 3 components", fontsize=10)
# `cmap` was dropped: without a `c` argument matplotlib ignores it and warns.
ax.scatter(x_res[:, 0], x_res[:, 1], x_res[:, 2], marker="o", edgecolor="k")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3")
plt.subplots_adjust(right=1.3)
print(
    f"Before PCA we had {X.shape[1]} features after applying PCA we got only 3 Principal components"
)


# --- Evaluation helpers ------------------------------------------------------
def Evaluation_metrics(y_true, y_predict):
    """Print MSE, MAE and mean squared log error for a set of predictions.

    ``metrics.mean_squared_log_error`` is called as-is; scikit-learn documents
    that it rejects negative targets or predictions.
    """
    print(f"Mean Squared Error: {metrics.mean_squared_error(y_true, y_predict)}")
    print(f"Mean absolute Error: {metrics.mean_absolute_error(y_true, y_predict)}")
    print(
        f"Mean squared Log Error: {metrics.mean_squared_log_error(y_true, y_predict)}"
    )


def Evaluationmatrix_dict(y_true, y_predict, name="Linear - Integer"):
    """Return the same three error metrics as a dict labelled with ``name``.

    The keys are ``'Series Name'``, ``'Mean Squared Error'``,
    ``'Mean Absolute Error'`` and ``'Mean Squared Log Error'``.
    """
    return {
        "Series Name": name,
        "Mean Squared Error": metrics.mean_squared_error(y_true, y_predict),
        "Mean Absolute Error": metrics.mean_absolute_error(y_true, y_predict),
        "Mean Squared Log Error": metrics.mean_squared_log_error(y_true, y_predict),
    }


# --- Linear regression, excluding the genre label ----------------------------
# Integer encoding.  `y` stays a 1-D Series (the original wrapped it in a
# DataFrame, which made the predictions 2-D and awkward to plot).
X = df1.drop(labels=["Category", "Rating", "Genres", "Genres_c"], axis=1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
results = lr_model.predict(X_test)
print("The linear regression model without including the 'Category' column:")
Evaluation_metrics(y_test, results)
print("*" * 100)

# Dummy encoding.
X_d = df2.drop(labels=["Rating", "Genres", "Category_c", "Genres_c"], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
lr_model2 = LinearRegression()
lr_model2.fit(X_train_d, y_train_d)
results_d = lr_model2.predict(X_test_d)
print("The linear regression model with including the 'Category' column:")
# Score against this model's own held-out targets.  The original passed
# y_test, which comes from a different random split.
Evaluation_metrics(y_test_d, results_d)

plt.figure(figsize=(12, 7))
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
sns.regplot(x=results, y=y_test, color="teal", label="Integer", marker="x")
sns.regplot(x=results_d, y=y_test_d, color="orange", label="Dummy")
plt.legend()
# Finish the "Linear model - Excluding Genres" figure started just above.
plt.title('Linear model - Excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print('Actual mean of population:' + str(y.mean()))
print('Integer encoding(mean) :' + str(results.mean()))
print('Dummy encoding(mean) :' + str(results_d.mean()))
print('Integer encoding(std) :' + str(results.std()))
print('Dummy encoding(std) :' + str(results_d.std()))

# SHAP explanation of the integer-encoded linear model; the waterfall plot
# shows the feature contributions for the first row of X.
explainer = shap.explainers.Linear(lr_model, X)
shap_values = explainer(X)
shap.plots.waterfall(shap_values[0])

# --- Linear regression, including the genre label ----------------------------
# Integer encoding.
X = df1.drop(labels=['Category', 'Rating', 'Genres'], axis=1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
lr_model3 = LinearRegression()
lr_model3.fit(X_train, y_train)
Results = lr_model3.predict(X_test)

# Dummy encoding.
X_d = df2.drop(labels=['Rating', 'Genres', 'Category_c'], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
lr_model4 = LinearRegression()
lr_model4.fit(X_train_d, y_train_d)
Results_d = lr_model4.predict(X_test_d)

plt.figure(figsize=(12, 7))
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
sns.regplot(x=Results, y=y_test, color='teal', label='Integer', marker='x')
sns.regplot(x=Results_d, y=y_test_d, color='orange', label='Dummy')
plt.legend()
plt.title('Linear model - Including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print('Integer encoding(mean) :' + str(Results.mean()))
print('Dummy encoding(mean) :' + str(Results_d.mean()))
print('Integer encoding(std) :' + str(Results.std()))
print('Dummy encoding(std) :' + str(Results_d.std()))

# SHAP explanation of the integer-encoded model that includes Genres_c.
explainer = shap.explainers.Linear(lr_model3, X)
shap_values = explainer(X)
# Visualize the first prediction's explanation.
shap.plots.waterfall(shap_values[0])

# --- Support vector regression, excluding genres -----------------------------
from sklearn import svm

# Integer encoding.
X1 = df1.drop(labels=['Category', 'Rating', 'Genres', 'Genres_c'], axis=1)
print(X1.columns)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X1.values, y.values, test_size=0.30)
svm_model1 = svm.SVR()
svm_model1.fit(X_train, y_train)
Results2 = svm_model1.predict(X_test)

# Dummy encoding.
X_d = df2.drop(labels=['Rating', 'Genres', 'Category_c', 'Genres_c'], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
svm_model2 = svm.SVR()
svm_model2.fit(X_train_d, y_train_d)
Results2_d = svm_model2.predict(X_test_d)
Evaluation_metrics(y_test_d, Results2_d)

plt.figure(figsize=(12, 7))
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
sns.regplot(x=Results2, y=y_test, color='teal', label='Integer', marker='x')
sns.regplot(x=Results2_d, y=y_test_d, color='orange', label='Dummy')
plt.legend()
plt.title('SVM model - excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print('Integer encoding(mean) :' + str(Results2.mean()))
print('Dummy encoding(mean) :' + str(Results2_d.mean()))
print('Integer encoding(std) :' + str(Results2.std()))
print('Dummy encoding(std) :' + str(Results2_d.std()))

# --- Support vector regression, including Genres_c ---------------------------
# Integer encoding.
svm_model3 = svm.SVR()
X = df1.drop(labels=['Category', 'Rating', 'Genres'], axis=1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
svm_model3.fit(X_train, y_train)
Results2a = svm_model3.predict(X_test)

# Dummy encoding.
svm_model4 = svm.SVR()
X_d = df2.drop(labels=['Rating', 'Genres', 'Category_c'], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
svm_model4.fit(X_train_d, y_train_d)
Results2a_d = svm_model4.predict(X_test_d)

plt.figure(figsize=(12, 7))
sns.regplot(x=Results2a, y=y_test, color='teal', label='Integer', marker='x')
sns.regplot(x=Results2a_d, y=y_test_d, color='orange', label='Dummy')
plt.legend()
plt.title('SVM model - including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print('Integer encoding(mean) :' + str(Results2a.mean()))
print('Dummy encoding(mean) :' + str(Results2a_d.mean()))
print('Integer encoding(std) :' + str(Results2a.std()))
print('Dummy encoding(std) :' + str(Results2a_d.std()))

# --- Random forest regression, excluding genres ------------------------------
from sklearn.ensemble import RandomForestRegressor

# Integer encoding (this model is depth-limited; the dummy one is not).
X = df1.drop(labels=['Category', 'Rating', 'Genres', 'Genres_c'], axis=1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
rfr_model1 = RandomForestRegressor(max_depth=10)
rfr_model1.fit(X_train, y_train)
Results3 = rfr_model1.predict(X_test)

# Dummy encoding.
X_d = df2.drop(labels=['Rating', 'Genres', 'Category_c', 'Genres_c'], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
rfr_model2 = RandomForestRegressor()
rfr_model2.fit(X_train_d, y_train_d)
Results3_d = rfr_model2.predict(X_test_d)

plt.figure(figsize=(12, 7))
sns.regplot(x=Results3, y=y_test, color='teal', label='Integer', marker='x')
sns.regplot(x=Results3_d, y=y_test_d, color='orange', label='Dummy')
# Finish the "RFR model - excluding Genres" figure started just above.
plt.legend()
plt.title('RFR model - excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print('Integer encoding(mean) :' + str(Results3.mean()))
print('Dummy encoding(mean) :' + str(Results3_d.mean()))
print('Integer encoding(std) :' + str(Results3.std()))
print('Dummy encoding(std) :' + str(Results3_d.std()))

# --- Feature importances, excluding genres -----------------------------------
# rfr_model1 was fitted on the integer-encoded X and rfr_model2 on the
# dummy-encoded X_d.  The original paired X.columns with rfr_model2 and
# X_d.columns with rfr_model1; zip() truncated silently, so both charts showed
# importances under the wrong feature names.

# Integer encoding.
Feat_impt = dict(zip(X.columns, rfr_model1.feature_importances_))
Feat_impt_df = pd.DataFrame.from_dict(
    Feat_impt, orient='index', columns=['Pct']
).sort_values(by='Pct')
# DataFrame.plot opens its own figure, so no separate plt.figure() is needed
# (the original ones only produced empty figures).
Feat_impt_df.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()

# Dummy encoding.
Feat_impt_d = dict(zip(X_d.columns, rfr_model2.feature_importances_))
Feat_impt_df_d = pd.DataFrame.from_dict(
    Feat_impt_d, orient='index', columns=['Pct']
).sort_values(by='Pct')
Feat_impt_df_d.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()

# --- Random forest regression, including Genres_c ----------------------------
# Integer encoding.
X = df1.drop(labels=['Category', 'Rating', 'Genres'], axis=1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
rfr_model3 = RandomForestRegressor()
rfr_model3.fit(X_train, y_train)
Results3a = rfr_model3.predict(X_test)

# Dummy encoding.
X_d = df2.drop(labels=['Rating', 'Genres', 'Category_c'], axis=1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
rfr_model4 = RandomForestRegressor()
rfr_model4.fit(X_train_d, y_train_d)
Results3a_d = rfr_model4.predict(X_test_d)
# --- Random forest regression, including Genres_c: report --------------------
plt.figure(figsize=(12, 7))
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional vectors.
sns.regplot(x=Results3a, y=y_test, color='teal', label='Integer', marker='x')
sns.regplot(x=Results3a_d, y=y_test_d, color='orange', label='Dummy')
plt.legend()
plt.title('RFR model - including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

# Summarise the predictions of the models trained in THIS section.  The
# original printed Results3 / Results3_d, i.e. the excluding-Genres models.
print('Integer encoding(mean) :' + str(Results3a.mean()))
print('Dummy encoding(mean) :' + str(Results3a_d.mean()))
print('Integer encoding(std) :' + str(Results3a.std()))
print('Dummy encoding(std) :' + str(Results3a_d.std()))

# Feature importances of the integer-encoded model.
Feat_impt = dict(zip(X.columns, rfr_model3.feature_importances_))
Feat_impt_df = pd.DataFrame.from_dict(
    Feat_impt, orient='index', columns=['Pct']
).sort_values(by='Pct')
# DataFrame.plot opens its own figure, so no separate plt.figure() is needed
# (the original ones only produced empty figures).
Feat_impt_df.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()

# Feature importances of the dummy-encoded model.
Feat_impt_d = dict(zip(X_d.columns, rfr_model4.feature_importances_))
Feat_impt_df_d = pd.DataFrame.from_dict(
    Feat_impt_d, orient='index', columns=['Pct']
).sort_values(by='Pct')
Feat_impt_df_d.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()