!pip install plotly &> /dev/null
!pip install scikit-learn &> /dev/null
!pip uninstall scikit-learn -y &> /dev/null

!pip install -U scikit-learn &> /dev/null
!mkdir census_package &> /dev/null
!pip install geocoder &> /dev/null
!pip install squarify &> /dev/null
!pip install shap &> /dev/nul

import sklearn
from sklearn import metrics
from  sklearn import linear_model
from sklearn.linear_model import LinearRegression 
from sklearn import datasets
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import seaborn as sns
from google.colab import drive
import random
#py.init_notebook_mode(connected=True)

# importing visualization libraries
import plotly
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
from plotly.offline import iplot
%matplotlib inline
from IPython.display import HTML
import shap


# Download the Google Play Store apps CSV into the current working directory.
!wget https://raw.githubusercontent.com/jasonchang0/kaggle-google-apps/master/google-play-store-apps/googleplaystore.csv

# Load the CSV. The absolute path assumes the notebook's working directory is
# /content (the Colab default) — TODO confirm when running elsewhere.
df1 = pd.read_csv("/content/googleplaystore.csv")

# Column dtypes and non-null counts before cleaning.
df1.info()

# Drop, in place, every row that has a missing value in any column.
df1.dropna(inplace = True)

# Same summary again, to see how many rows remain.
df1.info()

# Encode each distinct Category as an integer code (order of first appearance).
CategoryString = df1["Category"]
categoryVal = df1["Category"].unique()
categoryValCount = len(categoryVal)
category_dict = {label: code for code, label in enumerate(categoryVal)}
df1["Category_c"] = df1["Category"].map(category_dict).astype(int)

# Encode each distinct Genres value the same way.
GenresL = df1.Genres.unique()
GenresDict = {label: code for code, label in enumerate(GenresL)}
df1['Genres_c'] = df1['Genres'].map(GenresDict).astype(int)

# Strip the dollar sign from Price (a no-op for values without one), then
# convert the resulting strings to floats.
df1['Price'] = df1['Price'].map(lambda value: str(value).replace('$', ''))
df1['Price'] = df1['Price'].map(float)

# Remove thousands separators and plus signs so that the quantities can be converted from strings to ints.

def get_number_from_string(my_string):
  """Convert an install-count string such as "80,000+" to an int.

  Commas and plus signs are removed before conversion. A value that is
  already an integer is returned unchanged (as a plain int), so applying the
  function to an already-converted column does not wipe it out. Any other
  non-string input yields None.

  Raises:
    ValueError: if the cleaned string is not a valid integer literal.
  """
  import numbers  # local import: only needed for the pass-through check

  if isinstance(my_string, str):
    return int(my_string.replace(",", "").replace("+", ""))
  if isinstance(my_string, numbers.Integral):
    return int(my_string)
  return None

# This function converts the measurements in the "Size" column from megabytes ("M") or kilobytes ("k") to bytes.

def handle_size(str):
  """Convert a size label such as "19M" or "8.5k" to a number of bytes.

  "M" is treated as 10**6 and "k" as 10**3. A value that is already numeric
  is returned unchanged, so re-applying the function to a converted column is
  harmless. Strings with any other suffix (for example
  "Varies with device") and the empty string yield None.

  NOTE: the parameter name shadows the builtin ``str``; it is kept so the
  signature stays unchanged for existing callers.

  Raises:
    ValueError: if the text before a recognised suffix is not a number.
  """
  if isinstance(str, (int, float)):
    return str
  if not str:
    return None
  factor = {"M": 10**6, "k": 10**3}.get(str[-1])
  if factor is None:
    return None
  return float(str[:-1]) * factor

# This function is responsible for measuring the number of days passed from 2010 for each row in the column "Last Updated"

def handle_last_updated(str) :
    """Return the number of days between January 1, 2010 and the given date.

    The argument is parsed with the format "%B %d, %Y" (for example
    "August 1, 2018"). Dates before 2010 give a negative count.
    """
    reference_ordinal = datetime(2010, 1, 1).toordinal()
    updated_on = datetime.strptime(str, "%B %d, %Y")
    return updated_on.toordinal() - reference_ordinal

# Keep only rows whose Installs value is not the literal string "Free".
relevant_rows = df1['Installs'] != "Free"
relevant_rows
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not operate on a slice (SettingWithCopyWarning).
df1 = df1.loc[relevant_rows, :].copy()

# 4.2 Invoking the preprocessing functions

# Installs: "80,000+" -> 80000
df1["Installs"] = df1["Installs"].apply(get_number_from_string)

# Size: "19M" / "8.5k" -> bytes; other labels become missing values
df1["Size"] = df1["Size"].apply(handle_size)

# Last Updated: date string -> days since January 1, 2010
df1["Last Updated"] = df1["Last Updated"].apply(handle_last_updated)

# convert reviews to numeric
df1['Reviews'] = df1['Reviews'].astype(int)

df1.isnull().sum()

# Fill the sizes that could not be parsed with the median size.
df1['Size'] = df1.Size.fillna(df1.Size.median())

df1.isnull().sum()


# Headline facts about the cleaned table, each followed by a divider line.
separator = "*" * 100
summary_items = (
    ("The data table size is:", df1.shape),
    ("The columns name are:", df1.columns),
    ("The  distibution values is:", df1["Type"].value_counts()),
    ("The average rating score of all apps is: ", df1["Rating"].mean()),
    ("The min rating score of all apps is: ", df1["Rating"].min()),
    ("The max rating score of all apps is: ", df1["Rating"].max()),
)
for caption, value in summary_items:
    print(caption, value)
    print(separator)

# Peek at both ends of the table.
print("Printing the first 5 rows of the table: ")
print(df1.head(5))
print("Printing the last 5 rows of the table: ")
print(df1.tail(5))

# Pair plot of Rating, Size, Installs (natural log), Reviews (log10), Type and Price.
#
# Fixed: the columns used to be filtered independently (zero Installs and zero
# Reviews were dropped from their own series only) and then zipped by position,
# which paired values from different apps whenever a row was dropped. The rows
# are now selected once, so every plotted point comes from a single app.
pair_columns = ['Rating', 'Size', 'Installs', 'Reviews', 'Type']
valid_rows = (df1.Installs != 0) & (df1.Reviews != 0)
pair_source = df1.loc[valid_rows].dropna(subset=pair_columns)

x = pair_source['Rating']
y = pair_source['Size']
z = pair_source['Installs']
p = pair_source['Reviews']
t = pair_source['Type']
price = pair_source['Price']

pair_frame = pd.DataFrame({
    'Rating': x.values,
    'Size': y.values,
    'Installs': np.log(z.values),
    'Reviews': np.log10(p.values),
    'Type': t.values,
    'Price': price.values,
})
p = sns.pairplot(pair_frame, hue='Type', palette="Set2")

# Pie chart of the share of each app Type.
column = 'Type'
# rename_axis + reset_index(name=...) names both columns explicitly. The former
# reset_index() + rename relied on the pre-2.0 pandas column names ('index' and
# the series name); on pandas >= 2.0 it produced two 'count' columns and no
# 'Type' column.
grouped = df1[column].value_counts().rename_axis(column).reset_index(name='count')
print(grouped)
# Now plot the data
trace = go.Pie(labels=grouped[column],values=grouped['count'],pull=[0.05,0])
layout = {'title':'The Distribution of paid and not paid apps in the app store'}
fig = go.Figure(data=[trace],layout=layout)
iplot(fig)
# show it
plt.tight_layout()
plt.show()

# Bar chart of the percentage of apps in each Content Rating group.
# Columns are named explicitly so the result is the same on pandas < 2.0 and
# >= 2.0 (the former reset_index() + rename broke on >= 2.0).
vc = df1["Content Rating"].value_counts().rename_axis("type").reset_index(name="count")

# The total is computed once instead of once per row.
vc['percent'] = 100 * vc['count'] / vc['count'].sum()

vc=vc.sort_values("percent")
vc

trace = go.Bar(x=vc["type"], y=vc["percent"], name="Group", marker=dict(color="#6ad49b"))
layout={'title':'The size of each ranking group','xaxis':{'title':"Group name"}}
fig = go.Figure(data=trace, layout=layout)
iplot(fig)
# show it
plt.tight_layout()
plt.show()

# Number of apps per Category, sorted from the smallest category to the largest.
number_of_apps_in_category = df1['Category'].value_counts().sort_values(ascending=True)

# One pie slice per category; hovering shows the category label and its count.
data = [go.Pie(
        labels = number_of_apps_in_category.index,
        values = number_of_apps_in_category.values,
        hoverinfo = 'label+value'
    
)]

plotly.offline.iplot(data, filename='active_category')
# show it
# NOTE(review): these matplotlib calls follow a plotly figure; they look like
# leftovers from a template — confirm they are needed.
plt.tight_layout()
plt.show()

import squarify #for making treemap, we need squarify
# Treemap in which each rectangle is one distinct Rating value, sized by how
# many apps have that rating.
plt.figure(figsize=(20,8))
labels = df1['Rating'].value_counts().index.tolist()
# One colour per label, spread evenly across the Spectral colormap.
colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]
squarify.plot(sizes = df1['Rating'].value_counts(), label = labels, color = colors, alpha = 0.8)

# Histogram of ratings with bins of width 0.1 between 1 and 5.
data = [go.Histogram(
        x = df1.Rating,
        xbins = {'start': 1, 'size': 0.1, 'end' :5}
)]

print('Average app rating = ', np.mean(df1['Rating']))
plotly.offline.iplot(data, filename='overall_rating_distribution')

# Rows belonging to categories that contain at least 50 apps.
groups = df1.groupby('Category').filter(lambda x: len(x) >= 50).reset_index()

# Joint distribution of app size (bytes) and rating.
sns.set_style("darkgrid")
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally.
ax = sns.jointplot(x=df1['Size'], y=df1['Rating'])

# Categories with the most large apps. Size was converted to bytes earlier, so
# the threshold is 40 MB expressed in bytes; the previous "Size > 40" compared
# against 40 bytes and kept every row.
subset_df = df1[df1.Size > 40 * 10**6]
groups_temp = subset_df.groupby('Category').filter(lambda x: len(x) >20)
groups_temp['Category'].value_counts().head(n=8)

# Price vs rating for paid apps only.
paid_apps = df1[df1.Price>0]
# Keyword arguments are required by seaborn >= 0.12; the positional form
# ("Price", "Rating", paid_apps) is rejected there.
p = sns.jointplot(x="Price", y="Rating", data=paid_apps)

# Price distribution within a hand-picked set of categories.
subset_df = df1[df1.Category.isin(['GAME', 'FAMILY', 'PHOTOGRAPHY', 'MEDICAL', 'TOOLS', 'FINANCE',
                                 'LIFESTYLE','BUSINESS'])]
sns.set_style('darkgrid')
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
p = sns.stripplot(x="Price", y="Category", data=subset_df, jitter=True, linewidth=1)
title = ax.set_title('App pricing trend across categories')


# Which categories contain the apps priced above 250.
subset_df.loc[subset_df['Price']>250,'Category'].value_counts()

# Same strip plot restricted to apps priced below 100.
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
subset_df_price = subset_df[subset_df.Price<100]
p = sns.stripplot(x="Price", y="Category", data=subset_df_price, jitter=True, linewidth=1)
title = ax.set_title('App pricing trend across categories - after filtering for junk apps')

df1.Type.value_counts()

# Nested pie: the outer ring shows the size of four categories, the inner ring
# splits each of them into paid and free apps.

# Number of apps per (Category, Type) pair.
new_df = df1.groupby(['Category', 'Type']).agg({'App' : 'count'}).reset_index()

outer_group_names = ['GAME', 'FAMILY', 'MEDICAL', 'TOOLS']
outer_group_values = [len(df1.App[df1.Category == category]) for category in outer_group_names]

# One colormap per category; the inner ring uses lighter shades of the same map.
a, b, c, d=[plt.cm.Blues, plt.cm.Reds, plt.cm.Greens, plt.cm.Purples]


inner_group_names = ['Paid', 'Free'] * 4
inner_group_values = []


for category in outer_group_names:
    for t in ['Paid', 'Free']:
        x = new_df[new_df.Category == category]
        try:
            inner_group_values.append(int(x.App[x.Type == t].values[0]))
        except IndexError:
            # No rows for this (category, type) pair: the selection is empty,
            # so the slice gets size 0. Only this case is handled now; the
            # former bare "except:" also hid unrelated errors.
            inner_group_values.append(0)

explode = (0.025,0.025,0.025,0.025)
# First Ring (outside)
fig, ax = plt.subplots(figsize=(10,10))
ax.axis('equal')
mypie, texts, _ = ax.pie(outer_group_values, radius=1.2, labels=outer_group_names, autopct='%1.1f%%', pctdistance=1.1,
                                 labeldistance= 0.75,  explode = explode, colors=[a(0.6), b(0.6), c(0.6), d(0.6)], textprops={'fontsize': 16})
plt.setp( mypie, width=0.5, edgecolor='black')

# Second Ring (Inside)
mypie2, _ = ax.pie(inner_group_values, radius=1.2-0.5, labels=inner_group_names, labeldistance= 0.7,
                   textprops={'fontsize': 12}, colors = [a(0.4), a(0.2), b(0.4), b(0.2), c(0.4), c(0.2), d(0.4), d(0.2)])
plt.setp( mypie2, width=0.5, edgecolor='black')
plt.margins(0,0)

# show it
plt.title("The inner distribution of paid and free apps among Game,Tools,Medical and Family")
plt.tight_layout()
plt.show()


# Box plots of log10(Installs), one box for paid apps and one for free apps.
# boxpoints='all' draws every individual point next to its box.
# NOTE(review): log10 of an Installs value of 0 is -inf — confirm that no such
# rows reach this point.
trace0 = go.Box(
    y=np.log10(df1['Installs'][df1.Type=='Paid']),
    name = 'Paid',
    marker = dict(
        color = 'rgb(214, 12, 140)',
    ),
    boxpoints='all'
)
trace1 = go.Box(
    y=np.log10(df1['Installs'][df1.Type=='Free']),
    name = 'Free',
    marker = dict(
        color = 'rgb(0, 128, 128)',
    ),
    boxpoints='all'
)
layout = go.Layout(
    title = "Number of downloads of paid apps Vs free apps",
    yaxis= {'title': 'Number of downloads (log-scaled)'}
)
data = [trace0, trace1]
plotly.offline.iplot({'data': data, 'layout': layout})

# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
# The numeric columns are selected explicitly: pandas >= 2.0 raises when
# DataFrame.corr() meets text columns instead of silently skipping them.
corrmat = df1.select_dtypes(include='number').corr()
p =sns.heatmap(corrmat, annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

# Regression plot of log10(Reviews) against log10(Installs) for apps with more
# than 10 reviews and at least one install.
# .copy() makes df_copy an independent frame so the assignments below do not
# write into a slice of df1.
df_copy = df1[(df1.Reviews > 10) & (df1.Installs > 0)].copy()

# The logs are taken over the filtered rows only. Previously they were computed
# over all of df1 (including rows with zero installs) and then index-aligned
# back into df_copy.
df_copy['Installs'] = np.log10(df_copy['Installs'])
df_copy['Reviews'] = np.log10(df_copy['Reviews'])

# x/y by keyword: required by seaborn >= 0.12.
sns.lmplot(x="Reviews", y="Installs", data=df_copy)
ax = plt.gca()
_ = ax.set_title('Number of Reviews Vs Number of Downloads (Log scaled)')

#Converting Type classification into binary
def type_cat(types):
    """Map an app Type label to a binary code: 0 for 'Free', 1 for anything else."""
    return 0 if types == 'Free' else 1

# Type becomes 0 (Free) / 1 (anything else).
df1['Type'] = df1['Type'].map(type_cat)

# Content Rating becomes an integer code, numbered in order of first appearance.
RatingL = df1['Content Rating'].unique()
RatingDict = {label: code for code, label in enumerate(RatingL)}
df1['Content Rating'] = df1['Content Rating'].map(RatingDict).astype(int)

df1.info()

# Remove the columns that are not used as model features.
df1.drop(columns=['Last Updated','Current Ver','Android Ver','App'], inplace=True)

df1.head()

# One indicator column per Category value; df1 itself keeps the original column.
df2 = pd.get_dummies(df1, columns=['Category'])

df2.head()

from sklearn.decomposition import PCA
# Feature matrix: everything except the text columns and the target (Rating).
X = df1.drop(labels = ['Category','Rating','Genres'],axis = 1)
y = df1.Rating
print("The number of columns (features) before Dimension Reduction is: ", X.shape[1])

# Fit a PCA with all components and plot the cumulative explained variance.
# NOTE(review): X is passed to PCA unscaled here — confirm that is intended,
# since the columns are on very different scales.
pca = PCA().fit(X)
plt.plot(np.cumsum(pca.explained_variance_ratio_))

print(pca.explained_variance_ratio_)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

# Dimension Reduction:
# Project the features onto the first three principal components.
pca = PCA(n_components=3)
x_res = pca.fit_transform(X)

fig = plt.figure()
ax = fig.add_subplot(projection="3d")

ax.set_title('Data representation with 3 components', fontsize=10)
# NOTE(review): cmap is given without a `c` argument — presumably it has no
# effect on the point colours; confirm.
ax.scatter(x_res[:, 0], x_res[:, 1], x_res[:, 2],cmap=plt.cm.nipy_spectral,marker='o',edgecolor="k")

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

plt.subplots_adjust(right=1.3)

print(f"Before PCA we had {X.shape[1]} features after applying PCA we got only 3 Principal components" )


# Print the error metrics (MSE, MAE, MSLE) of a set of predictions
def Evaluation_metrics(y_true, y_predict):
    """Print the MSE, MAE and mean squared log error of y_predict against y_true."""
    labelled_scorers = (
        ('Mean Squared Error: ', metrics.mean_squared_error),
        ('Mean absolute Error: ', metrics.mean_absolute_error),
        ('Mean squared Log Error: ', metrics.mean_squared_log_error),
    )
    for label, scorer in labelled_scorers:
        print (label + str(scorer(y_true, y_predict)))

#to add into results_index for evaluation of error term 
def Evaluationmatrix_dict(y_true, y_predict, name = 'Linear - Integer'):
    """Return the error metrics of y_predict against y_true as one labelled record.

    The dict holds the series name followed by the mean squared error, the
    mean absolute error and the mean squared log error.
    """
    return {
        'Series Name': name,
        'Mean Squared Error': metrics.mean_squared_error(y_true,y_predict),
        'Mean Absolute Error': metrics.mean_absolute_error(y_true,y_predict),
        'Mean Squared Log Error': metrics.mean_squared_log_error(y_true,y_predict),
    }

# Linear regression on Rating, excluding the genre feature.

# Integer encoding: Category is represented by the Category_c code column.
X = df1.drop(labels = ['Category','Rating','Genres','Genres_c'],axis = 1)
y = df1.Rating

y = pd.DataFrame(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

lr_model = LinearRegression()
lr_model.fit(X_train,y_train)
results = lr_model.predict(X_test)
print("The linear regression model without including the 'Category' column:")
Evaluation_metrics(y_test, results)

print('*'*100)
# Dummy encoding: Category is represented by its indicator columns.

X_d = df2.drop(labels = ['Rating','Genres','Category_c','Genres_c'],axis = 1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
lr_model2 = LinearRegression()
lr_model2.fit(X_train_d,y_train_d)
results_d = lr_model2.predict(X_test_d)
print("The linear regression model with including the 'Category' column:")
# Fixed: the dummy model is scored against its own test targets. It used to be
# compared with y_test, which comes from a different random split, so the
# rows did not correspond to the predictions.
Evaluation_metrics(y_test_d, results_d)

plt.figure(figsize=(12,7))
# x/y are passed by keyword (required by seaborn >= 0.12). The integer-model
# predictions and targets are single-column 2-D objects (y is a DataFrame
# here), so they are flattened to 1-D first.
sns.regplot(x=np.ravel(results), y=np.ravel(y_test), color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=results_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('Linear model - Excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print ('Actual mean of population:' + str(y.mean()))
print ('Integer encoding(mean) :' + str(results.mean()))
print ('Dummy encoding(mean) :'+ str(results_d.mean()))
print ('Integer encoding(std) :' + str(results.std()))
print ('Dummy encoding(std) :'+ str(results_d.std()))

# SHAP explanation of the integer-encoded model over the full feature matrix.
explainer = shap.explainers.Linear(lr_model, X)
shap_values = explainer(X)
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0])

# Linear regression on Rating, including the integer-coded genre (Genres_c).

# Integer encoding
X = df1.drop(labels = ['Category','Rating','Genres'],axis = 1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
lr_model3 = LinearRegression()
lr_model3.fit(X_train,y_train)
Results = lr_model3.predict(X_test)

#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results, name = 'Linear(inc Genre) - Integer'),ignore_index = True)

# Dummy encoding

X_d = df2.drop(labels = ['Rating','Genres','Category_c'],axis = 1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
lr_model4 = LinearRegression()
lr_model4.fit(X_train_d,y_train_d)
Results_d = lr_model4.predict(X_test_d)

#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test_d,Results_d, name = 'Linear(inc Genre) - Dummy'),ignore_index = True)

plt.figure(figsize=(12,7))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.regplot(x=Results, y=y_test, color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=Results_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('Linear model - Including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print ('Integer encoding(mean) :' + str(Results.mean()))
print ('Dummy encoding(mean) :'+ str(Results_d.mean()))
print ('Integer encoding(std) :' + str(Results.std()))
print ('Dummy encoding(std) :'+ str(Results_d.std()))

# SHAP explanation of the integer-encoded model over the full feature matrix.
explainer = shap.explainers.Linear(lr_model3, X)
shap_values = explainer(X)
# visualize the first prediction's explanation
shap.plots.waterfall(shap_values[0])

# Support-vector regression on Rating, excluding the genre feature.
from sklearn import svm

# Integer encoding
X1 = df1.drop(labels =  ['Category','Rating','Genres','Genres_c'],axis = 1)
print(X1.columns)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X1.values, y.values, test_size=0.30)

svm_model1 = svm.SVR()
svm_model1.fit(X_train,y_train)

Results2 = svm_model1.predict(X_test)

#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results2, name = 'SVM - Integer'),ignore_index = True)

# Dummy encoding

X_d = df2.drop(labels = ['Rating','Genres','Category_c','Genres_c',],axis = 1)
y_d = df2.Rating

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)

svm_model2 = svm.SVR()
svm_model2.fit(X_train_d,y_train_d)

Results2_d = svm_model2.predict(X_test_d)

Evaluation_metrics(y_test_d, Results2_d)

#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test_d,Results2_d, name = 'SVM - Dummy'),ignore_index = True)

plt.figure(figsize=(12,7))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.regplot(x=Results2, y=y_test, color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=Results2_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('SVM model - excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print ('Integer encoding(mean) :' + str(Results2.mean()))
print ('Dummy encoding(mean) :'+ str(Results2_d.mean()))
print ('Integer encoding(std) :' + str(Results2.std()))
print ('Dummy encoding(std) :'+ str(Results2_d.std()))

# Support-vector regression on Rating, including the integer-coded genre.

# Integer encoding, including Genres_c
svm_model3 = svm.SVR()

X = df1.drop(labels = ['Category','Rating','Genres'],axis = 1)
y = df1.Rating

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

svm_model3.fit(X_train,y_train)

Results2a = svm_model3.predict(X_test)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results2a, name = 'SVM(inc Genres) - Integer'),ignore_index = True)

# Dummy encoding, including Genres_c
svm_model4 = svm.SVR()

X_d = df2.drop(labels = ['Rating','Genres','Category_c'],axis = 1)
y_d = df2.Rating

X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)

svm_model4.fit(X_train_d,y_train_d)

Results2a_d = svm_model4.predict(X_test_d)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test_d,Results2a_d, name = 'SVM(inc Genres) - Dummy'),ignore_index = True)

plt.figure(figsize=(12,7))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.regplot(x=Results2a, y=y_test, color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=Results2a_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('SVM model - including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print ('Integer encoding(mean) :' + str(Results2a.mean()))
print ('Dummy encoding(mean) :'+ str(Results2a_d.mean()))
print ('Integer encoding(std) :' + str(Results2a.std()))
print ('Dummy encoding(std) :'+ str(Results2a_d.std()))

# Random-forest regression on Rating, excluding the genre feature.
from sklearn.ensemble import RandomForestRegressor

# Integer encoding
X = df1.drop(labels = ['Category','Rating','Genres','Genres_c'],axis = 1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
# NOTE(review): only this model limits max_depth; the other forests use the
# default — confirm the difference is intentional.
rfr_model1 = RandomForestRegressor(max_depth=10)
rfr_model1.fit(X_train,y_train)
Results3 = rfr_model1.predict(X_test)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results3, name = 'RFR - Integer'),ignore_index = True)

# Dummy encoding

X_d = df2.drop(labels = ['Rating','Genres','Category_c','Genres_c'],axis = 1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
rfr_model2 = RandomForestRegressor()
rfr_model2.fit(X_train_d,y_train_d)
Results3_d = rfr_model2.predict(X_test_d)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results3_d, name = 'RFR - Dummy'),ignore_index = True)

plt.figure(figsize=(12,7))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.regplot(x=Results3, y=y_test, color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=Results3_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('RFR model - excluding Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

print ('Integer encoding(mean) :' + str(Results3.mean()))
print ('Dummy encoding(mean) :'+ str(Results3_d.mean()))
print ('Integer encoding(std) :' + str(Results3.std()))
print ('Dummy encoding(std) :'+ str(Results3_d.std()))

# Feature importances of the two random forests trained without genres.
#
# Fixed: the models were swapped. The integer-encoded columns (X) were paired
# with rfr_model2, which was trained on the dummy-encoded features, and the
# dummy columns (X_d) with rfr_model1. zip() silently truncated the longer
# side, so the bars were labelled with the wrong features.

#for integer
plt.figure(figsize=(12,7))
Feat_impt = {}
for col,feat in zip(X.columns,rfr_model1.feature_importances_):
    Feat_impt[col] = feat

Feat_impt_df = pd.DataFrame.from_dict(Feat_impt,orient = 'index')
Feat_impt_df.sort_values(by = 0, inplace = True)
Feat_impt_df.rename(index = str, columns = {0:'Pct'},inplace = True)

plt.figure(figsize= (14,10))
Feat_impt_df.plot(kind = 'barh',figsize= (14,10),legend = False)
plt.show()


#for dummy
Feat_impt_d = {}
for col,feat in zip(X_d.columns,rfr_model2.feature_importances_):
    Feat_impt_d[col] = feat

Feat_impt_df_d = pd.DataFrame.from_dict(Feat_impt_d,orient = 'index')
Feat_impt_df_d.sort_values(by = 0, inplace = True)
Feat_impt_df_d.rename(index = str, columns = {0:'Pct'},inplace = True)

plt.figure(figsize= (14,10))
Feat_impt_df_d.plot(kind = 'barh',figsize= (14,10),legend = False)
plt.show()

# Random-forest regression on Rating, including the integer-coded genre.

# Integer encoding
X = df1.drop(labels = ['Category','Rating','Genres'],axis = 1)
y = df1.Rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
rfr_model3 = RandomForestRegressor()
rfr_model3.fit(X_train,y_train)
Results3a = rfr_model3.predict(X_test)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results3a, name = 'RFR(inc Genres) - Integer'),ignore_index = True)

# Dummy encoding

X_d = df2.drop(labels = ['Rating','Genres','Category_c'],axis = 1)
y_d = df2.Rating
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_d, y_d, test_size=0.30)
rfr_model4 = RandomForestRegressor()
rfr_model4.fit(X_train_d,y_train_d)
Results3a_d = rfr_model4.predict(X_test_d)

#evaluation
#resultsdf = resultsdf.append(Evaluationmatrix_dict(y_test,Results3a_d, name = 'RFR(inc Genres) - Dummy'),ignore_index = True)

plt.figure(figsize=(12,7))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.regplot(x=Results3a, y=y_test, color='teal', label = 'Integer', marker = 'x')
sns.regplot(x=Results3a_d, y=y_test_d, color='orange', label = 'Dummy')
plt.legend()
plt.title('RFR model - including Genres')
plt.xlabel('Predicted Ratings')
plt.ylabel('Actual Ratings')
plt.show()

# Fixed: the summary reports the predictions of the models trained in this
# section (Results3a / Results3a_d). It used to print Results3 / Results3_d,
# which belong to the models trained without genres.
print ('Integer encoding(mean) :' + str(Results3a.mean()))
print ('Dummy encoding(mean) :'+ str(Results3a_d.mean()))
print ('Integer encoding(std) :' + str(Results3a.std()))
print ('Dummy encoding(std) :'+ str(Results3a_d.std()))


# Feature importances of the integer-encoded forest (rfr_model3), as a
# horizontal bar chart sorted from least to most important.
Feat_impt = dict(zip(X.columns, rfr_model3.feature_importances_))

Feat_impt_df = (
    pd.DataFrame.from_dict(Feat_impt, orient='index')
    .sort_values(by=0)
    .rename(index=str, columns={0: 'Pct'})
)

plt.figure(figsize=(14, 10))
Feat_impt_df.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()

# The same chart for the dummy-encoded forest (rfr_model4).
Feat_impt_d = dict(zip(X_d.columns, rfr_model4.feature_importances_))

Feat_impt_df_d = (
    pd.DataFrame.from_dict(Feat_impt_d, orient='index')
    .sort_values(by=0)
    .rename(index=str, columns={0: 'Pct'})
)

plt.figure(figsize=(14, 10))
Feat_impt_df_d.plot(kind='barh', figsize=(14, 10), legend=False)
plt.show()