#!/usr/bin/env python
# coding: utf-8

# In[2]:


import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

# Load the wellness survey CSV from its remote URL and print a summary of
# the resulting frame (columns, dtypes, non-null counts).
wellness_csv_url = 'https://www.dropbox.com/s/170bc3dimgn8ru8/wellness.csv?dl=1'
dfWell = pd.read_csv(wellness_csv_url)
dfWell.info()


# In[3]:


# Preview the first rows (only displayed when run as a notebook cell).
dfWell.head()


# In[4]:


# Distribution of Menstruation answers before missing values are filled.
dfWell['Menstruation'].value_counts()


# In[5]:


# Distribution of Nutrition answers before missing values are filled.
dfWell['Nutrition'].value_counts()


# In[6]:


# Distribution of NutritionAdjustment answers before missing values are filled.
dfWell['NutritionAdjustment'].value_counts()


# In[7]:


# Menstruation, Nutrition and NutritionAdjustment are categorical, so their
# missing entries are replaced with each column's most frequent value (mode).
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute-accessed Series. That form is a
# chained in-place assignment: recent pandas versions warn about it, and it
# does not update dfWell once copy-on-write is in effect.
for categorical_col in ('Menstruation', 'Nutrition', 'NutritionAdjustment'):
    # mode() returns a Series of the most frequent value(s); take the first.
    most_common = dfWell[categorical_col].mode()[0]
    dfWell[categorical_col] = dfWell[categorical_col].fillna(most_common)
dfWell.info()


# In[8]:


# Distribution of Menstruation answers after the mode fill.
dfWell['Menstruation'].value_counts()


# In[9]:


# Distribution of Nutrition answers after the mode fill.
dfWell['Nutrition'].value_counts()


# In[10]:


# Distribution of NutritionAdjustment answers after the mode fill.
dfWell['NutritionAdjustment'].value_counts()


# In[11]:


# The USG measurement is reported to be missing for roughly 87% of rows, so
# imputing it with random or mean values would not be meaningful.
# Both USG-related columns are dropped instead.
usg_columns = ['USGMeasurement', 'USG']
dfWell = dfWell.drop(columns=usg_columns)
dfWell.head()


# In[12]:


# BedTime and WakeTime are not needed; SleepHours is used instead.
dfWell = dfWell.drop(columns=['BedTime', 'WakeTime'])
# TrainingReadiness is stored as a string with a trailing '%'. Strip the sign,
# parse the number as float, then divide by 100 and multiply by 7.
# NOTE(review): the factor of 7 presumably puts the value on the same scale
# as the other survey items - confirm.
readiness_percent = dfWell['TrainingReadiness'].str.rstrip('%').astype('float')
dfWell['TrainingReadiness'] = readiness_percent / 100.0 * 7


# In[13]:


# Distribution of the Pain answers.
dfWell['Pain'].value_counts()


# In[14]:


# Distribution of the Illness answers.
dfWell['Illness'].value_counts()


# In[15]:


# Encode the categorical answers as integers.
# Binary columns: 'No' -> 1, anything else -> 0.
dfWell['Pain'] = np.where(dfWell['Pain'] == 'No', 1, 0)
dfWell['Menstruation'] = np.where(dfWell['Menstruation'] == 'No', 1, 0)

# Three-level columns: the two listed answers map to 0 and 1; every other
# value (including any missing one) falls through to the default of 2.
nutrition = dfWell['Nutrition']
dfWell['Nutrition'] = np.select(
    [nutrition == 'Poor', nutrition == 'Okay'], [0, 1], default=2)

illness = dfWell['Illness']
dfWell['Illness'] = np.select(
    [illness == 'Yes', illness == 'Slightly Off'], [0, 1], default=2)

adjustment = dfWell['NutritionAdjustment']
dfWell['NutritionAdjustment'] = np.select(
    [adjustment == 'No', adjustment == 'I Don\'t Know'], [0, 1], default=2)
dfWell.head()


# In[16]:


from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
# Work on every column except the identifiers and MonitoringScore.
# Index.difference returns the remaining column labels in sorted order.
excluded_columns = ['Date', 'PlayerID', 'MonitoringScore']
df = dfWell[dfWell.columns.difference(excluded_columns)]
# Bartlett's sphericity test on the selected columns: statistic and p-value.
chi_square_value, p_value = calculate_bartlett_sphericity(df)
chi_square_value, p_value


# In[17]:


from factor_analyzer.factor_analyzer import calculate_kmo
# KMO on the same columns; the second returned value is the one inspected.
kmo_all, kmo_model = calculate_kmo(df)
kmo_model


# In[18]:


# Fit an unrotated 12-factor model so the eigenvalues can be inspected.
# NOTE(review): analyze() looks like the older factor_analyzer interface;
# newer releases appear to use fit() - confirm against the installed version.
fa = FactorAnalyzer()
fa.analyze(df, 12, rotation=None)
# get_eigenvalues() returns two values; only the first (ev) is used later.
ev, v = fa.get_eigenvalues()
ev


# In[19]:


# Scree plot: eigenvalue against factor number, drawn as both points and a
# connecting line. Factor numbers run from 1 to the number of columns in df.
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()


# In[20]:


# Refit with four factors and a varimax rotation, then inspect the loadings.
fa.analyze(df, 4, rotation="varimax")
fa.loadings


# In[21]:


# Variance figures reported by the fitted model for each factor.
fa.get_factor_variance()


# In[22]:


# Project the observed variables onto the four factors by multiplying the
# data matrix with the loadings, and name the resulting columns.
factor_names = ["Energy","Nourishment","Discomfort","Determination"]
df_fact = pd.DataFrame(np.dot(df, fa.loadings), columns=factor_names)
df_fact


# In[23]:


# Append the factor-score columns to the wellness frame (aligned on index).
dfWell = pd.concat([dfWell, df_fact], axis='columns')
dfWell.info()


# In[24]:


# Keep only the identifiers, MonitoringScore and the four factor scores.
kept_columns = ["Date","PlayerID","MonitoringScore",
                "Energy","Nourishment","Discomfort","Determination"]
dfWell = dfWell[kept_columns]
dfWell


# In[25]:


# Correlation heatmap of MonitoringScore and the four factor scores
# (everything except the identifier columns).
identifier_columns = ['Date','PlayerID']
df = dfWell[dfWell.columns.difference(identifier_columns)]
correlations = df.corr()
ax = sns.heatmap(correlations, annot=True, fmt=".2f")
# Shift the lower y-limit by +0.5 and the upper by -0.5.
# NOTE(review): this looks like the workaround for heatmap rows being cut off
# in some matplotlib releases - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()


# In[26]:


# MonitoringScore was only needed for the correlation check; remove it so the
# frame holds just the identifiers and the factor scores.
dfWell = dfWell.drop(columns=['MonitoringScore'])
dfWell.head()