#!/usr/bin/env python
# coding: utf-8
"""Clean the wellness survey data and reduce it to four latent factors.

The script keeps the ``# In[N]`` cell markers of the notebook it appears to
have been exported from.  Bare expressions such as ``dfWell.head()`` only
display their value in a notebook / interactive session; they are kept so the
cells behave the same when pasted back into a notebook.

Steps:
1. Load the wellness CSV and impute / drop / encode columns.
2. Run Bartlett's sphericity test and the KMO measure on the feature matrix.
3. Inspect eigenvalues (scree plot), then fit a 4-factor varimax solution.
4. Project the features onto the loadings and keep only the factor columns.
"""

# In[2]:

import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
import matplotlib.pyplot as plt
import seaborn as sns

dfWell = pd.read_csv('https://www.dropbox.com/s/170bc3dimgn8ru8/wellness.csv?dl=1')
dfWell.info()


# In[3]:

dfWell.head()


# In[4]:

dfWell['Menstruation'].value_counts()


# In[5]:

dfWell['Nutrition'].value_counts()


# In[6]:

dfWell['NutritionAdjustment'].value_counts()


# In[7]:

# As Menstruation, Nutrition and Nutrition Adjustment are categorical,
# fill NA with their modes.
# Assign the filled Series back to the column instead of calling
# ``fillna(..., inplace=True)`` on ``dfWell.<col>``: the in-place form mutates
# an intermediate object and is not guaranteed to update ``dfWell`` (it is a
# no-op under pandas Copy-on-Write).
for categorical_col in ['Menstruation', 'Nutrition', 'NutritionAdjustment']:
    most_frequent = dfWell[categorical_col].mode()[0]
    dfWell[categorical_col] = dfWell[categorical_col].fillna(most_frequent)
dfWell.info()


# In[8]:

dfWell['Menstruation'].value_counts()


# In[9]:

dfWell['Nutrition'].value_counts()


# In[10]:

dfWell['NutritionAdjustment'].value_counts()


# In[11]:

# As USG measurement has almost 87% NA, it doesn't make sense to fill NA with
# random or mean values. Hence, drop the USG related columns.
dfWell = dfWell.drop(['USGMeasurement', 'USG'], axis=1)
dfWell.head()


# In[12]:

# We do not require Bed Time and Wake Time as well.
# We'll be using SleepHours instead.
dfWell = dfWell.drop(['BedTime', 'WakeTime'], axis=1)

# Convert TrainingReadiness from a percentage string (e.g. "85%") to a number
# rescaled to the 0-7 range (percent / 100 * 7).
# NOTE(review): the factor 7 presumably matches the scale of the other survey
# items - confirm against the data dictionary.
readiness_percent = dfWell['TrainingReadiness'].str.rstrip('%').astype('float')
dfWell['TrainingReadiness'] = readiness_percent / 100.0 * 7


# In[13]:

dfWell['Pain'].value_counts()


# In[14]:

dfWell['Illness'].value_counts()


# In[15]:

# Convert the categorical answers into numerical values.
# Every value that is not explicitly matched (including NaN) falls into the
# final "else" value of each np.where chain.
dfWell['Pain'] = np.where(dfWell['Pain'] == 'No', 1, 0)
dfWell['Menstruation'] = np.where(dfWell['Menstruation'] == 'No', 1, 0)
dfWell['Nutrition'] = np.where(
    dfWell['Nutrition'] == 'Poor', 0,
    np.where(dfWell['Nutrition'] == 'Okay', 1, 2),
)
dfWell['Illness'] = np.where(
    dfWell['Illness'] == 'Yes', 0,
    np.where(dfWell['Illness'] == 'Slightly Off', 1, 2),
)
dfWell['NutritionAdjustment'] = np.where(
    dfWell['NutritionAdjustment'] == 'No', 0,
    np.where(dfWell['NutritionAdjustment'] == "I Don't Know", 1, 2),
)
dfWell.head()


# In[16]:

# Feature matrix: every column except the identifiers and the target score.
# ``Index.difference`` returns the remaining column names in sorted order.
df = dfWell[dfWell.columns.difference(['Date', 'PlayerID', 'MonitoringScore'])]
chi_square_value, p_value = calculate_bartlett_sphericity(df)
chi_square_value, p_value


# In[17]:

kmo_all, kmo_model = calculate_kmo(df)
kmo_model


# In[18]:

# Create factor analysis object and perform an unrotated factor analysis.
# NOTE(review): ``analyze`` / ``loadings`` belong to the legacy
# factor_analyzer API; newer releases use
# ``FactorAnalyzer(n_factors=..., rotation=...).fit(df)`` and ``loadings_``.
# Confirm which version this project pins before upgrading.
fa = FactorAnalyzer()
fa.analyze(df, 12, rotation=None)
# Check eigenvalues
ev, v = fa.get_eigenvalues()
ev


# In[19]:

# Create scree plot using matplotlib
factor_numbers = range(1, df.shape[1] + 1)
plt.scatter(factor_numbers, ev)
plt.plot(factor_numbers, ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()


# In[20]:

fa.analyze(df, 4, rotation="varimax")
fa.loadings


# In[21]:

fa.get_factor_variance()


# In[22]:

# Project the feature matrix onto the four factor loadings.  The result keeps
# ``df``'s index so that the column-wise concat below aligns row-for-row with
# ``dfWell`` regardless of how ``dfWell`` is indexed.
df_fact = pd.DataFrame(
    np.dot(df, fa.loadings),
    index=df.index,
    columns=["Energy", "Nourishment", "Discomfort", "Determination"],
)
df_fact


# In[23]:

dfWell = pd.concat([dfWell, df_fact], axis=1)
dfWell.info()


# In[24]:

dfWell = dfWell[["Date", "PlayerID", "MonitoringScore",
                 "Energy", "Nourishment", "Discomfort", "Determination"]]
dfWell


# In[25]:

# Correlation heatmap of the monitoring score against the derived factors.
df = dfWell[dfWell.columns.difference(['Date', 'PlayerID'])]
ax = sns.heatmap(df.corr(), annot=True, fmt=".2f")
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for heatmap rows being cut off in some
# matplotlib releases - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.show()


# In[26]:

dfWell = dfWell.drop(['MonitoringScore'], axis=1)
dfWell.head()