#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().system('pip install pandas numpy matplotlib seaborn')


# In[2]:

# Import the core analysis and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# In[3]:

# Widen pandas display limits so wide frames print fully
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


# In[4]:

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


# In[5]:

file_path = 'Employee_Attrition.csv'
data = pd.read_csv(file_path)


# In[6]:

print(data.head())


# In[7]:

print("\nBasic Descriptive Statistics:")
print(data.describe())


# In[8]:

# Transpose the preview so every column fits on screen
print(data.head().T)


# In[9]:

from IPython.display import display
display(data.describe())


# In[10]:

display(data.head())


# In[11]:

print("\nMissing Values:")
print(data.isnull().sum())


# In[12]:

data.shape


# In[13]:

# Row-dropping alternative, kept for comparison with the imputation below
data_cleaned = data.dropna()
print("New Data Shape:", data_cleaned.shape)


# In[14]:

# Impute numeric columns with the mean and categorical columns with the mode.
# Assigning back avoids the chained-assignment pitfall of fillna(..., inplace=True).
for col in data.select_dtypes(include=np.number).columns:
    data[col] = data[col].fillna(data[col].mean())
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].fillna(data[col].mode()[0])
print("Data Shape after Imputation:", data.shape)


# In[15]:

print("Missing values after imputation:", data.isnull().sum().sum())


# In[18]:

numerical_features = data.select_dtypes(include=np.number).columns.tolist()
categorical_features = data.select_dtypes(include='object').columns.tolist()
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)


# In[22]:

# Histogram (with KDE) for every numeric column
for col in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


# In[24]:

# Count plot for every categorical column
for col in categorical_features:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, data=data)
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()


# In[25]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Scale numeric columns and one-hot encode categoricals in one step.
# sparse_output=False (scikit-learn >= 1.2) guarantees a dense array,
# which the DataFrame construction below requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)
    ])

data_processed = preprocessor.fit_transform(data)


# In[29]:

cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numerical_features + cat_features.tolist()
data_processed_df = pd.DataFrame(data_processed, columns=all_features)


# In[30]:

print(data_processed_df.describe())


# In[31]:

pd.set_option('display.float_format', lambda x: '%.2f' % x)
data_processed_df.describe()


# In[32]:

selected_columns = ['Age', 'DailyRate', 'DistanceFromHome', 'MonthlyIncome', 'Attrition_No', 'Attrition_Yes']
data_processed_df[selected_columns].describe()


# In[33]:

# Age Distribution
sns.histplot(data['Age'], kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()


# In[34]:

# Job Role Distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=data, y='JobRole')
plt.title('Job Role Distribution')
plt.xlabel('Count')
plt.ylabel('Job Role')
plt.show()


# In[36]:

# Monthly Income Distribution
sns.histplot(data['MonthlyIncome'], kde=True)
plt.title('Monthly Income Distribution')
plt.xlabel('Monthly Income')
plt.ylabel('Count')
plt.show()
# In[39]:

# Attrition Rate
sns.countplot(data=data, x='Attrition')
plt.title('Attrition Rate')
plt.xlabel('Attrition')
plt.ylabel('Count')
plt.show()


# In[40]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


# In[42]:

# The Attrition column supplies the class labels for the supervised
# projection; the original notebook used class_labels without defining it.
class_labels = data['Attrition']

lda = LDA(n_components=1)
data_lda = lda.fit_transform(data_processed, class_labels)

# Note: data_processed still contains the one-hot encoded Attrition columns,
# so the class separation shown here is inflated by that leakage.
plt.hist(data_lda[class_labels == 'Yes'], color='red', alpha=0.5, label='Attrition Yes')
plt.hist(data_lda[class_labels == 'No'], color='blue', alpha=0.5, label='Attrition No')
plt.title('LDA with 1 Component')
plt.xlabel('LD1')
plt.ylabel('Frequency')
plt.legend()
plt.show()


# In[44]:

print(data_lda[:10])


# In[45]:

from sklearn.decomposition import PCA


# In[46]:

pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_processed)


# In[47]:

plt.figure(figsize=(8, 6))
plt.scatter(data_pca[:, 0], data_pca[:, 1],
            c=class_labels.map({'Yes': 1, 'No': 0}), cmap='rainbow', alpha=0.7)
plt.title('PCA with 2 Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.show()


# Statistical Analysis
#
# Descriptive Statistics: Highlight key insights, such as average monthly income, typical job satisfaction scores, and other notable findings.
# Correlation Analysis: Point out any strong correlations that might influence employee satisfaction or productivity (see the first sketch below).
# Hypothesis Testing: Discuss whether the t-test and ANOVA results reveal significant differences that the company should consider (see the second sketch below).
# Regression Analysis: Explain how factors like years at the company are predictive of monthly income (see the third sketch below).
# Use Clear Visualizations: Include graphs and charts that clearly demonstrate your findings.
# Use Appropriate Terminology: Explain statistical terms in a way that's accessible to stakeholders without a statistical background.
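# In[48]:

# Correlation analysis -- a minimal sketch of the step described above, using
# the numerical_features list already built in this notebook. The heatmap and
# the top-pairs printout are illustrative choices, not the notebook's original
# code; note that each pair appears twice in the unstacked matrix.
corr_matrix = data[numerical_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Surface the strongest absolute correlations, excluding self-correlations
corr_pairs = corr_matrix.unstack().sort_values(key=abs, ascending=False)
print(corr_pairs[corr_pairs < 1.0].head(10))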
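# In[49]:

# Hypothesis testing -- a minimal sketch, assuming the Attrition, MonthlyIncome,
# and JobRole columns used elsewhere in this notebook. The Welch t-test compares
# monthly income between leavers and stayers; the one-way ANOVA asks whether
# mean income differs across job roles.
from scipy import stats

income_yes = data.loc[data['Attrition'] == 'Yes', 'MonthlyIncome']
income_no = data.loc[data['Attrition'] == 'No', 'MonthlyIncome']
t_stat, t_p = stats.ttest_ind(income_yes, income_no, equal_var=False)
print(f"Welch t-test (income by attrition): t = {t_stat:.2f}, p = {t_p:.4f}")

role_groups = [grp['MonthlyIncome'] for _, grp in data.groupby('JobRole')]
f_stat, f_p = stats.f_oneway(*role_groups)
print(f"One-way ANOVA (income by job role): F = {f_stat:.2f}, p = {f_p:.4f}")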
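# In[50]:

# Regression analysis -- a minimal sketch of predicting monthly income from
# tenure, as the narrative above suggests. 'YearsAtCompany' is an assumed
# column name (the notebook never prints it); swap in the actual column if
# this dataset names it differently.
from sklearn.linear_model import LinearRegression

X = data[['YearsAtCompany']]
y = data['MonthlyIncome']

reg = LinearRegression().fit(X, y)
print(f"Slope (income per extra year): {reg.coef_[0]:.2f}")
print(f"Intercept: {reg.intercept_:.2f}")
print(f"R^2: {reg.score(X, y):.3f}")

# Overlay the fitted line on the raw points
plt.scatter(X, y, alpha=0.3)
plt.plot(X, reg.predict(X), color='red')
plt.title('Monthly Income vs. Years at Company')
plt.xlabel('Years at Company')
plt.ylabel('Monthly Income')
plt.show()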