!pip install pandas numpy matplotlib seaborn
Requirement already satisfied: pandas in c:\users\luke holmes\anaconda3\lib\site-packages (1.5.3) Requirement already satisfied: numpy in c:\users\luke holmes\anaconda3\lib\site-packages (1.24.3) Requirement already satisfied: matplotlib in c:\users\luke holmes\anaconda3\lib\site-packages (3.7.1) Requirement already satisfied: seaborn in c:\users\luke holmes\anaconda3\lib\site-packages (0.12.2) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from pandas) (2022.7) Requirement already satisfied: contourpy>=1.0.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (23.0) Requirement already satisfied: pillow>=6.2.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (9.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: six>=1.5 in c:\users\luke holmes\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Widen pandas' text display so wide frames print without truncation:
# up to 100 rows, every column (None = no limit), 1000 characters per line.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# Load the attrition dataset (path is relative to the working directory)
# and preview the first five rows.
file_path = 'Employee_Attrition.csv'
data = pd.read_csv(file_path)
print(data.head())
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager 0 41.920786 Yes Travel_Rarely 990.110009 Sales 1.016716 1.770313 Life Sciences 1.033298 1.054148 2.146294 Male 93.827486 3.426960 2.464492 Sales Executive 3.397665 Single 6438.949336 20845.103714 7.077818 Y Yes 9.898765 2.844897 1.012832 95.612654 0.000000 8.063833 0.000000 1.052104 5.795945 3.993896 0.000000 4.594972 1 51.589037 No Travel_Frequently 276.776030 Research & Development NaN 1.035333 Life Sciences 0.944859 2.107031 3.470803 Male 52.304157 1.813458 1.788697 Research Scientist 1.584059 Married 4457.659622 24877.898697 1.114423 Y No NaN 4.120281 4.312008 85.370868 1.104642 8.965059 3.264952 2.526470 10.070232 7.937505 1.032295 5.901207 2 33.131540 Yes Travel_Rarely 1204.158501 Research & Development 2.198662 2.118801 Other 0.810809 4.008125 3.979420 Male 87.271332 1.877226 0.833488 Laboratory Technician 3.301009 Single 2124.545220 2262.942954 5.539591 Y Yes 15.401596 2.781168 2.332558 76.248084 0.000000 8.427238 3.004521 2.924784 0.000000 0.000000 0.000000 0.000000 3 34.707073 No Travel_Frequently 1352.752432 Sales 2.937851 3.670674 Life Sciences 0.822976 4.587269 3.734705 Female 56.378982 2.932646 1.095372 Research Scientist 3.220844 Married 2771.699080 27189.905714 0.772793 NaN Yes 9.757546 2.769576 3.029591 75.888697 0.000000 7.662606 2.856001 3.117053 8.435324 5.563667 2.845429 0.000000 4 24.790188 No Travel_Rarely 485.053333 Research & Development 1.988953 1.120496 Medical NaN 7.228740 1.079412 Male 40.433887 3.117739 0.937746 Laboratory 
Technician 1.697330 Married 3254.601575 17675.541599 10.118125 NaN NaN 11.920147 3.282183 NaN 72.670937 0.953012 6.241733 2.814719 3.112317 2.040124 1.970786 2.495949 2.219353
# Summary statistics (count, mean, std, min, quartiles, max) as produced by
# DataFrame.describe() with its default settings.
numeric_summary = data.describe()
print("\nBasic Descriptive Statistics:")
print(numeric_summary)
Basic Descriptive Statistics: Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager count 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 1323.000000 mean 36.635268 802.030050 9.091097 2.923447 0.999325 1026.679166 2.726443 65.647722 2.743028 2.058270 2.715272 6544.718401 14364.334764 2.654394 15.295717 3.149344 2.720431 80.241313 0.814757 11.426152 2.811756 2.762825 6.933624 4.233363 2.160710 4.117695 std 9.882161 414.031025 8.177870 1.072203 0.099656 610.480802 1.129679 21.469709 0.765568 1.127018 1.146786 4766.580903 7269.420895 2.508952 3.995702 0.476027 1.115959 8.022535 0.863076 8.075870 1.329044 0.761802 6.046798 3.658111 3.221481 3.594053 min 14.544608 86.828045 0.750112 0.770362 0.628985 1.054148 0.759655 23.855272 0.722008 0.715172 0.724192 891.481007 1567.502382 0.000000 7.684723 2.116951 0.720696 53.793059 0.000000 0.000000 0.000000 0.799138 0.000000 0.000000 0.000000 0.000000 25% 29.624982 456.479553 2.147069 2.122706 0.931146 501.854842 1.894818 47.701897 2.177638 1.045228 1.846039 2961.844066 8138.576694 0.948843 12.195197 2.845502 1.909010 74.882946 0.000000 5.953221 1.967328 2.193390 2.673269 1.824392 0.000000 1.687177 50% 34.951950 782.958705 6.920808 3.015658 0.998949 1022.025017 2.887313 64.934490 2.859143 1.936150 2.895032 4957.070475 14258.293365 1.708812 14.495151 3.064551 2.891235 80.351400 0.913651 9.306742 2.690934 2.899807 5.422160 
2.968874 0.967140 2.939240 75% 42.748741 1130.577949 13.584900 3.700560 1.067371 1533.811096 3.676600 83.038355 3.179128 2.609707 3.676663 8268.002496 20322.279885 3.959710 17.745736 3.335525 3.639961 85.664169 1.082523 15.324548 3.366949 3.206117 9.238005 7.052591 2.586631 6.987964 max 69.402515 1784.394456 33.684424 6.285161 1.290613 2530.919375 5.254672 123.200439 5.241968 5.942269 5.251277 22858.020388 32751.715800 10.855733 29.797261 5.013371 5.092280 107.378705 3.474319 47.641892 7.503201 4.856086 36.848408 19.091844 17.273662 19.976003
# Print the five-row preview transposed, so each original column is shown as a row.
print(data.head().T)
0 1 2 3 4 Age 41.920786 51.589037 33.13154 34.707073 24.790188 Attrition Yes No Yes No No BusinessTravel Travel_Rarely Travel_Frequently Travel_Rarely Travel_Frequently Travel_Rarely DailyRate 990.110009 276.77603 1204.158501 1352.752432 485.053333 Department Sales Research & Development Research & Development Sales Research & Development DistanceFromHome 1.016716 NaN 2.198662 2.937851 1.988953 Education 1.770313 1.035333 2.118801 3.670674 1.120496 EducationField Life Sciences Life Sciences Other Life Sciences Medical EmployeeCount 1.033298 0.944859 0.810809 0.822976 NaN EmployeeNumber 1.054148 2.107031 4.008125 4.587269 7.22874 EnvironmentSatisfaction 2.146294 3.470803 3.97942 3.734705 1.079412 Gender Male Male Male Female Male HourlyRate 93.827486 52.304157 87.271332 56.378982 40.433887 JobInvolvement 3.42696 1.813458 1.877226 2.932646 3.117739 JobLevel 2.464492 1.788697 0.833488 1.095372 0.937746 JobRole Sales Executive Research Scientist Laboratory Technician Research Scientist Laboratory Technician JobSatisfaction 3.397665 1.584059 3.301009 3.220844 1.69733 MaritalStatus Single Married Single Married Married MonthlyIncome 6438.949336 4457.659622 2124.54522 2771.69908 3254.601575 MonthlyRate 20845.103714 24877.898697 2262.942954 27189.905714 17675.541599 NumCompaniesWorked 7.077818 1.114423 5.539591 0.772793 10.118125 Over18 Y Y Y NaN NaN OverTime Yes No Yes Yes NaN PercentSalaryHike 9.898765 NaN 15.401596 9.757546 11.920147 PerformanceRating 2.844897 4.120281 2.781168 2.769576 3.282183 RelationshipSatisfaction 1.012832 4.312008 2.332558 3.029591 NaN StandardHours 95.612654 85.370868 76.248084 75.888697 72.670937 StockOptionLevel 0.0 1.104642 0.0 0.0 0.953012 TotalWorkingYears 8.063833 8.965059 8.427238 7.662606 6.241733 TrainingTimesLastYear 0.0 3.264952 3.004521 2.856001 2.814719 WorkLifeBalance 1.052104 2.52647 2.924784 3.117053 3.112317 YearsAtCompany 5.795945 10.070232 0.0 8.435324 2.040124 YearsInCurrentRole 3.993896 7.937505 0.0 5.563667 1.970786 
YearsSinceLastPromotion 0.0 1.032295 0.0 2.845429 2.495949 YearsWithCurrManager 4.594972 5.901207 0.0 0.0 2.219353
# Show the same describe() summary through IPython's `display` instead of `print`.
from IPython.display import display
display(data.describe())
Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 | 1323.000000 |
mean | 36.635268 | 802.030050 | 9.091097 | 2.923447 | 0.999325 | 1026.679166 | 2.726443 | 65.647722 | 2.743028 | 2.058270 | 2.715272 | 6544.718401 | 14364.334764 | 2.654394 | 15.295717 | 3.149344 | 2.720431 | 80.241313 | 0.814757 | 11.426152 | 2.811756 | 2.762825 | 6.933624 | 4.233363 | 2.160710 | 4.117695 |
std | 9.882161 | 414.031025 | 8.177870 | 1.072203 | 0.099656 | 610.480802 | 1.129679 | 21.469709 | 0.765568 | 1.127018 | 1.146786 | 4766.580903 | 7269.420895 | 2.508952 | 3.995702 | 0.476027 | 1.115959 | 8.022535 | 0.863076 | 8.075870 | 1.329044 | 0.761802 | 6.046798 | 3.658111 | 3.221481 | 3.594053 |
min | 14.544608 | 86.828045 | 0.750112 | 0.770362 | 0.628985 | 1.054148 | 0.759655 | 23.855272 | 0.722008 | 0.715172 | 0.724192 | 891.481007 | 1567.502382 | 0.000000 | 7.684723 | 2.116951 | 0.720696 | 53.793059 | 0.000000 | 0.000000 | 0.000000 | 0.799138 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 29.624982 | 456.479553 | 2.147069 | 2.122706 | 0.931146 | 501.854842 | 1.894818 | 47.701897 | 2.177638 | 1.045228 | 1.846039 | 2961.844066 | 8138.576694 | 0.948843 | 12.195197 | 2.845502 | 1.909010 | 74.882946 | 0.000000 | 5.953221 | 1.967328 | 2.193390 | 2.673269 | 1.824392 | 0.000000 | 1.687177 |
50% | 34.951950 | 782.958705 | 6.920808 | 3.015658 | 0.998949 | 1022.025017 | 2.887313 | 64.934490 | 2.859143 | 1.936150 | 2.895032 | 4957.070475 | 14258.293365 | 1.708812 | 14.495151 | 3.064551 | 2.891235 | 80.351400 | 0.913651 | 9.306742 | 2.690934 | 2.899807 | 5.422160 | 2.968874 | 0.967140 | 2.939240 |
75% | 42.748741 | 1130.577949 | 13.584900 | 3.700560 | 1.067371 | 1533.811096 | 3.676600 | 83.038355 | 3.179128 | 2.609707 | 3.676663 | 8268.002496 | 20322.279885 | 3.959710 | 17.745736 | 3.335525 | 3.639961 | 85.664169 | 1.082523 | 15.324548 | 3.366949 | 3.206117 | 9.238005 | 7.052591 | 2.586631 | 6.987964 |
max | 69.402515 | 1784.394456 | 33.684424 | 6.285161 | 1.290613 | 2530.919375 | 5.254672 | 123.200439 | 5.241968 | 5.942269 | 5.251277 | 22858.020388 | 32751.715800 | 10.855733 | 29.797261 | 5.013371 | 5.092280 | 107.378705 | 3.474319 | 47.641892 | 7.503201 | 4.856086 | 36.848408 | 19.091844 | 17.273662 | 19.976003 |
# Show the first five rows through IPython's `display` instead of `print`.
display(data.head())
Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 41.920786 | Yes | Travel_Rarely | 990.110009 | Sales | 1.016716 | 1.770313 | Life Sciences | 1.033298 | 1.054148 | 2.146294 | Male | 93.827486 | 3.426960 | 2.464492 | Sales Executive | 3.397665 | Single | 6438.949336 | 20845.103714 | 7.077818 | Y | Yes | 9.898765 | 2.844897 | 1.012832 | 95.612654 | 0.000000 | 8.063833 | 0.000000 | 1.052104 | 5.795945 | 3.993896 | 0.000000 | 4.594972 |
1 | 51.589037 | No | Travel_Frequently | 276.776030 | Research & Development | NaN | 1.035333 | Life Sciences | 0.944859 | 2.107031 | 3.470803 | Male | 52.304157 | 1.813458 | 1.788697 | Research Scientist | 1.584059 | Married | 4457.659622 | 24877.898697 | 1.114423 | Y | No | NaN | 4.120281 | 4.312008 | 85.370868 | 1.104642 | 8.965059 | 3.264952 | 2.526470 | 10.070232 | 7.937505 | 1.032295 | 5.901207 |
2 | 33.131540 | Yes | Travel_Rarely | 1204.158501 | Research & Development | 2.198662 | 2.118801 | Other | 0.810809 | 4.008125 | 3.979420 | Male | 87.271332 | 1.877226 | 0.833488 | Laboratory Technician | 3.301009 | Single | 2124.545220 | 2262.942954 | 5.539591 | Y | Yes | 15.401596 | 2.781168 | 2.332558 | 76.248084 | 0.000000 | 8.427238 | 3.004521 | 2.924784 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
3 | 34.707073 | No | Travel_Frequently | 1352.752432 | Sales | 2.937851 | 3.670674 | Life Sciences | 0.822976 | 4.587269 | 3.734705 | Female | 56.378982 | 2.932646 | 1.095372 | Research Scientist | 3.220844 | Married | 2771.699080 | 27189.905714 | 0.772793 | NaN | Yes | 9.757546 | 2.769576 | 3.029591 | 75.888697 | 0.000000 | 7.662606 | 2.856001 | 3.117053 | 8.435324 | 5.563667 | 2.845429 | 0.000000 |
4 | 24.790188 | No | Travel_Rarely | 485.053333 | Research & Development | 1.988953 | 1.120496 | Medical | NaN | 7.228740 | 1.079412 | Male | 40.433887 | 3.117739 | 0.937746 | Laboratory Technician | 1.697330 | Married | 3254.601575 | 17675.541599 | 10.118125 | NaN | NaN | 11.920147 | 3.282183 | NaN | 72.670937 | 0.953012 | 6.241733 | 2.814719 | 3.112317 | 2.040124 | 1.970786 | 2.495949 | 2.219353 |
# Number of missing (NaN) entries in each column.
missing_per_column = data.isna().sum()
print("\nMissing Values:")
print(missing_per_column)
Missing Values: Age 147 Attrition 147 BusinessTravel 147 DailyRate 147 Department 147 DistanceFromHome 147 Education 147 EducationField 147 EmployeeCount 147 EmployeeNumber 147 EnvironmentSatisfaction 147 Gender 147 HourlyRate 147 JobInvolvement 147 JobLevel 147 JobRole 147 JobSatisfaction 147 MaritalStatus 147 MonthlyIncome 147 MonthlyRate 147 NumCompaniesWorked 147 Over18 147 OverTime 147 PercentSalaryHike 147 PerformanceRating 147 RelationshipSatisfaction 147 StandardHours 147 StockOptionLevel 147 TotalWorkingYears 147 TrainingTimesLastYear 147 WorkLifeBalance 147 YearsAtCompany 147 YearsInCurrentRole 147 YearsSinceLastPromotion 147 YearsWithCurrManager 147 dtype: int64
# (rows, columns) of the frame as loaded.
data.shape
(1470, 35)
# Listwise deletion: keep only the rows that have no missing value in any
# column, then report how many rows survive.
data_cleaned = data.dropna(axis=0, how='any')
print("New Data Shape:", data_cleaned.shape)
New Data Shape: (40, 35)
# Impute missing values column by column, modifying `data` itself:
#   * numeric columns -> the column mean
#   * object (string) columns -> the most frequent value (first mode)
#
# The filled column is assigned back to the frame. Calling
# `data[col].fillna(..., inplace=True)` operates on the intermediate
# `data[col]` selection, which pandas 2.x warns about and which does not
# update `data` at all once Copy-on-Write is enabled.
for col in data.select_dtypes(include=np.number).columns:
    data[col] = data[col].fillna(data[col].mean())

for col in data.select_dtypes(include='object').columns:
    modes = data[col].mode()
    # mode() is empty when the column holds no non-missing value; there is
    # nothing to impute with in that case, so leave the column unchanged.
    if not modes.empty:
        data[col] = data[col].fillna(modes[0])

print("Data Shape after Imputation:", data.shape)
Data Shape after Imputation: (1470, 35)
# Total count of NaN cells across the whole frame (sum of the per-column counts).
remaining_missing = data.isna().sum().sum()
print("Missing values after imputation:", remaining_missing)
Missing values after imputation: 0
# Split the column names by dtype: numeric columns vs. object (string) columns.
numerical_features = list(data.select_dtypes(include=np.number).columns)
categorical_features = list(data.select_dtypes(include='object').columns)

for label, names in (
    ("Numerical features:", numerical_features),
    ("Categorical features:", categorical_features),
):
    print(label, names)
Numerical features: ['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'] Categorical features: ['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']
# One figure per numeric column: a histogram with a KDE curve overlaid.
for col in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(data[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()
# One figure per categorical column: a bar chart of how often each value occurs.
for col in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x=col, data=data, ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Tilt the category labels so longer names do not overlap.
    plt.xticks(rotation=45)
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Build the model-ready matrix: standardise every numeric column and one-hot
# encode every categorical column.
#
# sparse_threshold=0 makes ColumnTransformer always return a dense ndarray.
# With the default threshold the stacked result becomes a SciPy sparse matrix
# whenever its overall density is low enough, and the DataFrame construction
# below (which passes explicit column names) needs a dense 2-D array.
#
# NOTE(review): `categorical_features` is "every object column", so any target
# column stored as strings is encoded here as well -- confirm it is removed
# from the feature matrix before fitting a supervised model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)
data_processed = preprocessor.fit_transform(data)

# Output columns follow the transformer order: the numeric names first, then
# the names the fitted encoder generated for its one-hot columns.
cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numerical_features + cat_features.tolist()

# Reuse `data`'s index so rows of the processed frame line up with the source rows.
data_processed_df = pd.DataFrame(data_processed, columns=all_features, index=data.index)
print(data_processed_df.describe())
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager Attrition_No Attrition_Yes BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager \ count 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 mean 4.326094e-16 9.667248e-18 1.377583e-16 -1.885113e-16 7.540454e-16 1.546760e-16 1.933450e-17 -8.700523e-17 2.477232e-16 2.392644e-16 -2.513485e-16 -3.746059e-17 0.000000e+00 -5.075305e-17 8.338001e-17 1.595096e-16 -2.368476e-16 1.457338e-15 -1.933450e-17 -9.667248e-18 1.413835e-16 7.250436e-18 1.836777e-16 -8.338001e-17 5.316986e-17 -3.456041e-16 0.824490 0.175510 0.104762 0.189796 0.705442 0.072109 0.660544 0.267347 0.033333 0.442857 0.101361 0.270068 0.056463 0.095918 0.367347 0.632653 0.080272 
0.040816 0.155782 0.068707 std 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 0.380532 0.380532 0.306351 0.392273 0.455999 0.258756 0.473686 0.442726 0.179567 0.496893 0.301908 0.444146 0.230891 0.294579 0.482246 0.482246 0.271806 0.197932 0.362772 0.253042 min -2.357218e+00 -1.821540e+00 -1.075524e+00 -2.117518e+00 -3.918668e+00 -1.771575e+00 -1.835884e+00 -2.052649e+00 -2.783746e+00 -1.256666e+00 -1.830835e+00 -1.250643e+00 -1.856289e+00 -1.115619e+00 -2.008590e+00 -2.286948e+00 -1.889588e+00 -3.476389e+00 -9.954557e-01 -1.491948e+00 -2.230906e+00 -2.718147e+00 -1.209143e+00 -1.220314e+00 -7.072677e-01 -1.208127e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 25% -6.698885e-01 -7.624073e-01 -8.801388e-01 -7.106715e-01 -6.436999e-01 -8.417690e-01 -7.190978e-01 -8.163050e-01 -6.644049e-01 -9.287072e-01 -7.166879e-01 -7.448754e-01 -8.133326e-01 -7.092988e-01 -7.476689e-01 -6.092411e-01 -7.110232e-01 -6.288545e-01 -9.954557e-01 -6.643384e-01 -6.341142e-01 -6.566636e-01 -6.900668e-01 -6.737752e-01 -7.072677e-01 -6.762744e-01 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 50% -3.979301e-02 2.895478e-16 -1.572434e-01 -4.367532e-16 1.174756e-15 0.000000e+00 0.000000e+00 0.000000e+00 5.007715e-02 -5.525191e-02 1.392373e-02 -2.320246e-01 2.638598e-16 -2.457437e-01 -7.684195e-02 -7.601347e-02 2.714607e-02 1.867891e-15 5.094126e-02 -1.828699e-01 0.000000e+00 6.701606e-02 -1.713352e-01 -2.132420e-01 -3.683742e-01 -2.082280e-01 
1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 75% 5.420049e-01 7.616039e-01 3.138841e-01 6.636076e-01 6.382783e-01 7.653682e-01 7.239376e-01 7.769248e-01 5.457430e-01 2.357327e-01 7.701983e-01 2.309499e-01 7.748776e-01 4.373216e-01 4.802629e-01 3.401735e-01 7.136293e-01 6.185459e-01 2.966171e-01 3.619964e-01 3.578518e-01 5.629550e-01 3.250412e-01 7.273780e-01 0.000000e+00 7.691807e-01 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 max 3.496479e+00 2.501973e+00 3.171174e+00 3.306181e+00 3.082195e+00 2.598293e+00 2.359958e+00 2.826719e+00 3.442032e+00 3.634052e+00 2.331904e+00 3.608925e+00 2.667246e+00 3.446954e+00 3.827049e+00 4.129180e+00 2.241205e+00 3.566970e+00 3.249407e+00 4.728800e+00 3.722291e+00 2.897504e+00 5.216790e+00 4.283122e+00 4.946940e+00 4.652810e+00 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes count 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 mean 0.082993 0.054422 0.174830 0.291156 0.051020 0.202721 0.505442 0.291837 1.0 0.731973 0.268027 std 0.275966 0.226925 0.379951 0.454450 0.220114 0.402163 0.500141 0.454762 0.0 0.443083 0.443083 min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0 0.000000 0.000000 25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0 0.000000 0.000000 50% 0.000000 
0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.0 1.000000 0.000000 75% 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 1.0 1.000000 1.000000 max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.0 1.000000 1.000000
# Render floats with two decimal places from here on, then show the summary
# of the processed frame again in that format.
pd.options.display.float_format = '{:.2f}'.format
data_processed_df.describe()
Age | DailyRate | DistanceFromHome | Education | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | HourlyRate | JobInvolvement | JobLevel | JobSatisfaction | MonthlyIncome | MonthlyRate | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | Attrition_No | Attrition_Yes | BusinessTravel_Non-Travel | BusinessTravel_Travel_Frequently | BusinessTravel_Travel_Rarely | Department_Human Resources | Department_Research & Development | Department_Sales | EducationField_Human Resources | EducationField_Life Sciences | EducationField_Marketing | EducationField_Medical | EducationField_Other | EducationField_Technical Degree | Gender_Female | Gender_Male | JobRole_Healthcare Representative | JobRole_Human Resources | JobRole_Laboratory Technician | JobRole_Manager | JobRole_Manufacturing Director | JobRole_Research Director | JobRole_Research Scientist | JobRole_Sales Executive | JobRole_Sales Representative | MaritalStatus_Divorced | MaritalStatus_Married | MaritalStatus_Single | Over18_Y | OverTime_No | OverTime_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 |
mean | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | 0.00 | -0.00 | -0.00 | 0.00 | -0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | -0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | 0.82 | 0.18 | 0.10 | 0.19 | 0.71 | 0.07 | 0.66 | 0.27 | 0.03 | 0.44 | 0.10 | 0.27 | 0.06 | 0.10 | 0.37 | 0.63 | 0.08 | 0.04 | 0.16 | 0.07 | 0.08 | 0.05 | 0.17 | 0.29 | 0.05 | 0.20 | 0.51 | 0.29 | 1.00 | 0.73 | 0.27 |
std | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.38 | 0.38 | 0.31 | 0.39 | 0.46 | 0.26 | 0.47 | 0.44 | 0.18 | 0.50 | 0.30 | 0.44 | 0.23 | 0.29 | 0.48 | 0.48 | 0.27 | 0.20 | 0.36 | 0.25 | 0.28 | 0.23 | 0.38 | 0.45 | 0.22 | 0.40 | 0.50 | 0.45 | 0.00 | 0.44 | 0.44 |
min | -2.36 | -1.82 | -1.08 | -2.12 | -3.92 | -1.77 | -1.84 | -2.05 | -2.78 | -1.26 | -1.83 | -1.25 | -1.86 | -1.12 | -2.01 | -2.29 | -1.89 | -3.48 | -1.00 | -1.49 | -2.23 | -2.72 | -1.21 | -1.22 | -0.71 | -1.21 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 0.00 |
25% | -0.67 | -0.76 | -0.88 | -0.71 | -0.64 | -0.84 | -0.72 | -0.82 | -0.66 | -0.93 | -0.72 | -0.74 | -0.81 | -0.71 | -0.75 | -0.61 | -0.71 | -0.63 | -1.00 | -0.66 | -0.63 | -0.66 | -0.69 | -0.67 | -0.71 | -0.68 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 0.00 |
50% | -0.04 | 0.00 | -0.16 | -0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.05 | -0.06 | 0.01 | -0.23 | 0.00 | -0.25 | -0.08 | -0.08 | 0.03 | 0.00 | 0.05 | -0.18 | 0.00 | 0.07 | -0.17 | -0.21 | -0.37 | -0.21 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 1.00 | 1.00 | 0.00 |
75% | 0.54 | 0.76 | 0.31 | 0.66 | 0.64 | 0.77 | 0.72 | 0.78 | 0.55 | 0.24 | 0.77 | 0.23 | 0.77 | 0.44 | 0.48 | 0.34 | 0.71 | 0.62 | 0.30 | 0.36 | 0.36 | 0.56 | 0.33 | 0.73 | 0.00 | 0.77 | 1.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 1.00 | 1.00 | 0.00 | 1.00 | 0.00 | 1.00 | 0.00 | 0.00 | 1.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
max | 3.50 | 2.50 | 3.17 | 3.31 | 3.08 | 2.60 | 2.36 | 2.83 | 3.44 | 3.63 | 2.33 | 3.61 | 2.67 | 3.45 | 3.83 | 4.13 | 2.24 | 3.57 | 3.25 | 4.73 | 3.72 | 2.90 | 5.22 | 4.28 | 4.95 | 4.65 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
# Summarise only a handful of the processed columns.
selected_columns = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'MonthlyIncome',
    'Attrition_No',
    'Attrition_Yes',
]
data_processed_df.loc[:, selected_columns].describe()
Age | DailyRate | DistanceFromHome | MonthlyIncome | Attrition_No | Attrition_Yes | |
---|---|---|---|---|---|---|
count | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 | 1470.00 |
mean | 0.00 | 0.00 | 0.00 | -0.00 | 0.82 | 0.18 |
std | 1.00 | 1.00 | 1.00 | 1.00 | 0.38 | 0.38 |
min | -2.36 | -1.82 | -1.08 | -1.25 | 0.00 | 0.00 |
25% | -0.67 | -0.76 | -0.88 | -0.74 | 1.00 | 0.00 |
50% | -0.04 | 0.00 | -0.16 | -0.23 | 1.00 | 0.00 |
75% | 0.54 | 0.76 | 0.31 | 0.23 | 1.00 | 0.00 |
max | 3.50 | 2.50 | 3.17 | 3.61 | 1.00 | 1.00 |
import seaborn as sns
import matplotlib.pyplot as plt
# Age Distribution: histogram with a KDE curve overlaid
sns.histplot(data['Age'], kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Job Role Distribution: horizontal bars, one per role
# (this plot was previously drawn twice in a row; one copy is enough)
plt.figure(figsize=(10, 6))
sns.countplot(data=data, y='JobRole')
plt.title('Job Role Distribution')
plt.xlabel('Count')
plt.ylabel('Job Role')
plt.show()

# Monthly Income Distribution: histogram with a KDE curve overlaid
sns.histplot(data['MonthlyIncome'], kde=True)
plt.title('Monthly Income Distribution')
plt.xlabel('Monthly Income')
plt.ylabel('Count')
plt.show()

# Attrition Rate: number of rows per Attrition value
sns.countplot(data=data, x='Attrition')
plt.title('Attrition Rate')
plt.xlabel('Attrition')
plt.ylabel('Count')
plt.show()
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt

# Class labels for the supervised projection: the Attrition column of `data`.
class_labels = data['Attrition']

# The processed frame also holds the one-hot encoded target
# (columns named 'Attrition_<value>'). Drop them so the projection is learned
# from the predictors only and not from the label itself.
target_columns = [name for name in data_processed_df.columns if name.startswith('Attrition_')]
lda_features = data_processed_df.drop(columns=target_columns).to_numpy()

# A single discriminant axis is requested.
lda = LDA(n_components=1)
data_lda = lda.fit_transform(lda_features, class_labels)

# Overlay the LD1 scores of the two classes; plain boolean arrays are used to
# index the NumPy result.
yes_mask = (class_labels == 'Yes').to_numpy()
no_mask = (class_labels == 'No').to_numpy()
plt.hist(data_lda[yes_mask, 0], color='red', alpha=0.5, label='Attrition Yes')
plt.hist(data_lda[no_mask, 0], color='blue', alpha=0.5, label='Attrition No')
plt.title('LDA with 1 Component')
plt.xlabel('LD1')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# First ten projected values.
print(data_lda[:10])
[[ 2.20817721] [ 0.41066158] [ 1.61055669] [ 0.96858941] [ 1.04417906] [ 0.79475395] [ 0.56639182] [ 0.40650383] [ 0.99770173] [-0.33157394]]
from sklearn.decomposition import PCA

# Labels used only to colour the scatter plot: the Attrition column of `data`.
class_labels = data['Attrition']

# Leave the one-hot encoded target (columns named 'Attrition_<value>') out of
# the PCA input; otherwise the components are partly built from the very label
# the points are coloured by.
target_columns = [name for name in data_processed_df.columns if name.startswith('Attrition_')]
pca_features = data_processed_df.drop(columns=target_columns).to_numpy()

# Project onto the first two principal components.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(pca_features)

plt.figure(figsize=(8, 6))
# Colour encodes the label: 'Yes' -> 1, 'No' -> 0.
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=class_labels.map({'Yes': 1, 'No': 0}), cmap='rainbow', alpha=0.7)
plt.title('PCA with 2 Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.show()
Statistical Analysis
- **Descriptive Statistics:** Highlight key insights, such as average monthly income, typical job satisfaction scores, and other notable findings.
- **Correlation Analysis:** Point out any strong correlations that might influence employee satisfaction or productivity.
- **Hypothesis Testing:** Discuss whether the t-test and ANOVA results reveal significant differences that the company should consider.
- **Regression Analysis:** Explain how factors like years at the company are predictive of monthly income.
- **Use Clear Visualizations:** Include graphs and charts that clearly demonstrate your findings.
- **Use Appropriate Terminology:** Explain statistical terms in a way that's accessible to stakeholders without a statistical background.