#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().system('pip install pandas numpy matplotlib seaborn')


# In[2]:

# Import the core analysis and plotting libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# In[3]:

# Widen pandas display limits so wide frames print fully
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)


# In[4]:

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


# In[5]:

file_path = 'Employee_Attrition.csv'
data = pd.read_csv(file_path)


# In[6]:

print(data.head())


# In[7]:

print("\nBasic Descriptive Statistics:")
print(data.describe())


# In[8]:

# Transpose the preview so every column fits on screen
print(data.head().T)


# In[9]:

from IPython.display import display
display(data.describe())


# In[10]:

display(data.head())


# In[11]:

print("\nMissing Values:")
print(data.isnull().sum())


# In[12]:

data.shape


# In[13]:

# Row-dropping alternative, kept for comparison with the imputation below
data_cleaned = data.dropna()
print("New Data Shape:", data_cleaned.shape)


# In[14]:

# Impute numeric columns with the mean and categorical columns with the mode.
# Assigning back avoids the chained-assignment pitfall of fillna(..., inplace=True).
for col in data.select_dtypes(include=np.number).columns:
    data[col] = data[col].fillna(data[col].mean())
for col in data.select_dtypes(include='object').columns:
    data[col] = data[col].fillna(data[col].mode()[0])
print("Data Shape after Imputation:", data.shape)


# In[15]:

print("Missing values after imputation:", data.isnull().sum().sum())


# In[18]:

numerical_features = data.select_dtypes(include=np.number).columns.tolist()
categorical_features = data.select_dtypes(include='object').columns.tolist()
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)


# In[22]:

# Histogram (with KDE) for every numeric column
for col in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.histplot(data[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()


# In[24]:

# Count plot for every categorical column
for col in categorical_features:
    plt.figure(figsize=(8, 4))
    sns.countplot(x=col, data=data)
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45)
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()


# In[25]:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Scale numeric columns and one-hot encode categoricals in one step.
# sparse_output=False (scikit-learn >= 1.2) guarantees a dense array,
# which the DataFrame construction below requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)
    ])

data_processed = preprocessor.fit_transform(data)


# In[29]:

cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_features = numerical_features + cat_features.tolist()
data_processed_df = pd.DataFrame(data_processed, columns=all_features)


# In[30]:

print(data_processed_df.describe())


# In[31]:

pd.set_option('display.float_format', lambda x: '%.2f' % x)
data_processed_df.describe()


# In[32]:

selected_columns = ['Age', 'DailyRate', 'DistanceFromHome', 'MonthlyIncome', 'Attrition_No', 'Attrition_Yes']
data_processed_df[selected_columns].describe()


# In[33]:

# Age Distribution
sns.histplot(data['Age'], kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()


# In[34]:

# Job Role Distribution
plt.figure(figsize=(10, 6))
sns.countplot(data=data, y='JobRole')
plt.title('Job Role Distribution')
plt.xlabel('Count')
plt.ylabel('Job Role')
plt.show()


# In[36]:

# Monthly Income Distribution
sns.histplot(data['MonthlyIncome'], kde=True)
plt.title('Monthly Income Distribution')
plt.xlabel('Monthly Income')
plt.ylabel('Count')
plt.show()
# In[39]:

# Attrition Rate
sns.countplot(data=data, x='Attrition')
plt.title('Attrition Rate')
plt.xlabel('Attrition')
plt.ylabel('Count')
plt.show()


# In[40]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


# In[42]:

# The Attrition column supplies the class labels for the supervised
# projection; the original notebook used class_labels without defining it.
class_labels = data['Attrition']

lda = LDA(n_components=1)
data_lda = lda.fit_transform(data_processed, class_labels)

# Note: data_processed still contains the one-hot encoded Attrition columns,
# so the class separation shown here is inflated by that leakage.
plt.hist(data_lda[class_labels == 'Yes'], color='red', alpha=0.5, label='Attrition Yes')
plt.hist(data_lda[class_labels == 'No'], color='blue', alpha=0.5, label='Attrition No')
plt.title('LDA with 1 Component')
plt.xlabel('LD1')
plt.ylabel('Frequency')
plt.legend()
plt.show()


# In[44]:

print(data_lda[:10])


# In[45]:

from sklearn.decomposition import PCA


# In[46]:

pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_processed)


# In[47]:

plt.figure(figsize=(8, 6))
plt.scatter(data_pca[:, 0], data_pca[:, 1],
            c=class_labels.map({'Yes': 1, 'No': 0}), cmap='rainbow', alpha=0.7)
plt.title('PCA with 2 Components')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.colorbar()
plt.show()


# Statistical Analysis
#
# Descriptive Statistics: Highlight key insights, such as average monthly income, typical job satisfaction scores, and other notable findings.
# Correlation Analysis: Point out any strong correlations that might influence employee satisfaction or productivity (see the first sketch below).
# Hypothesis Testing: Discuss whether the t-test and ANOVA results reveal significant differences that the company should consider (see the second sketch below).
# Regression Analysis: Explain how factors like years at the company are predictive of monthly income (see the third sketch below).
# Use Clear Visualizations: Include graphs and charts that clearly demonstrate your findings.
# Use Appropriate Terminology: Explain statistical terms in a way that's accessible to stakeholders without a statistical background.
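# In[48]:

# Correlation analysis -- a minimal sketch of the step described above, using
# the numerical_features list already built in this notebook. The heatmap and
# the top-pairs printout are illustrative choices, not the notebook's original
# code; note that each pair appears twice in the unstacked matrix.
corr_matrix = data[numerical_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

# Surface the strongest absolute correlations, excluding self-correlations
corr_pairs = corr_matrix.unstack().sort_values(key=abs, ascending=False)
print(corr_pairs[corr_pairs < 1.0].head(10))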
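# In[49]:

# Hypothesis testing -- a minimal sketch, assuming the Attrition, MonthlyIncome,
# and JobRole columns used elsewhere in this notebook. The Welch t-test compares
# monthly income between leavers and stayers; the one-way ANOVA asks whether
# mean income differs across job roles.
from scipy import stats

income_yes = data.loc[data['Attrition'] == 'Yes', 'MonthlyIncome']
income_no = data.loc[data['Attrition'] == 'No', 'MonthlyIncome']
t_stat, t_p = stats.ttest_ind(income_yes, income_no, equal_var=False)
print(f"Welch t-test (income by attrition): t = {t_stat:.2f}, p = {t_p:.4f}")

role_groups = [grp['MonthlyIncome'] for _, grp in data.groupby('JobRole')]
f_stat, f_p = stats.f_oneway(*role_groups)
print(f"One-way ANOVA (income by job role): F = {f_stat:.2f}, p = {f_p:.4f}")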
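# In[50]:

# Regression analysis -- a minimal sketch of predicting monthly income from
# tenure, as the narrative above suggests. 'YearsAtCompany' is an assumed
# column name (the notebook never prints it); swap in the actual column if
# this dataset names it differently.
from sklearn.linear_model import LinearRegression

X = data[['YearsAtCompany']]
y = data['MonthlyIncome']

reg = LinearRegression().fit(X, y)
print(f"Slope (income per extra year): {reg.coef_[0]:.2f}")
print(f"Intercept: {reg.intercept_:.2f}")
print(f"R^2: {reg.score(X, y):.3f}")

# Overlay the fitted line on the raw points
plt.scatter(X, y, alpha=0.3)
plt.plot(X, reg.predict(X), color='red')
plt.title('Monthly Income vs. Years at Company')
plt.xlabel('Years at Company')
plt.ylabel('Monthly Income')
plt.show()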