#!/usr/bin/env python
# coding: utf-8

# # Pandas Profiling: HCC Dataset
# Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset
# 
# As modifications have been introduced for the purpose of this use case, the .csv file is provided (hcc.csv).

# ## Import libraries

# In[ ]:


import pandas as pd

from ydata_profiling import ProfileReport


# ## Load the dataset

# In[ ]:


# Load the HCC dataset from the CSV file that ships alongside this script.
df = pd.read_csv(filepath_or_buffer="hcc.csv")


# ## Produce and save the profiling report

# In[ ]:


# Profile the untouched data and write the report to an HTML file.
original_report = ProfileReport(df=df, title="Original Data")
original_report.to_file(output_file="original_report.html")


# ## Analysis of "Alerts"
# Pandas Profiling raises alerts for 4 potential data quality problems:
# 
# - `DUPLICATES`: 4 duplicate rows in data
# - `CONSTANT`: Constant value “999” in ‘O2’
# - `HIGH CORRELATION`: Several features marked as highly correlated
# - `MISSING`: Missing Values in ‘Ferritin’
# 
# 

# ### Removing Duplicate Rows

# In[ ]:


# Work on a copy so the original frame (`df`) is left as loaded, and discard
# the duplicated rows in the same step.
df_transformed = df.copy().drop_duplicates()


# ### Removing Irrelevant Features

# In[ ]:


# Discard the "O2" column (reported above as holding the constant value 999).
df_transformed = df_transformed.drop(labels="O2", axis="columns")


# ### Missing Data Imputation

# In[ ]:


# Fill the missing "Ferritin" entries with the mean of the column.
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy="mean")
# The imputer is given a 2-D input, so the column is selected with double
# brackets to get a single-column matrix.
df_transformed["Ferritin"] = mean_imputer.fit_transform(
    df_transformed[["Ferritin"]].to_numpy()
)


# ## Produce Comparison Report

# In[ ]:


# Profile the cleaned data, compare it against the original profile, and
# write the side-by-side report to an HTML file.
transformed_report = ProfileReport(df=df_transformed, title="Transformed Data")
comparison_report = original_report.compare(other=transformed_report)
comparison_report.to_file(output_file="original_vs_transformed.html")