#!/usr/bin/env python
# coding: utf-8

# # Pandas Profiling: HCC Dataset
# Source of data: https://www.kaggle.com/datasets/mrsantos/hcc-dataset
#
# As modifications have been introduced for the purpose of this use case,
# the .csv file is provided (hcc.csv).

# ## Import libraries

# In[ ]:


import pandas as pd
from sklearn.impute import SimpleImputer
from ydata_profiling import ProfileReport


# ## Load the dataset

# In[ ]:


# Read the HCC Dataset
df = pd.read_csv("hcc.csv")


# ## Produce and save the profiling report

# In[ ]:


original_report = ProfileReport(df, title="Original Data")
original_report.to_file("original_report.html")


# ## Analysis of "Alerts"
# Pandas Profiling alerts for the presence of 4 potential data quality problems:
#
# - `DUPLICATES`: 4 duplicate rows in data
# - `CONSTANT`: Constant value “999” in ‘O2’
# - `HIGH CORRELATION`: Several features marked as highly correlated
# - `MISSING`: Missing Values in ‘Ferritin’
#
#

# ### Removing Duplicate Rows

# In[ ]:


# Drop duplicate rows.
# Work on a copy so that `df` (and the report built from it) stays untouched
# and can be compared against the transformed data at the end.
df_transformed = df.copy()
df_transformed = df_transformed.drop_duplicates()


# ### Removing Irrelevant Features

# In[ ]:


# Remove O2 (flagged as CONSTANT in the alerts above)
df_transformed = df_transformed.drop(columns="O2")


# ### Missing Data Imputation

# In[ ]:


# Impute Missing Values: replace missing 'Ferritin' entries with the column mean.
mean_imputer = SimpleImputer(strategy="mean")
# SimpleImputer expects a 2-D (n_samples, n_features) input and returns a 2-D
# array; flatten the single-column result so the assignment back into the
# DataFrame column is explicitly 1-D.
ferritin_2d = df_transformed["Ferritin"].to_numpy().reshape(-1, 1)
df_transformed["Ferritin"] = mean_imputer.fit_transform(ferritin_2d).ravel()


# ## Produce Comparison Report

# In[ ]:


transformed_report = ProfileReport(df_transformed, title="Transformed Data")
comparison_report = original_report.compare(transformed_report)
comparison_report.to_file("original_vs_transformed.html")