import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import ydata_profiling
from IPython.display import display
from IPython.utils.capture import capture_output
from ydata_profiling.utils.cache import cache_file
# Fetch the NASA meteorites CSV through cache_file and load the returned path
# into a DataFrame.
file_name = cache_file(
    "meteorites.csv",
    "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
)
df = pd.read_csv(file_name)

# Note: pandas' default nanosecond-resolution timestamps cannot represent dates
# before 1677-09-21, so such years (and anything unparseable) are coerced to
# NaT by errors="coerce" and are effectively ignored in this analysis.
df["year"] = pd.to_datetime(df["year"], errors="coerce")

# Example: Constant variable
df["source"] = "NASA"

# Example: Boolean variable
df["boolean"] = np.random.choice([True, False], df.shape[0])

# Example: Mixed with base types
# NOTE(review): NumPy promotes the list [1, "A"] to a string array, so this
# column actually holds the strings "1" and "A" rather than mixed Python
# types -- confirm whether genuinely mixed values are wanted here.
df["mixed"] = np.random.choice([1, "A"], df.shape[0])

# Example: Highly correlated variables (reclat plus Gaussian noise)
df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=len(df))

# Example: Duplicate observations
# Take an explicit copy of the first ten rows so that rewriting "name" works on
# independent data and can never touch (or warn about) a slice of df.
duplicates_to_add = df.iloc[0:10].copy()
duplicates_to_add["name"] = duplicates_to_add["name"] + " copy"
df = pd.concat([df, duplicates_to_add], ignore_index=True)
# Build a minimal report without progress bars and display it inline (nothing
# is saved to disk), capturing everything IPython would have shown.
with capture_output() as out:
    pr = df.profile_report(
        minimal=True,
        progress_bar=False,
        sort=None,
        html={"style": {"full_width": True}},
    )
    display(pr)

# Exactly two outputs are expected: the first has the plain-text repr of an
# IPython HTML object, the second has an empty plain-text repr.
assert len(out.outputs) == 2
assert [captured.data["text/plain"] for captured in out.outputs] == [
    "<IPython.core.display.HTML object>",
    "",
]
# There should also be 2 progress bars in minimal mode: build the report with
# lazy=False and progress_bar=True while capturing the output.
with capture_output() as out:
    pfr = df.profile_report(
        minimal=True,
        lazy=False,
        progress_bar=True,
        html={"style": {"full_width": True}},
    )

# Every captured output must contain one of the two progress-bar markers
# (presumably "%|" for a text bar and "FloatProgress" for a widget bar).
assert all(
    "%|" in captured.data["text/plain"]
    or "FloatProgress" in captured.data["text/plain"]
    for captured in out.outputs
)
assert len(out.outputs) == 2
# Write the report to an HTML file in the platform's temporary directory.
# A hard-coded "/tmp" does not exist on every OS and ignores TMPDIR; on a
# default Linux setup this still resolves to /tmp/example.html.
with capture_output() as out:
    pfr.to_file(Path(tempfile.gettempdir()) / "example.html")

# Writing the file is expected to emit exactly 2 outputs, each containing one
# of the two progress-bar markers.
assert all(
    any(v in s.data["text/plain"] for v in ["%|", "FloatProgress"]) for s in out.outputs
)
assert len(out.outputs) == 2
# Display the already-built ProfileReport object inline and capture the output.
with capture_output() as out:
    display(pfr)

# Exactly two outputs are expected: the first has the plain-text repr of an
# IPython HTML object, the second has an empty plain-text repr.
assert len(out.outputs) == 2
assert [captured.data["text/plain"] for captured in out.outputs] == [
    "<IPython.core.display.HTML object>",
    "",
]