#!/usr/bin/env python
# coding: utf-8

# # Test notebook Meteorites
#
# Exported notebook that builds a small demo DataFrame from the NASA
# meteorite landings CSV, profiles it with ydata_profiling, and asserts on
# the IPython outputs captured while the report is created, written to disk
# and displayed.

# In[ ]:


import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from IPython.display import display
from IPython.utils.capture import capture_output

# NOTE(review): presumably imported for its side effect of making
# `DataFrame.profile_report` available - confirm before removing.
import ydata_profiling
from ydata_profiling.utils.cache import cache_file

# Substrings looked for in an output's plain-text representation to decide
# whether that output is a progress bar.
_PROGRESS_BAR_MARKERS = ("%|", "FloatProgress")


def _is_progress_bar(output):
    """Return True if a captured output's text/plain data looks like a progress bar."""
    text = output.data["text/plain"]
    return any(marker in text for marker in _PROGRESS_BAR_MARKERS)


# In[ ]:


file_name = cache_file(
    "meteorites.csv",
    "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
)

df = pd.read_csv(file_name)

# Local, seeded generator so the synthetic columns are identical on every run
# and the global NumPy random state is left untouched.
rng = np.random.default_rng(7331)

# Note: values pandas cannot represent as timestamps (with the default
# nanosecond resolution that is roughly anything outside 1677-2262) are
# coerced to NaT by errors="coerce", so they are ignored for this analysis.
df["year"] = pd.to_datetime(df["year"], errors="coerce")

# Example: Constant variable
df["source"] = "NASA"

# Example: Boolean variable
df["boolean"] = rng.choice([True, False], df.shape[0])

# Example: Mixed with base types
# NOTE(review): NumPy converts [1, "A"] to a string array before sampling, so
# this column only ever contains the strings "1" and "A". Kept as-is so the
# generated report does not change; use an object-dtype array if a genuinely
# mixed column is wanted.
df["mixed"] = rng.choice([1, "A"], df.shape[0])

# Example: Highly correlated variables
df["reclat_city"] = df["reclat"] + rng.normal(scale=5, size=len(df))

# Example: Duplicate observations
# Explicit copy so renaming the rows below never touches (or warns about) `df`.
duplicates_to_add = df.iloc[0:10].copy()
duplicates_to_add["name"] = duplicates_to_add["name"] + " copy"

df = pd.concat([df, duplicates_to_add], ignore_index=True)


# In[ ]:


# Inline report without saving
with capture_output() as out:
    pr = df.profile_report(
        sort=None,
        html={"style": {"full_width": True}},
        progress_bar=False,
        minimal=True,
    )
    display(pr)

assert len(out.outputs) == 2
assert out.outputs[0].data["text/plain"] == ""
assert out.outputs[1].data["text/plain"] == ""


# In[ ]:


# There should also be 2 progress bars in minimal mode
with capture_output() as out:
    pfr = df.profile_report(
        html={"style": {"full_width": True}},
        minimal=True,
        progress_bar=True,
        lazy=False,
    )

assert all(_is_progress_bar(output) for output in out.outputs)
assert len(out.outputs) == 2


# In[ ]:


# Write to a file
# A temporary directory keeps the test portable (no hard-coded /tmp) and
# removes the generated report once the cell has run.
with tempfile.TemporaryDirectory() as tmp_dir:
    with capture_output() as out:
        pfr.to_file(Path(tmp_dir) / "example.html")

assert all(_is_progress_bar(output) for output in out.outputs)
assert len(out.outputs) == 2


# In[ ]:


# Print existing ProfileReport object inline
with capture_output() as out:
    display(pfr)

assert len(out.outputs) == 2
assert out.outputs[0].data["text/plain"] == ""
assert out.outputs[1].data["text/plain"] == ""