In this post I will explore the dataset from Kaggle's "What's Cooking?" competition to understand what kind of data we are dealing with. Getting to know your data helps you clean it up during preprocessing and prepare the dataset for classification.
import pandas as pd
# Load the training recipes from the JSON file in the working directory
# and preview the first five rows.
df_train = pd.read_json("train.json")
df_train.head(5)
%matplotlib inline
import matplotlib.pyplot as plt
# Use the FiveThirtyEight look for every figure drawn from here on.
plt.style.use("fivethirtyeight")

# Horizontal bars: number of recipes per cuisine, most frequent cuisine first
# in the counts (the trailing semicolon suppresses the notebook's text output).
df_train["cuisine"].value_counts().plot.barh(figsize=(8, 6));
from collections import Counter
# Map each cuisine to a Counter of how often every ingredient appears
# across that cuisine's recipes.
counters = {}
for cuisine in df_train["cuisine"].unique():
    # Ingredient lists of the recipes that belong to this cuisine.
    recipes = df_train.loc[df_train["cuisine"] == cuisine, "ingredients"]
    # Flatten the per-recipe lists so each ingredient occurrence is counted once.
    counters[cuisine] = Counter(name for recipe in recipes for name in recipe)

# Spot check: the ten most frequent ingredients in Italian recipes.
counters["italian"].most_common(10)
# One row per cuisine, one column per rank (top1 .. top10); each cell holds
# the name of the ingredient at that rank for the cuisine.
top10 = pd.DataFrame(
    data=[
        [name for name, _count in counter.most_common(10)]
        for counter in counters.values()
    ],
    index=list(counters),
    columns=[f"top{rank}" for rank in range(1, 11)],
)
display(top10.head(8))
# Collapse every recipe's ingredient list into one ";"-separated string so
# that ingredients can be searched with plain string operations.
df_train["all_ingredients"] = [
    ";".join(items) for items in df_train["ingredients"]
]
df_train.head()
# Boolean mask of the recipes whose joined ingredient string mentions
# "garlic cloves".
indices = df_train["all_ingredients"].str.contains("garlic cloves")

# Absolute number of such recipes per cuisine.
df_train.loc[indices, "cuisine"].value_counts().plot.barh(
    title="garlic cloves per cuisine", figsize=(8, 6)
);

# Share of each cuisine's recipes that mention the ingredient, in ascending
# order so the largest share ends up at the top of the horizontal bar chart.
relative_freq = (
    df_train.loc[indices, "cuisine"]
    .value_counts()
    .div(df_train["cuisine"].value_counts())
    .sort_values()
)
relative_freq.plot.barh(figsize=(8, 6));
import numpy as np
# Sorted array of the distinct ingredient names that appear anywhere in the
# per-cuisine top-10 table.
unique = np.unique(top10.values.flatten())
unique
# Draw one panel per ingredient showing, for every cuisine, the share of its
# recipes that mention the ingredient. Only as many ingredients as there are
# panels in the grid are plotted (the first ones in `unique`).
fig, axes = plt.subplots(2, 2, figsize=(20, 20))

# Total recipes per cuisine: the denominator is the same for every
# ingredient, so compute it once instead of once per loop iteration.
cuisine_totals = df_train["cuisine"].value_counts()

# Pair ingredients with axes directly so the number of plotted ingredients
# always follows the grid size rather than a separately hard-coded count.
for ingredient, ax in zip(unique, axes.ravel()):
    # regex=False: ingredient names come from the data and must be matched
    # literally; characters such as "(" or "+" would otherwise be read as
    # regular-expression syntax.
    # NOTE(review): this is still a substring match on the joined string, so
    # a short name also matches longer ingredient names that contain it.
    indices = df_train["all_ingredients"].str.contains(ingredient, regex=False)
    # Cuisines without any matching recipe come out as NaN after the division.
    relative_freq = df_train.loc[indices, "cuisine"].value_counts() / cuisine_totals
    relative_freq.plot(kind="barh", ax=ax, fontsize=24, title=ingredient);
# Ensure this cell has remove_input tag added (to hide it in blog post text)
# Ensure this cell has remove_input tag added (to hide it in blog post text)
from IPython.core.display import HTML
def css_styling():
    """Return the custom notebook stylesheet wrapped for notebook display.

    Reads ``../../styles/notebook_custom_style.css`` (resolved against the
    current working directory) and passes its text to IPython's ``HTML``.

    Returns:
        The ``HTML`` object built from the file's contents.

    Raises:
        OSError: If the stylesheet cannot be opened or read.
    """
    # The context manager closes the file handle deterministically, even if
    # reading fails, instead of leaving it to the garbage collector.
    with open("../../styles/notebook_custom_style.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()