In this post I will explore the dataset related to Kaggle's competition "What's cooking?" to get an understanding of what kind of data we are dealing with. Getting to know your data helps to clean it up during preprocessing in order to prepare the dataset for classification.
import pandas as pd
# Load the training data from train.json into a DataFrame.
df_train = pd.read_json("train.json")
# Preview the first rows of the frame.
df_train.head()
cuisine | id | ingredients | |
---|---|---|---|
0 | greek | 10259 | [romaine lettuce, black olives, grape tomatoes... |
1 | southern_us | 25693 | [plain flour, ground pepper, salt, tomatoes, g... |
2 | filipino | 20130 | [eggs, pepper, salt, mayonaise, cooking oil, g... |
3 | indian | 22213 | [water, vegetable oil, wheat, salt] |
4 | indian | 13162 | [black pepper, shallots, cornflour, cayenne pe... |
%matplotlib inline
import matplotlib.pyplot as plt
# Apply the "fivethirtyeight" matplotlib style to the figures drawn from here on.
plt.style.use("fivethirtyeight")
# Horizontal bar chart of how many rows carry each cuisine label.
df_train["cuisine"].value_counts().plot.barh(figsize=(8, 6));
from collections import Counter
# Count ingredient occurrences separately for every cuisine:
# counters maps cuisine name -> Counter(ingredient name -> occurrences).
counters = {}
# Walk the rows once instead of re-filtering the whole frame per cuisine.
# Cuisines are still inserted in order of first appearance, and each Counter
# still sees its rows in frame order, so the results are unchanged.
for cuisine, ingredients in zip(df_train["cuisine"], df_train["ingredients"]):
    if cuisine not in counters:
        counters[cuisine] = Counter()
    counters[cuisine].update(ingredients)
# The ten most frequent ingredients among the rows labelled "italian".
counters["italian"].most_common(10)
[('salt', 3454), ('olive oil', 3111), ('garlic cloves', 1619), ('grated parmesan cheese', 1580), ('garlic', 1471), ('ground black pepper', 1444), ('extra-virgin olive oil', 1362), ('onions', 1240), ('water', 1052), ('butter', 1030)]
# One row per cuisine, one column per rank (top1 .. top10); each cell holds
# the name of the ingredient with that rank in the cuisine's Counter.
top10 = pd.DataFrame(
    [
        [name for name, _count in counter.most_common(10)]
        for counter in counters.values()
    ],
    index=list(counters),
    columns=["top" + str(rank) for rank in range(1, 11)],
)
# Show the first eight cuisines.
display(top10.head(8))
top1 | top2 | top3 | top4 | top5 | top6 | top7 | top8 | top9 | top10 | |
---|---|---|---|---|---|---|---|---|---|---|
greek | salt | olive oil | dried oregano | garlic cloves | feta cheese crumbles | extra-virgin olive oil | fresh lemon juice | ground black pepper | garlic | pepper |
southern_us | salt | butter | all-purpose flour | sugar | large eggs | baking powder | water | unsalted butter | milk | buttermilk |
filipino | salt | garlic | water | onions | soy sauce | pepper | oil | sugar | carrots | ground black pepper |
indian | salt | onions | garam masala | water | ground turmeric | garlic | cumin seed | ground cumin | vegetable oil | oil |
jamaican | salt | onions | water | garlic | ground allspice | pepper | scallions | dried thyme | black pepper | garlic cloves |
spanish | salt | olive oil | garlic cloves | extra-virgin olive oil | onions | water | tomatoes | ground black pepper | red bell pepper | pepper |
italian | salt | olive oil | garlic cloves | grated parmesan cheese | garlic | ground black pepper | extra-virgin olive oil | onions | water | butter |
mexican | salt | onions | ground cumin | garlic | olive oil | chili powder | jalapeno chilies | sour cream | avocado | corn tortillas |
# Flatten every ingredient list into a single ";"-separated string so that
# it can be searched with string methods.
df_train["all_ingredients"] = [
    ";".join(ingredient_list) for ingredient_list in df_train["ingredients"]
]
df_train.head()
cuisine | id | ingredients | all_ingredients | |
---|---|---|---|---|
0 | greek | 10259 | [romaine lettuce, black olives, grape tomatoes... | romaine lettuce;black olives;grape tomatoes;ga... |
1 | southern_us | 25693 | [plain flour, ground pepper, salt, tomatoes, g... | plain flour;ground pepper;salt;tomatoes;ground... |
2 | filipino | 20130 | [eggs, pepper, salt, mayonaise, cooking oil, g... | eggs;pepper;salt;mayonaise;cooking oil;green c... |
3 | indian | 22213 | [water, vegetable oil, wheat, salt] | water;vegetable oil;wheat;salt |
4 | indian | 13162 | [black pepper, shallots, cornflour, cayenne pe... | black pepper;shallots;cornflour;cayenne pepper... |
# Boolean mask of the rows whose joined ingredient string contains "garlic cloves".
indices = df_train["all_ingredients"].str.contains("garlic cloves")
# Horizontal bar chart of how many matching rows each cuisine has.
df_train.loc[indices, "cuisine"].value_counts().plot.barh(
    title="garlic cloves per cuisine", figsize=(8, 6)
);
# Matching rows per cuisine divided by all rows of that cuisine, sorted
# ascending so the plot is ordered by relative frequency.
relative_freq = (
    df_train.loc[indices, "cuisine"]
    .value_counts()
    .div(df_train["cuisine"].value_counts())
    .sort_values()
)
relative_freq.plot.barh(figsize=(8, 6));
import numpy as np
# Sorted, de-duplicated names of every ingredient that occurs anywhere in
# the top10 table.
unique = np.unique(top10.values.flatten())
unique
array(['all-purpose flour', 'avocado', 'baking powder', 'baking soda', 'black pepper', 'butter', 'buttermilk', 'cachaca', 'cajun seasoning', 'carrots', 'cayenne pepper', 'chili powder', 'coconut milk', 'corn starch', 'corn tortillas', 'cumin seed', 'dried oregano', 'dried thyme', 'eggs', 'extra-virgin olive oil', 'feta cheese crumbles', 'fish sauce', 'fresh lemon juice', 'fresh lime juice', 'garam masala', 'garlic', 'garlic cloves', 'ginger', 'grated parmesan cheese', 'green bell pepper', 'green onions', 'ground allspice', 'ground black pepper', 'ground cinnamon', 'ground cumin', 'ground ginger', 'ground turmeric', 'jalapeno chilies', 'large eggs', 'lime', 'milk', 'mirin', 'oil', 'olive oil', 'onions', 'paprika', 'pepper', 'potatoes', 'red bell pepper', 'rice vinegar', 'sake', 'salt', 'scallions', 'sesame oil', 'sesame seeds', 'shallots', 'sour cream', 'soy sauce', 'sugar', 'tomatoes', 'unsalted butter', 'vegetable oil', 'water'], dtype=object)
# For the first four ingredients in `unique`, plot per cuisine the fraction
# of rows whose joined ingredient string contains that ingredient.
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
# The number of rows per cuisine does not depend on the ingredient, so
# compute it once instead of on every iteration.
cuisine_totals = df_train["cuisine"].value_counts()
# zip stops at the shorter input, so only as many ingredients as there are
# axes (2 x 2 = 4) are plotted.
for ingredient, ax in zip(unique, axes.ravel()):
    # regex=False: ingredient names are literal text, not patterns.
    # NOTE: this is still a substring match, so an ingredient name also
    # matches longer names that contain it.
    indices = df_train["all_ingredients"].str.contains(ingredient, regex=False)
    relative_freq = df_train[indices]["cuisine"].value_counts() / cuisine_totals
    relative_freq.plot(kind="barh", ax=ax, fontsize=24, title=ingredient)
# Ensure this cell has remove_input tag added (to hide it in blog post text)
# Ensure this cell has remove_input tag added (to hide it in blog post text)
from IPython.core.display import HTML
def css_styling():
    """Return the custom notebook stylesheet wrapped in an IPython HTML object.

    Reads ``../../styles/notebook_custom_style.css``; the path is relative,
    so it is resolved against the current working directory.
    """
    # A context manager closes the file handle as soon as it has been read.
    with open("../../styles/notebook_custom_style.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()