%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from collections import Counter
from nltk.corpus import stopwords
import pprint
# Columns to load from menu.csv: three descriptive fields (category, item
# name, serving size) followed by the nutrition fields.
features = [
    'Category',
    'Item',
    'Serving Size',
    'Calories',
    'Calories from Fat',
    'Total Fat',
    'Saturated Fat',
    'Trans Fat',
    'Cholesterol',
    'Sodium',
    'Carbohydrates',
    'Dietary Fiber',
    'Sugars',
    'Protein',
    'Vitamin A (% Daily Value)',
    'Vitamin C (% Daily Value)',
    'Calcium (% Daily Value)',
    'Iron (% Daily Value)',
]

# header=0 takes the column names from the first row of the file;
# usecols restricts the load to the columns listed above.
dataset = pd.read_csv('menu.csv', header=0, usecols=features)

# Preview the first five rows (five is the default for head()).
dataset.head()
Category | Item | Serving Size | Calories | Calories from Fat | Total Fat | Saturated Fat | Trans Fat | Cholesterol | Sodium | Carbohydrates | Dietary Fiber | Sugars | Protein | Vitamin A (% Daily Value) | Vitamin C (% Daily Value) | Calcium (% Daily Value) | Iron (% Daily Value) | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Breakfast | Egg McMuffin | 4.8 oz (136 g) | 300 | 120 | 13.0 | 5.0 | 0.0 | 260 | 750 | 31 | 4 | 3 | 17 | 10 | 0 | 25 | 15 |
1 | Breakfast | Egg White Delight | 4.8 oz (135 g) | 250 | 70 | 8.0 | 3.0 | 0.0 | 25 | 770 | 30 | 4 | 3 | 18 | 6 | 0 | 25 | 8 |
2 | Breakfast | Sausage McMuffin | 3.9 oz (111 g) | 370 | 200 | 23.0 | 8.0 | 0.0 | 45 | 780 | 29 | 4 | 2 | 14 | 8 | 0 | 25 | 10 |
3 | Breakfast | Sausage McMuffin with Egg | 5.7 oz (161 g) | 450 | 250 | 28.0 | 10.0 | 0.0 | 285 | 860 | 30 | 4 | 2 | 21 | 15 | 0 | 30 | 15 |
4 | Breakfast | Sausage McMuffin with Egg Whites | 5.7 oz (161 g) | 400 | 210 | 23.0 | 8.0 | 0.0 | 50 | 880 | 30 | 4 | 2 | 21 | 6 | 0 | 25 | 10 |
# Work on the nutrition columns only; the three descriptive text columns
# are removed so that column-wise statistics can be computed.
df = dataset.drop(columns=["Category", "Item", "Serving Size"])

# Absolute z-score of every value relative to its own column
# (stats.zscore standardises along axis 0 by default).
z = np.abs(stats.zscore(df))

# Positions (row array, column array) of values lying more than 7 standard
# deviations from their column mean.
print(np.where(z > 7))

# Inspect row 82 before removing it.
print(dataset.iloc[82])

# Remove row 82 from both frames so that they stay aligned.
# NOTE: in the recorded run row 135 is also flagged by the check above,
# but it is deliberately left in place here; only row 82 is dropped.
dataset = dataset.drop(index=[82])
df = df.drop(index=[82])
(array([ 82, 82, 135]), array([ 1, 2, 12])) Category Chicken & Fish Item Chicken McNuggets (40 piece) Serving Size 22.8 oz (646 g) Calories 1880 Calories from Fat 1060 Total Fat 118 Saturated Fat 20 Trans Fat 1 Cholesterol 265 Sodium 3600 Carbohydrates 118 Dietary Fiber 6 Sugars 1 Protein 87 Vitamin A (% Daily Value) 0 Vitamin C (% Daily Value) 15 Calcium (% Daily Value) 8 Iron (% Daily Value) 25 Name: 82, dtype: object
# Standardise each numeric column: subtract the column mean and divide by
# the column standard deviation (DataFrame.std uses the sample estimate,
# ddof=1, by default).
normalized_df = (df - df.mean()) / df.std()

# Re-attach the category label so the plots can be coloured by it.
# The column is selected with single brackets so that a Series is assigned
# (aligned on the row index) rather than a one-column DataFrame.
normalized_df["Category"] = dataset["Category"]

# Preview the first five standardised rows.
normalized_df.head(5)
Calories | Calories from Fat | Total Fat | Saturated Fat | Trans Fat | Cholesterol | Sodium | Carbohydrates | Dietary Fiber | Sugars | Protein | Vitamin A (% Daily Value) | Vitamin C (% Daily Value) | Calcium (% Daily Value) | Iron (% Daily Value) | Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.281865 | -0.030612 | -0.060316 | -0.181297 | -0.470098 | 2.381337 | 0.488727 | -0.574834 | 1.542477 | -0.925115 | 0.376316 | -0.142577 | -0.322415 | 0.233469 | 0.845334 | Breakfast |
1 | -0.507601 | -0.468647 | -0.454811 | -0.561506 | -0.470098 | -0.336969 | 0.525441 | -0.610597 | 1.542477 | -0.925115 | 0.471683 | -0.306517 | -0.322415 | 0.233469 | 0.038283 | Breakfast |
2 | 0.034165 | 0.670245 | 0.728672 | 0.389017 | -0.470098 | -0.105624 | 0.543798 | -0.646360 | 1.542477 | -0.959981 | 0.090213 | -0.224547 | -0.322415 | 0.233469 | 0.268869 | Breakfast |
3 | 0.395343 | 1.108280 | 1.123166 | 0.769227 | -0.470098 | 2.670519 | 0.690654 | -0.610597 | 1.542477 | -0.959981 | 0.757787 | 0.062348 | -0.322415 | 0.527005 | 0.845334 | Breakfast |
4 | 0.169607 | 0.757852 | 0.728672 | 0.389017 | -0.470098 | -0.047788 | 0.727368 | -0.610597 | 1.542477 | -0.959981 | 0.757787 | -0.306517 | -0.322415 | 0.233469 | 0.268869 | Breakfast |
# Number of rows currently held in `dataset`.
dataset.shape[0]
259
# Pairwise correlation between the columns of `df`
# (DataFrame.corr computes Pearson correlation by default).
corr = df.corr()

# Draw the correlation matrix as a heat map with thin lines between cells.
sns.heatmap(
    corr,
    cmap="YlGnBu",
    linewidths=0.5,
)
<matplotlib.axes._subplots.AxesSubplot at 0x7f600524f4e0>
# Use seaborn's "ticks" theme for the figure below.
sns.set(style="ticks")

# Standardised calories and the Total Fat / Carbohydrates / Protein
# columns, plus the category label used for colouring.
plotData = normalized_df[
    ["Category", "Calories", "Total Fat", "Carbohydrates", "Protein"]
]

# Scatter-plot matrix of the selected columns, one colour per category.
sns.pairplot(data=plotData, hue="Category")
<seaborn.axisgrid.PairGrid at 0x7f600316bda0>
# 3-D scatter of Total Fat / Carbohydrates / Protein for every menu item,
# with the marker colour encoding Calories.
x = dataset["Total Fat"]
y = dataset["Carbohydrates"]
z = dataset["Protein"]  # NOTE: rebinds `z`, which previously held the z-score matrix
c = dataset["Calories"]

fig = plt.figure()
ax = plt.axes(projection='3d')
points = ax.scatter(x, y, z, c=c, cmap='viridis', linewidth=0.5)

# Label every axis and the colour scale; without these the four plotted
# quantities cannot be told apart in the rendered figure.
ax.set_xlabel(x.name)
ax.set_ylabel(y.name)
ax.set_zlabel(z.name)
fig.colorbar(points, ax=ax, label=c.name)

# Camera position: 25 degrees elevation, 55 degrees azimuth.
ax.view_init(elev=25, azim=55)
# Use seaborn's "ticks" theme for the figure below.
sns.set(style="ticks")

# Standardised Calories, Cholesterol, Sodium and Sugars columns, plus the
# category label used for colouring.
plotData = normalized_df[
    ["Category", "Calories", "Cholesterol", "Sodium", "Sugars"]
]

# Scatter-plot matrix of the selected columns, one colour per category.
sns.pairplot(data=plotData, hue="Category")
<seaborn.axisgrid.PairGrid at 0x7f60029d1828>