#!/usr/bin/env python
# coding: utf-8
"""Exploratory analysis and k-means clustering of the nutrition data in menu.csv.

This is a notebook export; the ``# In[n]:`` markers are the original cells.
The script:

1. loads the nutrition columns of ``menu.csv``;
2. removes one outlier row and standardizes the numeric columns;
3. draws a correlation heatmap, pair plots and 3-D scatter plots;
4. sweeps k-means over 2..20 clusters, scoring each fit with the
   Davies-Bouldin index;
5. fits a final 4-cluster model, prints the items of every cluster and
   draws a heatmap of the standardized features ordered by cluster.

It must be run under IPython/Jupyter because of the ``%matplotlib`` magic.
"""

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from collections import Counter
from nltk.corpus import stopwords
import pprint

# Columns of menu.csv that carry no numeric nutrition value.
LABEL_COLUMNS = ["Category", "Item", "Serving Size"]

# Label of the row removed as an outlier.
# NOTE(review): presumably this is the row reported by the z-score check in
# cell 5 -- confirm against the printed output before changing the data file.
OUTLIER_ROW = 82

# Fixed seed and explicit restart count so that cluster labels and
# Davies-Bouldin scores are the same on every run and on every
# scikit-learn version (the default of ``n_init`` changed across releases).
RANDOM_STATE = 0
KMEANS_N_INIT = 10


# In[2]:


features = [
    'Category', 'Item', 'Serving Size', 'Calories', 'Calories from Fat',
    'Total Fat', 'Saturated Fat', 'Trans Fat', 'Cholesterol', 'Sodium',
    'Carbohydrates', 'Dietary Fiber', 'Sugars', 'Protein',
    'Vitamin A (% Daily Value)', 'Vitamin C (% Daily Value)',
    'Calcium (% Daily Value)', 'Iron (% Daily Value)',
]
dataset = pd.read_csv('menu.csv', header=0, usecols=features)


# In[3]:


dataset.head(5)


# In[4]:


# Numeric-only view of the data used for statistics and normalization.
df = dataset.drop(LABEL_COLUMNS, axis=1)


# In[5]:


# Absolute z-score of every value; print the positions of values lying more
# than 7 standard deviations from their column mean, then show and remove
# the outlier row from both frames so they stay aligned.
zScores = np.abs(stats.zscore(df))
print(np.where(zScores > 7))
print(dataset.iloc[OUTLIER_ROW])
dataset = dataset.drop([OUTLIER_ROW], axis=0)
df = df.drop([OUTLIER_ROW], axis=0)


# In[6]:


# Standardize each column to zero mean and unit standard deviation, then
# re-attach the category label (aligned on the row index) for colouring plots.
normalized_df = (df - df.mean()) / df.std()
normalized_df["Category"] = dataset["Category"]
normalized_df.head(5)


# In[7]:


len(dataset)


# In[8]:


corr = df.corr()
sns.heatmap(corr, linewidths=.5, cmap="YlGnBu")


# In[9]:


plotData = normalized_df[["Category", "Calories", "Total Fat", "Carbohydrates", "Protein"]]
sns.set(style="ticks")
sns.pairplot(plotData, hue="Category")


# In[10]:


# 3-D scatter of the macronutrients, coloured by calories.
x = dataset["Total Fat"]
y = dataset["Carbohydrates"]
z = dataset["Protein"]
c = dataset["Calories"]

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter(x, y, z, c=c, cmap='viridis', linewidth=0.5)
ax.view_init(25, 55)


# In[11]:


plotData = normalized_df[["Category", "Calories", "Cholesterol", "Sodium", "Sugars"]]
sns.set(style="ticks")
sns.pairplot(plotData, hue="Category")


# In[12]:


# 3-D scatter of cholesterol / sodium / sugars, coloured by calories.
x = dataset["Cholesterol"]
y = dataset["Sodium"]
z = dataset["Sugars"]
c = dataset["Calories"]

fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter(x, y, z, c=c, cmap='viridis', linewidth=0.5)
ax.view_init(25, 70)


# In[13]:


# Sweep the number of clusters and record the Davies-Bouldin score of each
# fit (lower is better). Clustering runs on the raw, un-normalized values.
points = dataset.drop(LABEL_COLUMNS, axis=1)

dbScoreList = []
for i in range(2, 21):
    kmeans = KMeans(n_clusters=i, n_init=KMEANS_N_INIT, random_state=RANDOM_STATE)
    clusters = kmeans.fit_predict(points)
    dbScoreList.append(davies_bouldin_score(points, clusters))

dbScoreList


# In[14]:


# Final model with the chosen number of clusters.
clusterCount = 4
kmeans = KMeans(n_clusters=clusterCount, n_init=KMEANS_N_INIT, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(points)
clusters


# In[15]:


# Work on a copy so that `dataset` itself never gains a 'Cluster' column;
# otherwise re-running cell 13 would feed the old labels back into k-means.
dataPlusClusters = dataset.copy()
dataPlusClusters['Cluster'] = clusters
dataPlusClusters


# In[16]:


# List the menu items that landed in each cluster.
for i in range(clusterCount):
    foodText = list(dataPlusClusters[dataPlusClusters['Cluster'] == i]['Item'])
    pprint.pprint(foodText)
    print("----------------------------------------------------------")


# In[17]:


# Replace the category label with the cluster label and order rows by
# cluster. `clusters` has one entry per remaining row because the outlier
# was dropped from both `dataset` and `df` before clustering.
normalized_df = normalized_df.drop(["Category"], axis=1)
normalized_df["Cluster"] = clusters
normalized_df = normalized_df.sort_values(by=['Cluster'])
normalized_df.head(5)


# In[18]:


fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(normalized_df, ax=ax)


# In[ ]: