from google.colab import drive
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

drive.mount('/content/gdrive')

df = pd.read_csv("/content/gdrive/My Drive/datasets/nyc_italian.csv")

# Save a copy of the original df before applying any transformations; we'll need it later for plotting.
original_df = df.copy()

# Inspect a few rows
print("Sample:")
display(df.sample(5))

X = df[["Price", "Food", "Decor", "Service", "East", "latitude", "longitude"]]

# Fit the scaler and transform the features
scaler = StandardScaler()
X = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)

# Determine how many principal components to keep:
# fit a full PCA, then count the components that explain more than 10% of the variance.
pca = PCA(n_components=None)
pca_temp = pca.fit(X)
n_components = sum(pca_temp.explained_variance_ratio_ > 0.1)
print(f"Number of components with explained variance ratio > 0.1: {n_components}")

# Refit PCA with the chosen number of components
pca = PCA(n_components=n_components)
X = pd.DataFrame(
    pca.fit_transform(X),
    index=X.index
)

# Cluster the PCA-transformed features with k-means
n_clusters = 4
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
train_clusters = kmeans.labels_

# Attach the cluster labels to the untransformed data and summarize each cluster
original_df["kmeans"] = kmeans.labels_
clusters_df = original_df.drop(["Case", "Restaurant"], axis=1).groupby("kmeans").mean()
print("Clusters:")
display(clusters_df)

import seaborn as sns
import matplotlib.pyplot as plt
import folium

fmap = folium.Map(location=[40.7128, -74.0060], zoom_start=12)

colors = ['beige', 'lightblue', 'gray', 'blue', 'darkred', 'lightgreen',
          'purple', 'red', 'green', 'lightred', 'white', 'darkblue',
          'darkpurple', 'cadetblue', 'orange', 'pink', 'lightgray', 'darkgreen']

# Plot each restaurant on the folium map by its latitude and longitude,
# colored by its cluster assignment.
for index, row in original_df.iterrows():
    color = colors[kmeans.labels_[index]]
    description = (
        f"{row['Restaurant']} price={row['Price']} food={row['Food']} "
        f"decor={row['Decor']} service={row['Service']}"
    )
    folium.Marker(
        [row["latitude"], row["longitude"]],
        popup=description,
        icon=folium.Icon(color=color),
    ).add_to(fmap)

# Display the map
display(fmap)
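
# Optional: n_clusters = 4 above is hard-coded. A minimal sketch of one common
# alternative, assuming we want to choose k by silhouette score on the same
# PCA-transformed X (the candidate range 2..8 is an arbitrary choice for illustration):
from sklearn.metrics import silhouette_score

scores = {}
for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X)
    scores[k] = silhouette_score(X, labels)  # higher means better-separated clusters

print("Silhouette scores by k:", scores)
best_k = max(scores, key=scores.get)
print(f"Best k by silhouette score: {best_k}")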