from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

drive.mount('/content/gdrive')

df = pd.read_csv('/content/gdrive/My Drive/datasets/penguins.csv')

# Drop rows with missing values, plus two specific rows by index
df = df.dropna()
df = df.drop([9, 14])

# Inspect the results
df.head()

df.body_mass_g.hist()

from sklearn.model_selection import train_test_split

# Create train/test split (20% test)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
print(f"Training set size: {len(df_train)}")
print(f"Test set size: {len(df_test)}")

# One-hot encode training data (dropping the invalid "sex_." column if present)
df_train = pd.get_dummies(df_train).drop("sex_.", axis=1, errors="ignore")

# One-hot encode test data, then align its columns with the training columns
# so both frames have identical dummy columns in the same order
df_test = pd.get_dummies(df_test).drop("sex_.", axis=1, errors="ignore")
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

df_train

scaler = StandardScaler()

# Fit scaler on training data and transform
X_train = pd.DataFrame(
    scaler.fit_transform(df_train),
    columns=df_train.columns,
    index=df_train.index
)

# Transform test data using the fitted scaler
X_test = pd.DataFrame(
    scaler.transform(df_test),
    columns=df_test.columns,
    index=df_test.index
)

X_test.plot(kind="scatter", x="culmen_length_mm", y="body_mass_g")

# First, determine a suitable number of components using the training data
pca = PCA(n_components=None)
pca.fit(X_train)
n_components = sum(pca.explained_variance_ratio_ > 0.1)
print(f"Number of components with explained variance ratio > 0.1: {n_components}")

# Now fit PCA with that number of components
pca = PCA(n_components=n_components)
X_train = pd.DataFrame(
    pca.fit_transform(X_train),
    index=X_train.index
)

# Transform test data
X_test = pd.DataFrame(
    pca.transform(X_test),
    index=X_test.index
)

X_test

X_test.plot(kind="scatter", x=0, y=1)

# Elbow method: compute inertia for k = 1..9
inertia = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_train)
    inertia.append(kmeans.inertia_)

plt.plot(range(1, 10), inertia, marker="o")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.title("Elbow Method")
plt.show()

n_clusters = 4

# Fit K-means on training data
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_train)

# Get cluster assignments for training data
train_clusters = kmeans.labels_

# Visualize the training clusters on the first two principal components
plt.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], c=train_clusters, cmap="viridis")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.title(f"K-means Clustering on Training Data (K={n_clusters})")
plt.show()

# Predict clusters for test data
y_pred = kmeans.predict(X_test)

# Show cluster distribution in test set
print("Test set cluster distribution:")
print(pd.Series(y_pred).value_counts().sort_index())

# Visualize test set predictions
plt.figure(figsize=(12, 5))

# Plot training data
plt.subplot(1, 2, 1)
plt.scatter(X_train.iloc[:, 0], X_train.iloc[:, 1], c=train_clusters, cmap="viridis", alpha=0.6)
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.title(f"Training Data Clusters (K={n_clusters})")

# Plot test data
plt.subplot(1, 2, 2)
plt.scatter(X_test.iloc[:, 0], X_test.iloc[:, 1], c=y_pred, cmap="viridis", alpha=0.6)
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.title("Test Data Predictions")

plt.tight_layout()
plt.show()

# This clustering could be useful for:
# - Identifying different penguin subgroups for conservation efforts
# - Understanding natural groupings in the population
# - Feature engineering for supervised learning tasks
print("\nClustering complete! These groups could represent different penguin subpopulations or behavioral patterns.")
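
# --- Optional sanity check (not part of the original analysis) ---
# A minimal sketch of double-checking the elbow-based choice of k with
# silhouette scores. It assumes the X_train and n_clusters defined above
# and uses sklearn.metrics.silhouette_score; scores closer to 1 indicate
# tighter, better-separated clusters.
from sklearn.metrics import silhouette_score

for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, random_state=42).fit_predict(X_train)
    score = silhouette_score(X_train, labels_k)
    marker = " <- chosen k" if k == n_clusters else ""
    print(f"k={k}: silhouette = {score:.3f}{marker}")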