#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import warnings

import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from dotenv import load_dotenv
from pyro.optim import Adam
from scipy.stats import gaussian_kde, norm
from skopt import BayesSearchCV
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    silhouette_score,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from statsmodels.graphics.gofplots import qqplot
from torch.utils.data import DataLoader, TensorDataset

get_ipython().run_line_magic('matplotlib', 'inline')

# Set font for plots
plt.rcParams['font.family'] = 'Arial'

# Silence warnings for the rest of the session.
# NOTE: this suppresses EVERY warning category, not only RuntimeWarning.
warnings.filterwarnings("ignore")


# In[2]:

# Load environment variables from the .env file so that credentials are not
# hard-coded in the notebook
load_dotenv()

# Get database connection details from env variables
# (os.environ.get returns None for any variable that is not set)
user = os.environ.get('DB_USER')
password = os.environ.get('DB_PASSWORD')
host = os.environ.get('DB_HOST')
database = os.environ.get('DB_DATABASE')

# Establish a MySQL connection
connection = mysql.connector.connect(
    user=user,
    password=password,
    host=host,
    database=database
)


# ```MySQL
# -- Create the database
# CREATE DATABASE IF NOT EXISTS test;
#
# -- Use the database
# USE test;
#
# -- Create the initial table
# CREATE TABLE IF NOT EXISTS Student_Stress (
#     `id` INT AUTO_INCREMENT PRIMARY KEY,
#     `Kindly Rate your Sleep Quality` VARCHAR(255),
#     `How many times a week do you suffer headaches?` VARCHAR(255),
#     `How would you rate your academic performance?` VARCHAR(255),
#     `how would you rate your study load?` VARCHAR(255),
#     `How many times a week you practice extracurricular activities?` VARCHAR(255),
#     `How would you rate your stress levels?` VARCHAR(255)
# );
#
# -- ... data to insert...
# -- (the listing was truncated at this point in the original notebook)
#
# -- Alter the table structure
# ALTER TABLE Student_Stress
# DROP COLUMN `Kindly Rate your Sleep Quality`;
#
# ALTER TABLE Student_Stress
# ADD COLUMN `Sleep_Quality` FLOAT,
# ADD COLUMN `Frequency_of_Headaches` FLOAT,
# ADD COLUMN `Academic_Performance_Score` FLOAT,
# ADD COLUMN `Study_Load_Hours` FLOAT,
# ADD COLUMN `Extracurricular_Frequency` FLOAT,
# ADD COLUMN `Stress_Level` FLOAT,
# ADD COLUMN `Performance_to_Load_Ratio` FLOAT,
# ADD COLUMN `High_sports_Low_Sleep` FLOAT,
# ADD COLUMN `Overall_Burden` FLOAT,
# ADD COLUMN `Balanced_Lifestyle` FLOAT;
#
# -- Populate the newly added columns
# UPDATE Student_Stress
# SET `Performance_to_Load_Ratio` = Study_Load_Hours / Academic_Performance_Score,
#     `High_sports_Low_Sleep` = Extracurricular_Frequency / Sleep_Quality,
#     `Overall_Burden` = Study_Load_Hours + Extracurricular_Frequency + Frequency_of_Headaches,
#     `Balanced_Lifestyle` = Sleep_Quality + 0.5 * Extracurricular_Frequency - 0.2 * Frequency_of_Headaches;
#
# -- Categorise headaches
# UPDATE Student_Stress
# SET `Frequency_of_Headaches` =
#     CASE
#         WHEN `Frequency_of_Headaches` < 2 THEN 'rare'
#         WHEN `Frequency_of_Headaches` <= 3 THEN 'common'
#         ELSE 'frequent'
#     END;
#
# -- Categorise sleep quality
# UPDATE Student_Stress
# SET `Sleep_Quality` =
#     CASE
#         WHEN `Sleep_Quality` <= 2 THEN 'poor'
#         WHEN `Sleep_Quality` <= 4 THEN 'average'
#         ELSE 'excellent'
#     END;
#
# -- Categorise extracurricular activity
# UPDATE Student_Stress
# SET `Extracurricular_Frequency` =
#     CASE
#         WHEN `Extracurricular_Frequency` <= 2 THEN 'low'
#         WHEN `Extracurricular_Frequency` <= 4 THEN 'moderate'
#         ELSE 'high'
#     END;
#
# -- Categorise stress level (assuming it's already a numerical value)
# UPDATE Student_Stress
# SET `Stress_Level` =
#     CASE
#         WHEN `Stress_Level` <= 2 THEN 'low'
#         WHEN `Stress_Level` <= 4 THEN 'moderate'
#         ELSE 'high'
#     END;
#
# -- Clean up
# COMMIT;
# ```
#
# # Feature Engineering
# 1. **Performance-to-Load Ratio:**
#
# $$\text{Performance\_to\_Load\_Ratio} = \frac{\text{Study\_Load\_Hours}}{\text{Academic\_Performance\_Score}}$$
#
# This ratio relates study load hours to academic performance. Despite its name, it is computed as
# load divided by performance (see the UPDATE statement above), so a *lower* ratio suggests that the
# student achieves better academic performance with less study load, which may lower stress; a
# higher ratio indicates a heavy study load relative to the performance achieved.
#
# 2. **High_sports_Low_Sleep:**
#
# $$\text{High\_sports\_Low\_Sleep} = \frac{\text{Extracurricular\_Frequency}}{\text{Sleep\_Quality}}$$
#
# This ratio indicates the balance between involvement in extracurricular activities and the quality
# of sleep. A higher ratio suggests that the student is highly involved in extracurricular activities
# relative to their sleep quality (many activities, poor sleep), which may raise stress; a lower
# ratio suggests activities are kept in proportion to good sleep.
#
# 3. **Overall_Burden:**
#
# $$\text{Overall\_Burden} = \text{Study\_Load\_Hours} + \text{Extracurricular\_Frequency} + \text{Frequency\_of\_Headaches}$$
#
# This metric combines study load hours, extracurricular activity frequency, and headache frequency.
# It provides an overall measure of the student's academic and extracurricular workload and the
# frequency of headaches, which could potentially increase stress.
#
# 4. **Balanced Lifestyle:**
#
# $$\text{Balanced\_Lifestyle} = \text{Sleep\_Quality} + 0.5 \times \text{Extracurricular\_Frequency} - 0.2 \times \text{Frequency\_of\_Headaches}$$
#
# This metric attempts to quantify the balance in the student's lifestyle by considering sleep
# quality, extracurricular activity frequency, and the impact of headaches. It combines positive
# factors (sleep, activities) with a negative one (headaches); higher values are expected to go with
# a low or average stress level.
# In[3]:

# Pull the complete Student_Stress table into a pandas DataFrame
query = 'SELECT * FROM Student_Stress'
df = pd.read_sql(query, con=connection)


# In[4]:

# Quick inspection: first rows, per-column null counts, summary statistics.
# Each heading is printed immediately before its table is displayed.
for heading, table in (
    ("First few rows of the dataset:", df.head()),
    ("\nMissing values:", df.isnull().sum()),
    ("\nSummary statistics:", df.describe()),
):
    print(heading)
    display(table)


# In[5]:

# Replace the survey-question headers with short names
# (assigning six names requires the frame to have exactly six columns)
df.columns = [
    'Sleep_Quality',
    'Frequency_of_Headaches',
    'Academic_Performance_Score',
    'Study_Load_Hours',
    'Extracurricular_Frequency',
    'Stress_Level',
]

# Seaborn theme for the following figures
sns.set(style="dark")

# One histogram per column: melt to long form (variable / value) and facet on "variable"
long_form = df.melt()
g = sns.FacetGrid(long_form, col="variable", col_wrap=3, height=6, sharey=False)
g.map(sns.histplot, "value", bins=5)

# Per-facet titles and shared axis labels
g.set_titles("Distribution of {col_name}", size=12)
g.set_axis_labels("Value", "Frequency")

# Leave head-room at the top of the figure for the overall title
plt.subplots_adjust(top=0.9)
g.fig.suptitle("Distribution of Features", size=16)
plt.show()


# In[6]:

# Heatmap of the pairwise Pearson correlation coefficients
corr_matrix = df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Pearson Correlation Coefficients')
plt.show()


# In[7]:

# Features are every column except the last one (Stress_Level)
df_features = df.iloc[:, :-1]
features_array = df_features.to_numpy(dtype=np.float32)

# Features and labels as float32 PyTorch tensors
features_tensor = torch.tensor(features_array, dtype=torch.float32)
stress_labels_tensor = torch.tensor(
    df['Stress_Level'].to_numpy(),
    dtype=torch.float32,  # labels are stored as floats here
)

# Wrap both tensors in a dataset and a shuffling DataLoader
dataset = TensorDataset(features_tensor, stress_labels_tensor)
batch_size = 32  # Set your desired batch size
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_tensor, stress_labels_tensor, test_size=0.2, random_state=42
)


# In[8]:

# ---------------------------------------------------------------------------
# Helpers shared by several cells below. They replace code that used to be
# copy-pasted between cells.
# ---------------------------------------------------------------------------

KNN_MODEL_NAME = 'k-Nearest Neighbors'


def _show_confusion_matrix(y_true, y_hat):
    """Plot the confusion matrix of the predictions as a heatmap and return it."""
    matrix = confusion_matrix(y_true, y_hat)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()
    return matrix


def _fit_and_report(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model``, print train/test accuracy, show the confusion matrix and report.

    Returns:
        Tuple ``(predictions, train_accuracy, test_accuracy, confusion_matrix)``.
    """
    model.fit(X_tr, y_tr)
    y_hat = model.predict(X_te)
    train_acc = model.score(X_tr, y_tr)
    test_acc = accuracy_score(y_te, y_hat)
    print(f'Training Accuracy: {train_acc}')
    print(f'Test Accuracy: {test_acc}')
    matrix = _show_confusion_matrix(y_te, y_hat)
    print('Classification Report:')
    print(classification_report(y_te, y_hat))
    return y_hat, train_acc, test_acc, matrix


def _build_search_spaces():
    """Return fresh estimators paired with their BayesSearchCV search spaces."""
    return {
        'Random Forest': (RandomForestClassifier(), {'n_estimators': (10, 300), 'max_depth': (1, 20)}),
        'SVM': (SVC(), {'C': (0.1, 10.0), 'gamma': (0.01, 1.0)}),
        'Decision Tree': (DecisionTreeClassifier(), {'max_depth': (1, 20)}),
        KNN_MODEL_NAME: (KNeighborsClassifier(), {'n_neighbors': [1, 10], 'weights': ['uniform', 'distance'], 'p': [1, 3]}),
    }


def _tune_and_compare(models, X_tr, X_te, y_tr, y_te, feature_names):
    """Tune every model with BayesSearchCV and collect evaluation metrics.

    Args:
        models: Mapping ``name -> (estimator, search_space)``.
        X_tr, X_te, y_tr, y_te: Train/test split to tune and evaluate on.
        feature_names: Names printed next to tree-based feature importances.

    Returns:
        Tuple ``(metrics, best_params_by_model)``. ``metrics['Accuracy']`` holds
        the best cross-validated accuracy of the search; F1, precision and recall
        are macro averages measured on the test split.
    """
    metrics = {'Accuracy': {}, 'F1 Score': {}, 'Precision': {}, 'Recall': {}}
    best_params_by_model = {}

    for model_name, (model, param_space) in models.items():
        print(f"Results for {model_name}:")

        # Use Bayesian search for hyperparameter optimisation
        bayes_search = BayesSearchCV(model, param_space, n_iter=40, n_jobs=-1, cv=7, scoring='accuracy')
        bayes_search.fit(X_tr, y_tr)

        best_model = bayes_search.best_estimator_
        best_score = bayes_search.best_score_
        best_params_by_model[model_name] = bayes_search.best_params_

        print(f"Best Parameters: {bayes_search.best_params_}")
        print(f"Best Accuracy: {best_score}")

        # Print classification report for all models
        y_hat = best_model.predict(X_te)
        report = classification_report(y_te, y_hat, output_dict=True)
        print("Classification Report:")
        print(classification_report(y_te, y_hat))
        print("\n")

        # Only the tree-based models expose feature_importances_
        if isinstance(best_model, (RandomForestClassifier, DecisionTreeClassifier)):
            print("Feature Importance:")
            for feature, importance in zip(feature_names, best_model.feature_importances_):
                print(f"{feature}: {importance}")
            print("\n")

        # Store metrics for each model
        macro_avg = report['macro avg']
        metrics['Accuracy'][model_name] = best_score
        metrics['F1 Score'][model_name] = macro_avg['f1-score']
        metrics['Precision'][model_name] = macro_avg['precision']
        metrics['Recall'][model_name] = macro_avg['recall']

    return metrics, best_params_by_model


def _plot_metric_comparison(metrics):
    """Draw one bar chart per metric (2 x 2 grid) on a base-2 log y-axis."""
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
    fig.suptitle('Metrics Comparison for Each Model')
    for ax, (metric, scores) in zip(axes.flat, metrics.items()):
        ax.bar(list(scores.keys()), list(scores.values()), color='blue', alpha=0.7)
        # `basey` was removed from Matplotlib; the keyword is now `base`
        ax.set_yscale('log', base=2)
        ax.set_ylabel(metric)
        ax.set_title(metric)
    plt.tight_layout()
    plt.show()


def _elbow_cluster_count(sse, min_clusters=2):
    """Pick the elbow of an SSE curve, where ``sse[i]`` belongs to ``i + 1`` clusters.

    The elbow is the cluster count after which the SSE improvement shrinks the
    most relative to the previous improvement. The result is never below
    ``min_clusters`` because a silhouette score needs at least two clusters.
    """
    drops = -np.diff(np.asarray(sse, dtype=float))  # drops[i]: gain from i+1 -> i+2 clusters
    if drops.size < 2:
        return min_clusters
    earlier, later = drops[:-1], drops[1:]
    # Leave ratios at +inf where the earlier gain is zero (avoids 0/0 -> NaN)
    ratios = np.full(earlier.shape, np.inf)
    usable = earlier > 0
    ratios[usable] = later[usable] / earlier[usable]
    if not np.isfinite(ratios).any():
        return min_clusters
    # ratios[j] compares the gains around j + 2 clusters, hence the +2 offset
    return max(int(np.argmin(ratios)) + 2, min_clusters)


# Define the Random Forest model with the best parameters
# (taken from the Bayesian optimisation further down)
rf_model = RandomForestClassifier(max_depth=20, min_samples_split=2, n_estimators=300)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Convert the PyTorch tensor X_train back to a pandas DataFrame
X_train_df = pd.DataFrame(X_train.numpy(), columns=df.columns[:-1])

# Pair every feature name with its importance in the fitted forest
feature_importance = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X_train_df.columns,
    'Importance': feature_importance
})

# Most important features first
feature_importance_df_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

# Predict on the test set and visualise the confusion matrix
y_pred = rf_model.predict(X_test)
conf_matrix = _show_confusion_matrix(y_test, y_pred)

# Display the sorted feature importance DataFrame
print("Sorted Feature Importance:")
print(feature_importance_df_sorted)

# Print classification report
print('Classification Report:')
print(classification_report(y_test, y_pred))


# In[9]:

# Query the altered table and append its last four columns to df for
# visualisation. These are expected to be the engineered features.
# NOTE(review): the concat aligns on the row index, so both tables must
# return rows in the same order - confirm.
query = 'SELECT * FROM Student_Stress_Altered'

# Load data into a Pandas DataFrame
df_query = pd.read_sql(query, con=connection)

# Extract the last four columns from the loaded DataFrame
last_four_columns = df_query.iloc[:, -4:]

# Add the last four columns to the working DataFrame
df = pd.concat([df, last_four_columns], axis=1)
df  # notebook cell output; has no effect when run as a script


# In[10]:

df = df.astype(float)

# Min-max normalise every column
min_val = df.min()
max_val = df.max()
df = (df - min_val) / (max_val - min_val) * 4 + 1  # Scale between 1 and 5
df  # notebook cell output; has no effect when run as a script


# In[11]:

def univariate_numerical_plot(data, column):
    """Draw a box plot, a histogram with KDE and mean line, and a QQ plot for one column.

    The column is read into a private series, so ``data`` itself is never
    modified. Object-typed columns are coerced to numbers and missing values
    are dropped before plotting.

    Args:
        data: DataFrame that contains ``column``.
        column: Name of the column to plot.

    Raises:
        ValueError: If the column holds no numeric values after coercion.
    """
    values = data[column]
    if values.dtype == 'object':
        values = pd.to_numeric(values, errors='coerce')
    values = values.dropna()
    if values.empty:
        raise ValueError(f'Column {column!r} has no numeric values to plot')

    plt.figure(figsize=(12, 4))

    # Box plot
    plt.subplot(1, 3, 1)
    values.to_frame(name=column).boxplot(column=[column])
    plt.title(f'Box Plot for {column}')

    # Histogram with mean line and KDE
    plt.subplot(1, 3, 2)
    plt.hist(values, bins=5, density=True, color='skyblue', edgecolor='black')

    # gaussian_kde cannot be fitted to a constant sample, so skip the curve then
    if values.nunique() > 1:
        xmin, xmax = plt.xlim()
        grid = np.linspace(xmin, xmax, 100)
        kde = gaussian_kde(values)
        plt.plot(grid, kde(grid), color='red', linestyle='dashed', linewidth=2, label='KDE')

    plt.axvline(values.mean(), color='green', linestyle='dashed', linewidth=2, label='Mean')
    plt.xlabel(column)
    plt.ylabel('Density')
    plt.title(f'Univariate Numerical Plot for {column}')
    plt.legend()

    # QQ plot
    plt.subplot(1, 3, 3)
    qqplot(values, line='s', ax=plt.gca())
    plt.title(f'QQ Plot for {column}')

    plt.tight_layout()
    plt.show()


numerical_cols_to_plot = ['Sleep_Quality', 'Frequency_of_Headaches', 'Academic_Performance_Score', 'Study_Load_Hours',
                          'Extracurricular_Frequency', 'Stress_Level', 'Performance_to_Load_Ratio',
                          'High_sports_Low_Sleep', 'Overall_Burden', 'Balanced_Lifestyle']

for column_name in numerical_cols_to_plot:
    univariate_numerical_plot(df, column_name)


# In[12]:

corr_matrix = df.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Pearson Correlation Coefficients')
plt.show()


# In[13]:

numerical_cols_to_plot = ['Sleep_Quality', 'Frequency_of_Headaches', 'Academic_Performance_Score', 'Study_Load_Hours',
                          'Extracurricular_Frequency', 'Performance_to_Load_Ratio', 'High_sports_Low_Sleep',
                          'Overall_Burden', 'Balanced_Lifestyle']

# Rows needed to fit every plot into a grid of num_cols columns (ceiling division)
num_cols = 3
num_rows = int(np.ceil(len(numerical_cols_to_plot) / num_cols))

# squeeze=False keeps `axes` two-dimensional even when there is a single row
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 5 * num_rows), squeeze=False)
axes = axes.flatten()

# Joint density of every feature (x-axis) against 'Stress_Level' (y-axis)
for ax, col in zip(axes, numerical_cols_to_plot):
    sns.kdeplot(data=df, x=col, y='Stress_Level', fill=True, cmap='Blues', thresh=0, levels=30, ax=ax)
    ax.set_ylabel('Stress_Level')  # the y-axis shows the stress level, not a density
    ax.set_xlabel(col)

# Hide grid cells that have no plot
for unused_ax in axes[len(numerical_cols_to_plot):]:
    unused_ax.set_visible(False)

fig.suptitle('Probability Density Plots against Stress Levels')
plt.tight_layout()
plt.show()


# In[14]:

# Feature columns
X_columns = ['Sleep_Quality', 'Frequency_of_Headaches', 'Academic_Performance_Score', 'Study_Load_Hours',
             'Extracurricular_Frequency', 'Performance_to_Load_Ratio', 'High_sports_Low_Sleep', 'Overall_Burden',
             'Balanced_Lifestyle']

# Target variable
y_column = 'Stress_Level'

# Extract features (X) and target variable (y)
X = df[X_columns]
y = df[y_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tune all models on the normalised numeric data and compare them
models = _build_search_spaces()
metrics, best_params_by_model = _tune_and_compare(models, X_train, X_test, y_train, y_test, X_columns)
_plot_metric_comparison(metrics)

# The cells below use the tuned k-NN parameters. They are looked up by name
# instead of relying on k-NN being the last model tuned in the loop.
best_params = best_params_by_model[KNN_MODEL_NAME]


# In[15]:

best_params  # notebook cell output; has no effect when run as a script


# In[16]:

# k-Nearest Neighbors model with fixed hyperparameters
knn_model = KNeighborsClassifier(n_neighbors=1, p=2, weights='distance')
y_pred, train_accuracy, test_accuracy, conf_matrix = _fit_and_report(
    knn_model, X_train, X_test, y_train, y_test
)


# In[17]:

# Now we query the actual feature-engineered data set
query = 'SELECT * FROM Student_Stress_Altered'

# Load data into a Pandas DataFrame
df = pd.read_sql(query, con=connection)
df  # notebook cell output; has no effect when run as a script


# In[18]:

from sklearn.preprocessing import normalize

label_encoder = LabelEncoder()

# Encode the categorical columns as integers. The encoder is re-fitted for
# each column, so afterwards it holds the mapping of 'Stress_Level' only.
for categorical_column in ('Sleep_Quality', 'Frequency_of_Headaches', 'Extracurricular_Frequency', 'Stress_Level'):
    df[categorical_column] = label_encoder.fit_transform(df[categorical_column])


# In[19]:

df  # notebook cell output; has no effect when run as a script


# In[20]:

# Same feature columns and target as above, now taken from the encoded table
X_columns = ['Sleep_Quality', 'Frequency_of_Headaches', 'Academic_Performance_Score', 'Study_Load_Hours',
             'Extracurricular_Frequency', 'Performance_to_Load_Ratio', 'High_sports_Low_Sleep', 'Overall_Burden',
             'Balanced_Lifestyle']

# Target variable
y_column = 'Stress_Level'

# Extract features (X) and target variable (y)
X = df[X_columns]
y = df[y_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# In[21]:

# Repeat the tuning and comparison on the feature-engineered, label-encoded data
models = _build_search_spaces()
metrics, best_params_by_model = _tune_and_compare(models, X_train, X_test, y_train, y_test, X_columns)
_plot_metric_comparison(metrics)
best_params = best_params_by_model[KNN_MODEL_NAME]


# In[22]:

# k-Nearest Neighbors model with the tuned hyperparameters
knn_model = KNeighborsClassifier(
    n_neighbors=best_params['n_neighbors'],
    p=best_params['p'],
    weights=best_params['weights'],
)
y_pred, train_accuracy, test_accuracy, conf_matrix = _fit_and_report(
    knn_model, X_train, X_test, y_train, y_test
)


# In[23]:

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

classes = [0, 1, 2]

# Binarize the labels: one indicator column per class
y_train_bin = label_binarize(y_train, classes=classes)
y_test_bin = label_binarize(y_test, classes=classes)

# One-vs-rest wrapper around the tuned k-Nearest Neighbors model
knn_model = OneVsRestClassifier(KNeighborsClassifier(
    n_neighbors=best_params['n_neighbors'],
    p=best_params['p'],
    weights=best_params['weights'],
))

# Train the model
knn_model.fit(X_train, y_train_bin)

# Per-class scores for the test set
y_score = knn_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class (keyed by class position)
fpr = {}
tpr = {}
roc_auc = {}
for i, _ in enumerate(classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curves
plt.figure(figsize=(8, 8))
colors = ['red', 'green', 'blue']  # one colour per class
for i, color in zip(range(len(classes)), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'Class {i} (AUC = {roc_auc[i]:.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve for each class')
plt.legend(loc='lower right')
plt.show()


# In[25]:

query = 'SELECT * FROM Student_Stress'

# Load data into a Pandas DataFrame
df = pd.read_sql(query, con=connection)

# This is the last query of the script, so release the database connection
connection.close()

# Convert to PyTorch tensor (every column of the table is used for clustering)
features_tensor = torch.tensor(df.values, dtype=torch.float32)

# Apply PCA for visualisation purposes (reduce to 2D)
pca = PCA(n_components=2)
features_2d = pca.fit_transform(features_tensor)

# Elbow method: record the SSE for 1..74 clusters, but never ask for more
# clusters than there are samples
sse = []
max_clusters = min(74, len(df))
num_clusters_range = range(1, max_clusters + 1)
for num_clusters in num_clusters_range:
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    kmeans.fit(features_tensor)
    sse.append(kmeans.inertia_)

# Plot the elbow curve
plt.plot(num_clusters_range, sse, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Distances (SSE)')
plt.show()

# Choose the number of clusters at the elbow. silhouette_score needs between
# 2 and n_samples - 1 clusters, so the choice is capped accordingly.
optimal_num_clusters = min(_elbow_cluster_count(sse), max(len(df) - 1, 2))

# Apply k-means clustering with the optimal number of clusters
kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(features_tensor)

# Visualise the clusters in 2D
plt.scatter(features_2d[:, 0], features_2d[:, 1], c=cluster_labels, cmap='viridis')
plt.title(f'K-Means Clustering with {optimal_num_clusters} Clusters')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

# Calculate and print the silhouette score
silhouette_avg = silhouette_score(features_tensor, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")


# # Report and Conclusions
#
# ## Introduction
# In this analysis, we explored a dataset related to student stress, aiming to understand the factors contributing to stress levels and build predictive models for stress classification. We performed a series of data preprocessing steps, feature engineering, and model training. The final models were evaluated based on multiple metrics, and the results are presented below.
#
# ## Data Preprocessing and Feature Engineering
# ### 1. Database Connection and Data Loading
# We established a connection to a MySQL database, loaded the student stress data, and performed initial exploration.
#
# ### 2. Feature Engineering
# We created new features to better capture the relationship between various factors and stress levels. Notable features include:
# - **Performance-to-Load Ratio:** Measures the relationship between study load hours and academic performance.
# - **High_sports_Low_Sleep:** Indicates the balance between involvement in extracurricular activities and the quality of sleep. # - **Overall_Burden:** Combines study load, extracurricular activity frequency, and headache frequency. # - **Balanced_Lifestyle:** Quantifies the balance in the student's lifestyle considering sleep quality, extracurricular activity, and headaches. # # ### 3. Categorisation # We categorised headache frequency, sleep quality, extracurricular activity frequency, and stress levels into discrete classes to facilitate analysis and modeling. # # ### 4. Data Visualisation # We used various visualisations, including histograms, box plots, and heatmaps, to explore the distribution of features and understand relationships between variables. # # ### 5. Correlation Analysis # A heatmap of Pearson correlation coefficients was created to identify relationships between features. This helped in understanding potential multicollinearity and feature importance. # # ## Model Training and Evaluation # ### 1. Data Preparation for Modeling # We converted categorical variables into numerical format using label encoding and split the data into training and testing sets. # # ### 2. Model Selection and Hyperparameter Tuning # We considered four models: Random Forest, Support Vector Machine (SVM), Decision Tree, and k-Nearest Neighbors. Hyperparameter tuning was performed using Bayesian optimisation. # # ### 3. Model Evaluation # Each model was evaluated based on accuracy, F1 score, precision, and recall. Random Forest and SVM emerged as the top-performing models. # # ### 4. Feature Importance # For Random Forest and Decision Tree models, feature importance was analysed to identify variables contributing most to stress level predictions. # # ## Conclusion # ### 1. Model Performance # - **Random Forest and SVM:** These models exhibited high accuracy (>90%) in stress level classification, making them suitable for predicting student stress levels. 
# - **K - Nearest Neighbour:** This model performed best, achieving an overall accuracy of 96%, with similar figures for precision, recall, and F1. # # ### 2. Feature Importance # - **Performance_to_Load_Ratio, High_sports_Low_Sleep, Overall_Burden:** These engineered features played crucial roles in stress level predictions. # - **Sleep_Quality, Study_Load_Hours:** Traditional factors also contributed significantly to the models. # # 1. Balanced Lifestyle (Feature Engineered) # 1. Study Load Hours # 1. High Sports Low Sleep (Feature Engineered) # 1. Performance to Load Ratio (Feature Engineered) # 1. Extracurricular Frequency # 1. Academic Performance Score # 1. Overall Burden (Feature Engineered) # 1. Sleep Quality # 1. Frequency of Headaches # # ### 3. Recommendations # - **Intervention Strategies:** Based on feature importance, interventions focused on improving sleep quality, managing study loads, and promoting a balanced lifestyle could positively impact stress levels. # - **Monitoring and Support:** Implementing a system to monitor and support students with high extracurricular activity involvement and low sleep quality may be beneficial. # # ### 4. Limitations and Future Work # - **Data Limitations:** The quality of predictions is dependent on the available data. Future work could involve collecting additional variables to enhance model performance. # - **External Factors:** Consideration of external factors affecting stress levels, such as personal issues or external pressures, could provide a more comprehensive understanding. # # ### 5. Final Recommendations # - **Implement Predictive Model:** Deploy the Random Forest or SVM model in a real-world setting to assist in early identification and support for students experiencing high stress levels. # - **Continuous Monitoring:** Regularly update the model with new data to ensure its relevance and effectiveness over time. 
# # In conclusion, this analysis provides valuable insights into student stress factors and effective predictive models, offering a foundation for proactive interventions and support systems in educational institutions. # # Using K-Means clustering, students can be grouped into distinct clusters based on similar stress profiles and characteristics. For instance, one cluster may comprise students with consistently low stress levels across various factors, while another cluster may include those experiencing high stress levels in multiple areas such as academic performance, study load, and extracurricular activity frequency. By analysing the characteristics within each cluster, such as sleep quality, academic performance, and self-rated stress levels, common traits and behaviors can be identified, allowing for the recognition of distinct student profiles. This approach enables targeted interventions and support strategies tailored to the specific needs and challenges of different student groups, ultimately promoting student well-being and academic success. # In[ ]: