# Section: Loading Packages and Libraries for Visualizing Dataset and Performing ML Analysis

## Loading Important Libraries

#Before proceeding, ensure to install all the packages listed inside the `library()` function.
# Install ranger package if not already installed
# install.packages("ranger")

# ggplot2: A powerful and flexible plotting system for creating visualizations in R.
library(ggplot2)

# tidyverse: A collection of packages (including ggplot2) that provide a consistent and efficient data manipulation workflow.
library(tidyverse)

# explore: A package for exploratory data analysis, offering various visualizations and summary statistics.
library(explore)

# GGally: Extension to ggplot2, designed for creating ggplot2 visualizations with multiple plots.
library(GGally)

# ggridges: Creates ridge plots for visualizing the distribution of a numeric value by one or more categorical factors.
library(ggridges)

# Metrics: Provides various metrics for evaluating machine learning models, including RMSE and MAE.
library(Metrics)

# car: Companion to Applied Regression package, offering various regression-related functions.
library(car)

# corrgram: Package for creating correlograms to visualize correlation matrices.
library(corrgram)

# corrplot: Package for creating correlation plots.
library(corrplot)

# caret: Classification and Regression Training package, a comprehensive framework for building predictive models.
library(caret)

# randomForest: Implements random forest algorithms for classification and regression.
library(randomForest)

# ranger: Another implementation of random forest algorithms for faster performance.
library(ranger)

# DMwR2: Data Mining with R, provides functions and data sets for teaching data mining.
library(DMwR2)

# ipred: Improved Predictors package, provides bagging and bootstrapping algorithms for predictive modeling.
library(ipred)

# caTools: Provides various tools for data splitting and manipulation, often used in predictive modeling.
library(caTools)

# viridis: A colorblind-friendly color palette for data visualizations.
library(viridis)

# lubridate: Simplifies date and time handling in R.
library(lubridate)

# readxl: Package for reading Excel files.
library(readxl)


library(readxl)

# Provide the correct path to your Excel file
file_path <- "/content/1657875746_day.xlsx"

# Read the Excel file into a data frame
bike_df <- read_excel(file_path)

# Print the first few rows of the data frame using the `head` function
head(bike_df)

# Alternatively, you can use `head.matrix` to print the first few rows
#head.matrix(bike_df)

# Explanation:
# The `head` function is commonly used to display the first few rows of a data frame.
# It helps to quickly inspect the structure and contents of the dataset.

# Print the last few rows of the data frame using the `tail` function
#tail(bike_df)

# Alternatively, you can use `tail.matrix` to print the last few rows
tail.matrix(bike_df)

# Explanation:
# The `tail` function is commonly used to display the last few rows of a data frame.
# It helps to check the end of the dataset and ensures data has been read correctly.


# Using the paste function to concatenate strings and print information about the dataset dimensions
# dim(bikes_df) returns a vector with the number of rows and columns in the dataset
# The concatenated string includes "Dimension of dataset: " followed by the number of rows and columns
paste("Dimension of dataset: ", dim(bike_df))

#Rename the columns
names(bike_df)<-c('record_id','datetime','season','year','month','holiday','weekday','workingday','weather_condition','temp','atemp','humidity','windspeed','casual_users','registerd_users','count')

head(bike_df)


# Summary of the dataset using the summary() function to get a quick overview of key statistics.
summary(bike_df)

# Display the structure of the dataset using the str() function to understand variable types and their distribution.
str(bike_df)

#Typecasting the datetime and numerical attributes to category

# Convert 'datetime' to Date format
bike_df$datetime<- as.Date(bike_df$datetime)
bike_df$year<-as.factor(bike_df$year)
bike_df$month<-as.factor(bike_df$month)
bike_df$season <- as.factor(bike_df$season)
bike_df$holiday<- as.factor(bike_df$holiday)
bike_df$weekday<- as.factor(bike_df$weekday)
bike_df$workingday<- as.factor(bike_df$workingday)
bike_df$weather_condition<- as.factor(bike_df$weather_condition)

# Display the structure of the dataset using the str() function to understand variable types and their distribution.
str(bike_df)


#Missing values in dataset
missing_val<-data.frame(apply(bike_df,2,function(x){sum(is.na(x))}))
names(missing_val)[1]='missing_val'
missing_val

# Section: Visualization of Numerical Variables

# Visualize the relationships and distributions of key numerical variables using a pairs plot.
# This plot includes temperature ('temp'), "feels-like" temperature ('atemp'), humidity ('hum'),
# windspeed ('windspeed'), casual rentals ('casual'), registered rentals ('registered'),
# and the total number of rentals ('cnt'). The plot helps identify potential patterns, correlations,
# and outliers among these variables.

# Reduce the outer margins to avoid "figure margins too large" error
 par(mar = c(0.1, 0.1, 0.1, 0.1))

num_vars <- c("temp", "atemp", "humidity", "windspeed", "casual_users", "registerd_users", "count")
pairs(bike_df[, num_vars],
      main = "Pairs Plot of Numerical Variables",
      col = bike_df$season, # Color points based on the season for additional insights
      pch = 16)  # Use filled circles for better visibility

# Adding custom titles for each panel of the pairs plot.
for (i in 1:length(num_vars)) {
  for (j in 1:length(num_vars)) {
    panel_var1 <- num_vars[i]
    panel_var2 <- num_vars[j]
    panel_title <- paste("Relationship between", panel_var1, "and", panel_var2)
    panel_subtitle <- ifelse(i == j, "Distribution", paste("Colored by Season:", unique(bike_df$season)))
    title(panel_title, line = 2.5, cex.main = 0.8)
    title(panel_subtitle, line = 4, cex.sub = 0.6)
  }
}

# Reset the outer margins to default after creating the plot
par(mar = c(5, 4, 4, 2) + 0.1)


# Section: Exploring Bike Rentals Distribution

# Explore the distribution of bike rentals using a histogram.
# This plot provides an overview of the distribution of total bike rentals ('cnt') per observation.
# Plotting the histogram to visualize the distribution of bike rentals.
hist(bike_df$count,
     main = "Distribution of Bike Rentals",
     xlab = "Number of Rentals",
     col = "#75AADB",  # Custom color for better visibility
     border = "#333333",  # Border color
     breaks = 30,  # Adjust the number of bins
     xlim = c(0, max(bike_df$count) + 50),  # Set x-axis limits for better readability
     las = 1,  # Keep axis labels horizontal
     cex.main = 1.5,  # Increase main title size
     cex.lab = 1.2,  # Increase axis label size
     cex.axis = 1.2)  # Increase axis tick label size

# Adding informative elements to the plot.
abline(v = mean(bike_df$count), col = "red", lwd = 2, lty = 2)  # Add a vertical line for mean
text(mean(bike_df$count) + 10, 250, "Mean", col = "red", font = 2, cex = 1.2)  # Label for the mean

# Adding additional information inside the plot
mtext(" The distribution is slightly right-skewed.", side = 1, line = 2, cex = 1.1)
mtext("The mean number of rentals is marked by a red dashed line.", side = 3, line = 3, cex = 1.1)


# Load the required library for data visualization
library(ggplot2)


# Create a histogram to explore the distribution of the target variable 'count'.
ggplot(bike_df, aes(x = count)) +
  geom_histogram(bins = 30, colour = "black", fill = "#56B4E9") +
  ggtitle("Distribution of Bike Rentals ('count')") +
  xlab("Count Variable") + ylab("Density of the Sample") +
  theme(
    plot.title = element_text(color = "blue", size = 18, face = "bold"),
    axis.title.x = element_text(color = "blue", size = 14),
    axis.title.y = element_text(color = "blue", size = 14)
  ) +
  # Adding informative elements on the plot
  geom_vline(aes(xintercept = mean(bike_df$count)),
             color = "red", linetype = "dashed", linewidth = 1.2) +
  annotate("text", x = mean(bike_df$count) + 10, y = 30,
           label = "Mean", color = "red", size = 5)


# Section: Log Transformation of Bike Rentals

# Create a histogram and density plot after applying log transformation to 'count'
ggplot(bike_df, aes(x = log(count))) +
  geom_histogram(aes(y = after_stat(density)), bins = 30, colour = "black", fill = "grey") +
  geom_density(alpha = 0.2, fill = "cyan") +
  ggtitle("Distribution of Log-Transformed Bike Rentals ('count')") +
  xlab("Log-Transformed Count Variable") + ylab("Density of the Sample") +
  theme(
    plot.title = element_text(color = "black", size = 18, face = "bold"),
    axis.title.x = element_text(color = "blue", size = 14),
    axis.title.y = element_text(color = "blue", size = 14)
  ) +
  # Adding additional information on the plot
  geom_vline(aes(xintercept = mean(log(bike_df$count))),
             color = "red", linetype = "dashed", linewidth = 1.2) +
  annotate("text", x = mean(log(bike_df$count)) + 0.1, y = 0.2,
           label = "Mean (log-transformed)", color = "red", size = 4) +
  geom_vline(aes(xintercept = median(log(bike_df$count))),
             color = "green", linetype = "dashed", linewidth = 1.2) +
  annotate("text", x = median(log(bike_df$count)) + 0.1, y = 0.4,
           label = "Median (log-transformed)", color = "green", size = 4) +
  # Adding insights on the plot
  annotate("text", x = 6, y = 0.6,
           label = "Insights:", color = "red", size = 4, fontface = "bold") +
  annotate("text", x = 6, y = 0.55,
           label = "1. Applying a log transformation has helped in reducing skewness.", color = "red", size = 2) +
  annotate("text", x = 6, y = 0.5,
           label = "2. The mean and median values are marked for reference.", color = "red", size = 2)

# Section: Creating Correlogram with ggpairs

# Create a new plotting device
#dev.new()
#install.packages("viridis")
# Load necessary libraries
library(GGally)
library(viridis)

# Create a correlogram with ggpairs
ggpairs(bike_df, title = "Correlogram with ggpairs()")

ggcorr(bike_df, method = c("everything", "pearson"), label=TRUE) +
  ggtitle("Plot of correlation variables") +
  theme(plot.title = element_text(color="blue", size=18, face="bold"))
cat("With the correlation plot, we can see how the variables that most interfere/correlate with the target are temperature, apparent temperature (atemp), weather, year, season, and finally, wind.\n")


# Load required packages
library(explore)
library(lubridate)


# Extract the year from the 'datetime' variable
bike_df$year_from_date <- year(bike_df$datetime)

# Set display format to show full numeric values
options(scipen = 999)

# Use 'explore' to analyze the dataset, focusing on the 'cnt' variable with the extracted 'year_from_date' as the target
explore(bike_df, count, target = year_from_date)

# Section: Exploring Correlation Among Numerical Variables

# Explore the correlation matrix for numerical variables.
# The matrix and the subsequent heatmap help identify strong correlations among variables.

# Calculate the correlation matrix
cor_matrix <- cor(bike_df[, c("temp", "atemp", "humidity", "windspeed", "casual_users", "registerd_users", "count")])

# Display the correlation matrix
cat("Correlation Matrix:\n")
print(cor_matrix)

# Create a correlation heatmap for a visual representation of variable relationships.
# Strong positive or negative correlations are visually highlighted.
library(corrplot)
# Customize the correlation plot
corrplot(
  cor_matrix,
  method = "color",
  font = 4,
  col = colorRampPalette(c("#4575b4", "#91bfdb", "#e0f3f8", "#fee090", "#d73027"))(100),  # Attractive color scheme
  tl.col = "black",
  tl.srt = 40,
  addCoef.col = "black",
  order = "hclust",
  addrect = 3  # Highlight cells with correlations above a certain threshold
)

# Add informative elements to the plot
title("Correlation heatmap", line = 0.5, cex.main = 1.8)  # Lowercase title
mtext("Strong Positive Correlation", side = 3, line = 2, col = "#d73027", cex = 0.8)
mtext("Strong Negative Correlation", side = 1, line = 3, col = "#4575b4", cex = 0.8)


# Load the required library
library(dplyr)
# Plot season-wise monthly distribution of counts
season_month_plot <- ggplot(bike_df, aes(x = month, y = count, fill = season)) +
  theme_minimal() +
  geom_col(position = "dodge", color = "black", alpha = 0.8) +
  labs(x = 'Month', y = 'Total Count',
       title = 'Season-wise Monthly Distribution of Counts',
       subtitle = 'Comparison of counts across different seasons',
       fill = 'Season') +
  scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00'))  # Custom colors

# Add data labels on top of each bar
season_month_plot <- season_month_plot +
  geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3)

# Highlight the peak month for each season
peak_months <- bike_df %>%
  group_by(season) %>%
  slice(which.max(count))

season_month_plot <- season_month_plot +
  geom_point(data = peak_months, aes(x = month, y = count), color = "red", size = 3) +
  geom_text(data = peak_months, aes(x = month, y = count, label = paste("Peak\nMonth")),
            vjust = -0.5, hjust = 1, color = "red", size = 3)

# Customize legend
season_month_plot <- season_month_plot +
  guides(fill = guide_legend(title = "Season"))

# Print the enhanced season-wise plot
print(season_month_plot)

# Plot weekday-wise monthly distribution of counts
weekday_month_plot <- ggplot(bike_df, aes(x = month, y = count, fill = factor(weekday))) +
  theme_minimal() +
  geom_col(position = "dodge", color = "white", alpha = 0.8) +
  labs(x = 'Month', y = 'Total Count',
       title = 'Weekday-wise Monthly Distribution of Counts',
       subtitle = 'Comparison of counts across different weekdays',
       fill = 'Weekday') +
  scale_fill_discrete(name = 'Weekday', labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) +  # Customize weekday labels

  # Add data labels on top of each bar
  geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3)

# Highlight the peak day of the week
peak_days <- bike_df %>%
  group_by(weekday) %>%
  slice_max(order_by = count)

# Customize legend
weekday_month_plot <- weekday_month_plot +
  guides(fill = guide_legend(title = "Weekday"))

# Print the enhanced weekday-wise plot
print(weekday_month_plot)


# Section: Bike Rentals by Season

# Explore the distribution of bike rentals based on the season using a boxplot.
# This boxplot shows how the total number of rentals ('cnt') varies across different seasons.

# Create a boxplot to visualize bike rentals by season
ggplot(bike_df, aes(x = season, y = count, fill = as.factor(season))) +
  geom_boxplot() +
  ggtitle("Bike Rentals by Season") +
  xlab("Season") + ylab("Number of Rentals") +
  scale_fill_manual(values = c("#FF6666", "#FFCC66", "#99FF99", "#6666FF"), name = "Season") +
  theme_minimal() +
  theme(
    plot.title = element_text(color = "blue", size = 18, face = "bold"),
    axis.title.x = element_text(color = "blue", size = 14),
    axis.title.y = element_text(color = "blue", size = 14),
    legend.position = "top",
    legend.title = element_text(color = "blue", size = 14)
  ) +
  # Adding additional information on the plot
  annotate("text", x = c(1, 2, 3, 4), y = 6000,
           label = c("Spring", "Summer", "Fall", "Winter"),
           color = "blue", size = 4, fontface = "bold") +
  annotate("text", x = 1, y = 7000, label = "Median", color = "red", size = 4) +
  annotate("text", x = 2, y = 7000, label = "Median", color = "red", size = 4) +
  annotate("text", x = 3, y = 7000, label = "Median", color = "red", size = 4) +
  annotate("text", x = 4, y = 7000, label = "Median", color = "red", size = 4) +
  geom_hline(yintercept = median(bike_df$count), linetype = "dashed", color = "red", linewidth = 1.2)


# Section: Violin Plot for Yearly Distribution of Counts
library(ggplot2)
# Violin plot for Yearly wise distribution of counts
yearly_violin_plot <- ggplot(bike_df, aes(x = year, y = count, fill = as.factor(year))) +
  geom_violin() +
  theme_bw() +
  labs(x = 'Year', y = 'Total Count',
       title = 'Yearly Distribution of Counts',
       subtitle = 'Comparison of counts across different years',
       fill = 'Year') +

  # Add data points on top of the violin plot for better insight
  geom_jitter(aes(color = as.factor(year)), width = 0.2, alpha = 0.5) +

  # Customize legend
  scale_fill_manual(values = c('#1F78B4', '#33A02C'), name = 'Year') +
  scale_color_manual(values = c('#1F78B4', '#33A02C'), name = 'Year') +

  # Add a box plot inside the violin plot
  geom_boxplot(width = 0.1, fill = 'white', color = 'black', alpha = 0.7) +

  # Improve plot appearance
  theme(legend.position = 'top') +

  # Add informative annotations
  annotate('text', x = c(0.8, 1.2), y = c(6000, 6000), label = c('2011', '2012'), size = 4, color = 'black') +
  annotate('text', x = 1, y = 8000, label = 'Data Distribution', size = 6, color = 'black', fontface = 'bold') +
  annotate('text', x = 1, y = 7000, label = 'Comparison of counts across different years using violin plots and box plots.', size = 3, color = 'black') +
  annotate('text', x = 0.8, y = 6800, label = 'Data points are added for better insight.', size = 3, color = 'black')

# Print the Violin plot
print(yearly_violin_plot)
cat("From the voilin plot, we can analysis that the bike rental count distribution is highest in year 2012 then the previous year.\nIn the graph, year 0 represent 2011 and year 1 represent 2012 respectively\n")


# Section: Exploring Bike Rentals During Holidays

# Explore summary statistics for the 'count' variable based on the 'holiday' status.
# This analysis provides insights into whether there are notable differences in rentals during holidays.

# Use tapply to calculate summary statistics for 'count' grouped by 'holiday' status
cnt_summary_by_holiday <- tapply(bike_df$count, bike_df$holiday, summary)

# Display the summary statistics
cat("Summary Statistics for 'count' Variable Based on 'holiday' Status:\n")
print(cnt_summary_by_holiday)

# Create an analytical plot to visualize the distribution of rentals during holidays and non-holidays
par(mfrow = c(1, 3))  # Set up a 1x3 grid for side-by-side plots

# Boxplot for Rentals by Holiday Status
boxplot(bike_df$count ~ bike_df$holiday, main = "Boxplot of Rentals by Holiday Status", xlab = "Holiday", ylab = "Number of Rentals", col = "#75AADB")
# Add statistics on the boxplot
# Calculate boxplot statistics
stats <- boxplot.stats(bike_df$count[bike_df$holiday == 1])

stats_text <- paste("Median:", round(stats$stats[3], 2), "\nIQR:", round(IQR(bike_df$count[bike_df$holiday == 1]), 2))
mtext(stats_text, side = 3, line = 0.5, at = 1.8, adj = 0, font = 1, cex = 0.8)

# Histogram for Distribution of Rentals on Non-Holidays
hist(bike_df$count[bike_df$holiday == 0], main = "Distribution of Rentals on Non-Holidays", xlab = "Number of Rentals", col = "#75AADB", border = "#333333", xlim = c(0, max(bike_df$count) + 50))
# Add statistics on the histogram
hist_stats_text <- paste("Mean:", round(mean(bike_df$count[bike_df$holiday == 0]), 2), "\nSD:", round(sd(bike_df$count[bike_df$holiday == 0]), 2))
mtext(hist_stats_text, side = 3, line = 0.5, at = max(bike_df$count[bike_df$holiday == 0]) * 0.8, adj = 0, font = 1, cex = 0.8)
# Add mean and sd lines
abline(v = mean(bike_df$count[bike_df$holiday == 0]), col = "red", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 0]) - sd(bike_df$count[bike_df$holiday == 0]), col = "purple", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 0]) + sd(bike_df$count[bike_df$holiday == 0]), col = "purple", lty = 2, lwd = 2)
# Mark the observation points on the histogram
rug(bike_df$count[bike_df$holiday == 0], col = "blue", lwd = 1.5)

# Histogram for Distribution of Rentals on Holidays
hist(bike_df$count[bike_df$holiday == 1], main = "Distribution of Rentals on Holidays", xlab = "Number of Rentals", col = "#75AADB", border = "#333333", xlim = c(0, max(bike_df$count) + 50))
# Add statistics on the histogram
hist_stats_text_holiday <- paste("Mean:", round(mean(bike_df$count[bike_df$holiday == 1]), 2), "\nSD:", round(sd(bike_df$count[bike_df$holiday == 1]), 2))
mtext(hist_stats_text_holiday, side = 3, line = 0.5, at = max(bike_df$count[bike_df$holiday == 1]) * 0.8, adj = 0, font = 1, cex = 0.8)
# Add mean and sd lines
abline(v = mean(bike_df$count[bike_df$holiday == 1]), col = "red", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 1]) - sd(bike_df$count[bike_df$holiday == 1]), col = "purple", lty = 2, lwd = 2)
abline(v = mean(bike_df$count[bike_df$holiday == 1]) + sd(bike_df$count[bike_df$holiday == 1]), col = "purple", lty = 2, lwd = 2)
# Mark the observation points on the histogram
rug(bike_df$count[bike_df$holiday == 1], col = "blue", lwd = 1.5)


# Section: Analyzing Bike Rentals on Holidays

# Column plot for holiday-wise distribution of counts
holiday_plot <- ggplot(bike_df, aes(x = factor(holiday), y = count, fill = season)) +
  geom_col(position = "dodge", color = "black", alpha = 0.8) +
  theme_minimal() +
  labs(
    x = 'Holiday',
    y = 'Total Count',
    title = 'Bike Rentals Analysis: Holidays vs. Non-Holidays',
    subtitle = 'Comparing counts on holidays and non-holidays across different seasons',
    fill = 'Season'
  ) +
  scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) +  # Custom colors +
  geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
  scale_x_discrete(labels = c("Non-Holiday", "Holiday")) +  # Adding scale for 0 and 1 +
  theme(legend.position = "top") +
  guides(fill = guide_legend(title = "Season")) +
  annotate("text", x = 1.5, y = max(bike_df$count) * 1.2,
           label = "Insight: During non-holidays, bike rental counts are highest compared to holidays for different seasons.",
           color = "red", size = 3, fontface = "bold")

# Print  plot
print(holiday_plot)


# Column plot for working day-wise distribution of counts
workingday_plot <- ggplot(bike_df, aes(x = factor(workingday), y = count, fill = season)) +
  geom_col(position = "dodge", color = "black", alpha = 0.8) +
  theme_minimal() +
  labs(
    x = 'Working Day',
    y = 'Total Count',
    title = 'Bike Rentals Analysis: neiher weekend nor holiday vs. other days',
    subtitle = 'Comparing counts on working days and non-working days across different seasons',
    fill = 'Season'
  ) +
  scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) +  # Custom colors +
  geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
  scale_x_discrete(labels = c("neiher weekend nor holiday", "other days")) +  # Adding scale for 0 and 1 +
  theme(legend.position = "top") +
  guides(fill = guide_legend(title = "Season")) +
  annotate("text", x = 1.5, y = max(bike_df$count) * 1.1,
           label = "Insight: neiher weekend nor holiday - Bike rental count is higher.\n other day - Bike rental count is lower",
           color = "red", size = 3, fontface = "bold")


print(workingday_plot)


# Section: Impact of Weather Conditions on Bike Rentals

# Column plot for weather condition-wise distribution of counts
weather_condition_plot <- ggplot(bike_df, aes(x = factor(weather_condition), y = count, fill = season)) +
  geom_col(position = "dodge", color = "black", alpha = 1) +
  theme_minimal() +
  labs(
    x = 'Weather Condition',
    y = 'Total Count',
    title = 'Impact of Weather Conditions on Bike Rentals',
    subtitle = 'Comparing bike rental counts under different weather conditions across different seasons',
    fill = 'Season'
  ) +
  scale_fill_manual(values = c('#E41A1C', '#377EB8', '#4DAF4A', '#FF7F00')) +  # Custom colors +
  geom_text(aes(label = count), position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
  scale_x_discrete(labels = c("Clear", "Mist + Cloudy", "Light Snow/Light Rain", "Heavy Rain")) +  # Adding scale +
  theme(legend.position = "top") +
  guides(fill = guide_legend(title = "Season")) +
  annotate(
    "text",
    x = 3.0,
    y = max(bike_df$count) * 1.1,
    label = "Insight: 1. Clear - Bike rental count is very high.\n 2. Mist + Cloudy - Bike rental count is second highest.\n 3. Light Snow/Light Rain - Bike rental count is third highest.\n 4. Heavy Rain - there is no data of Bike renting.",
    color = "red",
    size = 3,
    fontface = "bold"
  )

# Print plot
print(weather_condition_plot)


# Convert temperature variables to Celsius
bike_df$temp_celsius <- bike_df$temp * 41  # Assuming temp is in normalized units, adjust accordingly
bike_df$atemp_celsius <- bike_df$atemp * 50  # Assuming atemp is in normalized units, adjust accordingly

# Section: Combined Temperature Analysis

# Convert temperature variables to Celsius
bike_df$temp_celsius <- bike_df$temp * 41  # Assuming temp is in normalized units, adjust accordingly
bike_df$atemp_celsius <- bike_df$atemp * 50  # Assuming atemp is in normalized units, adjust accordingly


# Scatter plot for bike rentals against temperature and apparent temperature in Celsius
combined_temp_plot <- ggplot(bike_df, aes(x = temp_celsius, y = count, color = "Temperature")) +
  geom_point() +
  geom_point(aes(x = atemp_celsius, color = "Apparent Temperature"), alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black",
              aes(group = 1), formula = y ~ x) +  # Add a linear trend line
  theme_minimal() +
  labs(
    x = 'Temperature (Celsius)',
    y = 'Total Count',
    title = 'Bike Rentals vs. Temperature and Apparent Temperature',
    subtitle = 'Scatter plot showing the relationship between bike rentals and temperature variables',
    color = 'Variable'
  ) +
  scale_color_manual(values = c("Temperature" = "blue", "Apparent Temperature" = "red")) +
  annotate("text", x = 25, y = 8000, label = "Trend Line: Linear Regression", color = "black", size = 4) +
  annotate("text", x = 5, y = 3000, label = "Temperature", color = "blue", size = 4) +
  annotate("text", x = 30, y = 6000, label = "Apparent Temperature", color = "red", size = 4)

# Print the combined temperature plot
print(combined_temp_plot)


# Section: Boxplot for Bike Rental Count with Outliers

# Boxplot for bike rental count with outliers
boxplot(bike_df$count, main = 'Bike Rental Count', sub = ifelse(length(boxplot.stats(bike_df$count)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$count)$out)),
        ylab = 'Count', col = "cyan", border = "blue")

# Add statistical values
text(1, boxplot.stats(bike_df$count)$stats[1], paste("Min:", round(boxplot.stats(bike_df$count)$stats[1], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[2], paste("1st Quartile:", round(boxplot.stats(bike_df$count)$stats[2], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[3], paste("Median:", round(boxplot.stats(bike_df$count)$stats[3], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[4], paste("Mean:", round(mean(bike_df$count), 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[5], paste("3rd Quartile:", round(boxplot.stats(bike_df$count)$stats[5], 2)), pos = 4, cex = 1)
text(1, boxplot.stats(bike_df$count)$stats[6], paste("Max:", round(boxplot.stats(bike_df$count)$stats[6], 2)), pos = 4, cex = 1)


# Set up the layout for multiple boxplots
par(mfrow = c(2, 2))

# Box plot for temperature outliers
boxplot(bike_df$temp, main = "Temperature", sub = ifelse(length(boxplot.stats(bike_df$temp)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$temp)$out)),
        col = "#FF6347", border = "#8B0000", notch = TRUE, outline = FALSE)

# Box plot for temperature outliers
boxplot(bike_df$atemp, main = "Feellike-Temperature", sub = ifelse(length(boxplot.stats(bike_df$atemp)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$atemp)$out)),
        col = "pink", border = "red", notch = TRUE, outline = FALSE)

# Box plot for humidity outliers
boxplot(bike_df$humidity, main = "Humidity", sub = ifelse(length(boxplot.stats(bike_df$humidity)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$humidity)$out)),
        col = "#87CEEB", border = "#1E90FF", notch = TRUE, outline = FALSE)

# Box plot for windspeed outliers
boxplot(bike_df$windspeed, main = "Windspeed", sub = ifelse(length(boxplot.stats(bike_df$windspeed)$out) == 0, "No Outliers", paste("Outliers: ", boxplot.stats(bike_df$windspeed)$out)),
        col = "#98FB98", border = "#008000", notch = TRUE, outline = FALSE)


# Outlier Replacement and Imputation
# Section: Replacing and Imputing Outliers in Humidity and Windspeed

# Install the dplyr package if not installed

# Load the required libraries
library(DMwR2)
library(dplyr)

# Create a subset for windspeed and humidity variables
wind_hum <- subset(bike_df, select = c('windspeed', 'humidity'))

# Function to replace outliers with NA
replace_outliers <- function(x) {
  q <- quantile(x, c(0.25, 0.75))
  iqr <- q[2] - q[1]
  lower_bound <- q[1] - 1.5 * iqr
  upper_bound <- q[2] + 1.5 * iqr
  x[x < lower_bound | x > upper_bound] <- NA
  return(x)
}

# Apply the function to each column
wind_hum <- wind_hum %>% mutate(across(everything(), replace_outliers))
wind_hum

# Impute missing values using mean imputation method
wind_hum$windspeed[is.na(wind_hum$windspeed)] <- mean(wind_hum$windspeed, na.rm = TRUE)
wind_hum$humidity[is.na(wind_hum$humidity)] <- mean(wind_hum$humidity, na.rm = TRUE)

#Section : Combining the imputed dataset and original dataset
new_df <- subset(bike_df, select = -c(windspeed, humidity))  # Remove original windspeed and humidity
bike_df <- cbind(new_df, wind_hum)  # Combine new_df and wind_hum data frames

# Display the first 5 rows of the updated dataset
head(bike_df)

summary(bike_df)

# Load the required libraries
library(car)
library(ggplot2)

# Select numerical columns for probability plots in combined dataset
numerical_columns <- sapply(bike_df, is.numeric)

for (column in names(bike_df[, numerical_columns])) {
  hist(bike_df[, column], main = paste("Histogram for", column),
       xlab = column, col = "skyblue", border = "black")
}


# Create normal probability plots for numerical variables in combined dataset
for (column in names(bike_df[, numerical_columns])) {
  qqnorm(bike_df[, column], main = paste("Normal Probability Plot for", column))
  qqline(bike_df[, column], col = 2)

  # Add insight annotation
  annotation <- "Some data points are deviating from normality in a good way."
  text(quantile(bike_df[, column], 1.0), quantile(bike_df[, column], 0.1), annotation, adj = c(0, 1), cex = 0.8, col = "darkgreen")
}

# Section: Correlation Analysis of in combined dataset
# Load the corrgram package for correlation analysis
library(corrgram)

# Identify numeric columns for correlation analysis
numeric_columns <- sapply(bike_df[, 8:19], is.numeric)

# Create a correlation plot
corrgram(bike_df[, 8:19][, numeric_columns], order = FALSE, upper.panel = panel.pie, text.panel = panel.txt, main = 'Correlation Plot')

# Add insight on positive and negative correlations
cat("Positive Correlations: temp, atemp, and year have positive correlations with the target variable.\n")
cat("Negative Correlations: weather_condition, humidity, and windspeed have negative correlations with the target variable.\n")

# Identify variables that may not be needed for further analysis based on correlation
cat("\nVariables with weak correlation (abs(correlation) <= 0.1) with the target variable:\n")
weak_corr_vars <- names(bike_df[, 8:19][, numeric_columns])[sapply(bike_df[, 8:19][, numeric_columns], function(x) abs(cor(x, bike_df$count)) <= 0.1)]
print(weak_corr_vars)

# Load the purrr library for functions and vectors
library(purrr)

# Split the dataset based on simple random resampling
train_index <- sample(1:nrow(bike_df), 0.7 * nrow(bike_df))
train_data <- bike_df[train_index,]
test_data <- bike_df[-train_index,]

# Display dimensions of the training and testing datasets
cat("Dimensions of Training Data:", dim(train_data), "\n")
cat("Dimensions of Testing Data:", dim(test_data), "\n")


#splitted data exploration
head(train_data)

head(test_data)

# Section: Creating Subsets for Training and Testing

# Create a new subset for train attributes
train <- subset(train_data, select = c('season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weather_condition', 'temp','atemp', 'humidity', 'windspeed', 'count'))

# Create a new subset for test attributes
test <- subset(test_data, select = c('season', 'year', 'month', 'holiday', 'weekday', 'workingday', 'weather_condition', 'temp', 'atemp','humidity', 'windspeed', 'count'))

# Display the first few rows of the training subset
cat("Training Subset:")
head(train)

# Display the first few rows of the testing subset
cat("\nTesting Subset:")
head(test)

# Section: Creating Subsets for Training and Testing

# Create a new subset for train categorical attributes
train_cat <- subset(train, select = c('season', 'holiday', 'workingday', 'weather_condition', 'year'))

# Create a new subset for test categorical attributes
test_cat <- subset(test, select = c('season', 'holiday', 'workingday', 'weather_condition', 'year'))

# Create a new subset for train numerical attributes
train_num <- subset(train, select = c('weekday', 'month', 'temp', 'atemp','humidity', 'windspeed', 'count'))

# Create a new subset for test numerical attributes
test_num <- subset(test, select = c('weekday', 'month', 'temp', 'atemp','humidity', 'windspeed', 'count'))

# Display the first few rows of the training categorical attributes subset
cat("Training Categorical Attributes Subset:")
head(train_cat)

# Display the first few rows of the testing categorical attributes subset
cat("\nTesting Categorical Attributes Subset:")
head(test_cat)


# Display the first few rows of the training numerical attributes subset
cat("\nTraining Numerical Attributes Subset:")
head(train_num)

# Display the first few rows of the testing numerical attributes subset
cat("\nTesting Numerical Attributes Subset:")
head(test_num)

# Load the required libraries
library(caret)

# Define variables for dummy encoding
othervars <- c('month', 'weekday', 'temp', 'atemp', 'humidity', 'windspeed', 'count')
set.seed(2626)

# Identify categorical variables
vars <- setdiff(colnames(train), c(train$count, othervars))
vars

# Create a formula for encoding
f <- paste('~', paste(vars, collapse = ' + '))

# Use dummyVars to encode categorical variables
encoder <- dummyVars(as.formula(f), train)
encode_attributes <- predict(encoder, train)

# Combine numerical and encoded attributes
train_encoded_attributes <- cbind(train_num, encode_attributes)

# Display the head of the encoded dataset
head(train_encoded_attributes)


# Section: Encoding Categorical Features (Test Dataset)

# Load the required libraries
library(caret)

# Define variables for dummy encoding
othervars <- c('month', 'weekday', 'temp', 'atemp', 'humidity', 'windspeed', 'count')
set.seed(5662)

# Identify categorical variables in the test dataset
vars <- setdiff(colnames(test), c(test$count, othervars))
vars

# Create a formula for encoding
f <- paste('~', paste(vars, collapse = ' + '))

# Use dummyVars to encode categorical variables
encoder <- dummyVars(as.formula(f), test)
encode_attributes <- predict(encoder, test)

# Combine numerical and encoded attributes for the test dataset
test_encoded_attributes <- cbind(test_num, encode_attributes)

# Display the head of the encoded test dataset
head(test_encoded_attributes)

# Section: Modeling the Training Dataset with Linear Regression

# Set seed to reproduce the results of random sampling
set.seed(672)

# Train the Linear Regression model
lr_model <- lm(count ~ ., data = train_encoded_attributes[, -c(6)])

# Display the summary of the Linear Regression model
summary(lr_model)

# Section: Cross Validation Prediction with Linear Regression

# Ignore warning messages
options(warn = -1)

# Set seed to reproduce results of random sampling
set.seed(623)

# Cross validation resampling method
train_control <- trainControl(method = 'cv', number = 3)

# Cross validation prediction
CV_predict <- train(count ~ .,data = train_encoded_attributes[, -c(6)],
                    method = 'lm', trControl = train_control)

# Display summary of cross validation prediction
summary(CV_predict)

# Observations from the output
cat("\n**Observations:**\n")
cat("- The model's residuals range from -3713.0 to 3165.0, indicating the prediction performance.\n")
cat("- Coefficients and p-values provide insights into variable significance.\n")
cat("- The adjusted R-squared (0.8316) indicates the model's goodness of fit.\n")
cat("- Variables with smaller p-values are considered statistically significant.\n")
cat("- Overall, the linear regression model captures relationships and offers insights into predictor importance and model fit.\n")


#Section: Cross Validation Prediction Plot with Linear Regression

# Increase the size of the plot
par(mfrow=c(1, 1), mar=c(5, 5, 2, 2))

# Cross-validation prediction plot
residuals <- resid(CV_predict)
y_train <- train_encoded_attributes$count

# Scatter plot with residuals
plot(y_train, residuals, ylab = 'Residuals', xlab = 'Observed', main = 'Cross Validation Prediction Plot', pch = 16, col = 'blue')

# Add a reference line at zero
abline(h = 0, col = 'red', lwd = 2)

# Observations from the plot
text(5000, -1500, 'Residuals spread around zero indicates a good model fit', col = 'darkgreen', cex = 0.8)
text(5000, -2500, 'Scatter points should be random and evenly distributed', col = 'darkgreen', cex = 0.8)

# Section: Model Performance on Test Data Analysis

# Set seed for reproducibility
set.seed(6872)

# Suppress warning messages
options(warn = -1)

# Predict using the lr_model on the test_encoded_attributes
lm_predictions <- predict(lr_model, test_encoded_attributes[, -c(6)])

# Display the first 10 predictions
head(lm_predictions, 10)

# Section: Model Performance Visualization using Linear Regressor Model

# Extract actual and predicted values
actual_values <- test_encoded_attributes$count
predicted_values <- lm_predictions

# Plot past actual values and future 10 predicted values
plot(1:length(actual_values), actual_values, type = "l", col = "blue",
     xlab = "Sample Index", ylab = "Count", main = "Actual vs Future Predicted Values",
     xlim = c(1, length(actual_values) + 10), ylim = c(0, max(actual_values, predicted_values)))
lines(length(actual_values) + 1:length(predicted_values), predicted_values, col = "orange")

# Add lines for future predicted values
lines(length(actual_values) + 1:length(predicted_values), predicted_values, col = "orange")

# Add legend
legend("topright", legend = c("Actual", "Predicted"), col = c("blue", "orange"), lty = 1)

# Highlight the last observed data point
points(length(actual_values), actual_values[length(actual_values)], pch = 19, col = "red")

# Highlight the starting point of predicted values
points(length(actual_values) + 1, predicted_values[1], pch = 19, col = "green")

# Section: Model Evaluation Metrics on Rootmean squred error and mean abosolute error

# Set seed for reproducibility
set.seed(688)

# Root mean squared error (RMSE)
rmse <- RMSE(lm_predictions, test_encoded_attributes$count)
print(paste("Root Mean Squared Error (RMSE):", round(rmse, 2)))


# Mean absolute error (MAE)
mae <- MAE(lm_predictions, test_encoded_attributes$count)
print(paste("Mean Absolute Error (MAE):", round(mae, 2)))

# Section: Residual Analysis

# Set heading for the code block
cat("Residual :\n")

# Calculate residuals
y_test <- test_encoded_attributes$count
residuals_lm <- y_test - lm_predictions
residuals_lm

# Create an informative residual plot
plot(y_test, residuals_lm, xlab = 'Observed Count', ylab = 'Residuals', main = 'Residual Plot',
     col = ifelse(residuals_lm >= 0, 'blue', 'red'), pch = 16, cex = 1.2)

# Add a reference line at y = 0
abline(h = 0, col = 'black', lty = 2, lw = 2)

# Add legend for positive and negative residuals
legend('topright', legend = c('Positive Residuals', 'Negative Residuals'), col = c('blue', 'red'), pch = 16)

# Interpretation:
cat("\nInterpretation:\n")
cat("The residual plot shows the difference between observed and predicted counts.\n")
cat("Positive residuals (in blue) indicate underpredictions, while negative residuals (in red) indicate overpredictions.\n")
cat("Overall, the model seems to perform well, with residuals centered around zero.\n")


# Section: Decision Tree Regressor

# Set heading for the code block
cat("Decision Tree Regressor:\n")

# Set seed for reproducibility
set.seed(568)

# Load the rpart library for decision trees
library(rpart)

# Set control parameters for rpart
rpart.control <- rpart.control(minbucket = 2, cp = 0.01, maxcompete = 3, maxsurrogate = 4,
                               usesurrogate = 2, xval = 3, surrogatestyle = 0, maxdepth = 10)

# Train the decision tree regressor model
dtr <- rpart(train_encoded_attributes$count ~ ., data = train_encoded_attributes[,-c(6)],
             control = rpart.control, method = 'anova', cp = 0.01)

# Display summary of the decision tree model
summary(dtr)

# Interpretation:
cat("\nInterpretation:\n")
cat("The decision tree regressor is trained with control parameters to control its performance.\n")
cat("The summary provides insights into the structure of the decision tree, including splits, terminal nodes, and node statistics.\n")


# Section: Visualizing Enhanced Decision Tree Model

# Set heading for the code block
cat("Visualizing Enhanced Decision Tree Model:\n")

# Load the rpart.plot library for plotting the learned decision tree model
library(rpart.plot)

# Set custom color palette for the decision tree plot
custom_palette <- c("#FF0000", "#FF6347", "#FFA500", "#008000", "#0000FF", "#4B0082")

# Plot the learned decision tree model with enhanced visualization
# Use 'col' parameter to set the color palette
rpart.plot(dtr, col = custom_palette, shadow.col = "blue", nn = TRUE, roundint = FALSE,
           main = "Enhanced Decision Tree Model", branch.lty = 3, under = TRUE, box.col = "lightblue",
           split.cex = 1.5, compress = TRUE, extra = 101, varlen = 0, tweak = 2.0)

# Interpretation:
cat("\nInterpretation:\n")
cat("The enhanced visualization of the learned decision tree model improves aesthetics and clarity.\n")
cat("Custom color palette and styling options are applied to make the plot more attractive.\n")
cat("Understanding the decision tree structure becomes more engaging and informative.\n")


# Section: Cross Validation Prediction for Decision Tree Regressor

# Set heading for the code block
cat("Cross Validation Prediction for Decision Tree Regressor:\n")

# Ignore warning messages during execution
options(warn = -1)

# Load the caret package
library(caret)

# Set seed for reproducibility of results
set.seed(5769)

# Define cross-validation resampling method
train.control <- trainControl(method = 'CV', number = 3)

# Perform cross-validation prediction using the decision tree regressor
dtr_CV_predict <- train(count ~ ., data = train_encoded_attributes, method = 'rpart', trControl = train.control)

# Display the cross-validation results
dtr_CV_predict

# Section: Cross-validation prediction plot for decision tree regression

# Create a new color palette for the plot
plot_colors <- c('skyblue', 'darkred')

# Extract predicted values from the list
predicted_counts <- as.numeric(dtr_CV_predict$pred)

# Calculate residuals
observed_counts <- as.numeric(train_encoded_attributes$count)
residuals_dr <- resid(dtr_CV_predict)
residuals_dr

# Cross-validation prediction plot
plot(observed_counts, residuals_dr,
     xlab = 'Observed Counts', ylab = 'Residuals',
     main = 'Cross Validation Residual Plot', col = plot_colors[1], pch = 16)

# Add a horizontal line at y = 0 for reference
abline(h = 0, col = 'gray', lty = 2)

# Add a smooth line to visualize trends
lines(lowess(y_train, residuals_dr), col = plot_colors[2], lwd = 2)

# Add informative labels
text(4000, 1200, 'Residuals should be randomly scattered around zero.',
     adj = c(0, 0), col = 'darkgreen', font = 1)
text(4000, -800, 'A clear pattern indicates a model misfit.',
     adj = c(0, 0), col = 'darkred', font = 1)
text(4000, 400, 'Lowess smoother line to identify trends.',
     adj = c(0, 0), col = plot_colors[2], font = 1)
text(4000, 200, 'Plot show some finite varience b/w them & for some not.',
     adj = c(0, 0), col = plot_colors[2], font = 1)
# Add legend
legend('topright', legend = c('Residuals', 'Lowess Smoother'),
       col = plot_colors, pch = 16, lwd = 2, cex = 0.8)


# Section: Model Performance on the Test Dataset

# Set seed for reproducibility
set.seed(7882)

# Predict using the trained decision tree regression model
dtr_predictions <- predict(dtr, test_encoded_attributes[,-c(6)])

# Display the first 10 predictions
head(dtr_predictions, 20)

# Section: Decision Tree Regressor - Model Performance Visualization

# Generate a random sample of 10 indices from the test dataset
sample_indices <- sample(1:nrow(test_encoded_attributes), 20)

# Extract the corresponding actual and predicted values for the sampled data
actual_values_dtr <- test_encoded_attributes$count
predicted_values_dtr <- dtr_predictions

# Plot all actual and predicted values using Decision Tree Regressor
plot(1:length(actual_values_dtr), actual_values_dtr, type = "l", col = "blue",
     xlab = "Sample Index", ylab = "Count", main = "Actual vs Predicted Values (Decision Tree Regressor)",
     xlim = c(1, length(actual_values_dtr) + 20), ylim = c(0, max(actual_values_dtr, predicted_values_dtr)))
lines(length(actual_values_dtr) + 1:length(predicted_values_dtr), predicted_values_dtr, col = "orange")

# Add legend
legend("topright", legend = c("Actual", "Predicted"), col = c("blue", "orange"), lty = 1)

# Highlight the last observed data point
points(length(actual_values_dtr), actual_values_dtr[length(actual_values_dtr)], pch = 19, col = "red")

# Highlight the starting point of predicted values
points(length(actual_values_dtr) + 1, predicted_values_dtr[1], pch = 19, col = "green")

# Add information
text(length(actual_values_dtr) + 5, max(actual_values_dtr, predicted_values_dtr)/2,
     "Actual values and Predicted values  using Decision Tree Regressor",
     pos = 1, col = "darkgreen", cex = 0.8)

#Section: Evaluation of Decision tree regressor on Root mean squared error and mean absolute error

set.seed(6889)
#Root mean squared error
rmse<-RMSE(y_test,dtr_predictions)
print(rmse)
#Mean absolute error
mae<-MAE(y_test,dtr_predictions)
print(mae)

# Section: Residual Plot for Decision Tree Regressor

# Calculate residuals for Decision Tree Regressor predictions
residuals_dtr <- y_test- dtr_predictions

# Plot the Residual plot
plot(y_test, residuals_dtr, xlab = 'Observed', ylab = 'Residuals', main = 'Residual Plot for Decision Tree Regressor', col = 'blue', pch = 16)
abline(0, 0, col = 'red', lwd = 2)

# Add informative text
text(2500, 800, "Some data points have the same finite variance between them,\nwhile for others, it may not hold true.", col = "red", cex = 0.8)

# Section: Random Forest Regression
# Load the randomForest library
library(randomForest)
# Set seed for reproducibility
set.seed(6788271)

# Train the Random Forest model
rf_model <- randomForest(count ~ ., data = train_encoded_attributes, importance = TRUE, ntree = 200)
rf_model

# Section: Cross Validation Prediction for Random Forest

# Ignore warning messages during execution
options(warn = -1)

# Set seed for reproducibility
set.seed(6772)

# Load the ranger library for Random Forest CV
library(ranger)

# Define cross-validation resampling method
train.control <- trainControl(method = 'CV', number = 3)

# Cross-validation prediction
rf_CV_predict <- train(count ~ ., data = train_encoded_attributes, method = 'ranger', trControl = train.control)
rf_CV_predict

cat("The Random Forest model was trained on 511 samples with 19 predictors. The training utilized cross-validated resampling with 3 folds, resulting in sample sizes of 340, 342, and 340 across the folds. The model was tuned with different values for the number of variables randomly sampled at each split (mtry) and the split rule. The final model, selected based on the smallest Root Mean Squared Error (RMSE), had the parameters mtry = 18, splitrule = extratrees, and min.node.size = 5. The performance metrics for the selected model were RMSE = 733.25, Rsquared = 0.8505, and MAE = 503.57, indicating a good fit to the data.\n")

# Section: Cross Validation Prediction Plot for Random Forest

# Extracting residuals from Random Forest cross-validation predictions
residuals <- resid(rf_CV_predict)

# Plotting the Cross Validation Prediction Plot
plot(y_train, residuals, xlab = 'Observed', ylab = 'Residuals',
     main = 'Cross Validation Prediction Plot for Random Forest', col = 'blue', pch = 16, cex.main = 1.2)

# Adding a reference line at y = 0
abline(0, 0, col = 'red', lwd = 2)

# Adding informative labels and title
text(6000, 800, "Residuals from Random Forest cross-validation predictions",
     col = "darkgreen", cex = 0.8)

# Adding grid lines for better readability
grid()

# Adding a legend
legend('topright', legend = 'Residuals', col = 'blue', pch = 16, cex = 0.8)

# Adding more context to the observation
text(5500, 500, "Observation: The residuals should be randomly scattered around the reference line at y = 0.",
     col = "purple", cex = 0.8)
text(5500, 400, "Patterns or trends in the residuals may suggest areas where the model can be improved.",
     col = "purple", cex = 0.8)

cat("Observation: The residuals should ideally be randomly scattered around the reference line at y = 0,\n")
cat("indicating unbiased predictions. Patterns or trends in the residuals may suggest areas where the model can be improved.\n")

# Section: Model Performance on Test Data for Random Forest

# Set seed for reproducibility
set.seed(7889)

# Predict the Random Forest model on the test dataset
rf_predictions <- predict(rf_model, test_encoded_attributes[,-c(7)])

# Display the first 10 predictions
head(rf_predictions, 10)

#Section: Model performation visualisation based on random forest regressor

# Generate a random sample of 10 indices from the test dataset
sample_indices <- sample(1:nrow(test_encoded_attributes), 10)

# Extract the corresponding actual and predicted values for the sampled data
actual_values_rf <- test_encoded_attributes$count
predicted_values_rf <- rf_predictions

# Plot all actual and predicted values using Random forest Regressor
plot(1:length(actual_values_rf), actual_values_rf, type = "l", col = "blue",
     xlab = "Sample Index", ylab = "Count", main = "Actual vs Predicted Values (Random forest Regressor)",
     xlim = c(1, length(actual_values_rf) + 10), ylim = c(0, max(actual_values_rf, predicted_values_rf)))
lines(length(actual_values_rf) + 1:length(predicted_values_rf), predicted_values_rf, col = "orange")

# Add legend
legend("topright", legend = c("Actual", "Predicted"), col = c("blue", "orange"), lty = 1)

# Highlight the last observed data point
points(length(actual_values_rf), actual_values_rf[length(actual_values_rf)], pch = 19, col = "red")

# Highlight the starting point of predicted values
points(length(actual_values_rf) + 1, predicted_values_rf[1], pch = 19, col = "green")

# Add information
text(length(actual_values_rf) + 5, max(actual_values_rf, predicted_values_rf)/2,
     "Actual values and Predicted values  using Random forest Regressor",
     pos = 1, col = "darkgreen", cex = 0.8)

# Section: Model Performance Evaluation of Random Forest regressor based on RMSE and MAE

# Set seed for reproducibility
set.seed(667)

# Root Mean Squared Error (RMSE) calculation
rmse <- RMSE(y_test, rf_predictions)
print(rmse)

# Mean Absolute Error (MAE) calculation
mae <- MAE(y_test, rf_predictions)
print(mae)

# Section: Residual Plot for Random Forest Regressor

# Calculate residuals for Random Forest predictions
residuals <- y_test - rf_predictions

# Plot the Residual Plot
plot(y_test, residuals,
     xlab = 'Observed', ylab = 'Residuals',
     main = 'Residual Plot for Random Forest Regressor', col = 'blue', pch = 16)

# Add a reference line at y = 0
abline(0, 0, col = 'red', lty = 2)

# Enhance plot aesthetics
grid(col = "lightgray")
title(main = "Residual Plot for Random Forest Regressor", col.main = "darkgreen", font.main = 2)
axis(side = 1, col = "darkblue", col.axis = "darkblue")
axis(side = 2, col = "darkblue", col.axis = "darkblue")

# Add informative labels
text(3000, 1000, "The plot shows the residuals from the Random Forest predictions.",
     col = "red", cex = 0.8)

# Add observations
cat("Observation: In an ideal scenario, residuals should be randomly scattered around the reference line at y = 0.\n")
cat("Patterns or trends in the residuals may suggest areas where the model can be improved.\n")

#Section: Selecting best Model in all three for futher prediction

# Calculate RMSE and MAE for each model
lm_rmse <- RMSE(lm_predictions, test_encoded_attributes$count)
lm_mae <- MAE(lm_predictions, test_encoded_attributes$count)

dtr_rmse <- RMSE(dtr_predictions, test_encoded_attributes$count)
dtr_mae <- MAE(dtr_predictions, test_encoded_attributes$count)

rf_rmse <- RMSE(rf_predictions, test_encoded_attributes$count)
rf_mae <- MAE(rf_predictions, test_encoded_attributes$count)

# Create an accuracy table
accuracy_table <- data.frame(
  Model = c("Linear Regression", "Decision Tree Regressor", "Random Forest Regressor"),
  RMSE = c(lm_rmse, dtr_rmse, rf_rmse),
  MAE = c(lm_mae, dtr_mae, rf_mae)
)

# Find the best model based on the minimum RMSE and MAE
best_model <- accuracy_table[which.min(accuracy_table$RMSE + accuracy_table$MAE), ]


# Print the accuracy table
print(accuracy_table)

# Print the best model
print("Best Model:")
print(best_model)

# Section: Selecting Final Model as Random Forest Regressor for Prediction of Bike Rental Count

# Combine observed and predicted values
Bike_predictions <- data.frame(Observed = y_test, Predicted = rf_predictions)

# Write predictions to a CSV file
write.csv(Bike_predictions, 'Bike_Renting_Predictions.csv', row.names = FALSE)

# Display the predictions
Bike_predictions


# Conclusion: Random Forest Model Outperforms
cat("When comparing RMSE and MAE of all 3 models, the random forest model shows the least errors.\n")
cat("Thus, the random forest model is considered the best for predicting daily bike rental counts.\n")