#!/usr/bin/env python
# coding: utf-8

# # Yearly Global Average CO2 Concentrations in parts per million (ppm) and Linear Regression
# 
# AIM: To practice Linear Regression using Python code.
# DATA: The dataset includes monthly mean carbon dioxide globally averaged over marine surface sites for the span 1980-2020. 
# 
# Data Source: National Oceanic and Atmospheric Administration (NOAA)
# https://gml.noaa.gov/ccgg/trends/global.html
# 
# 
# 

# In[6]:


#Import necessary librarires of Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn import metrics
import math
import sklearn
import statistics


# In[2]:


print("Yearly Global Average CO2 Concentrations in parts per million (ppm) and Linear Regression")


# In[7]:


#Read the Dataset
df=pd.read_csv('global-atm-co2.csv')

#Know the basics of the dataset
print (df.head(10))       # display first 10 entries
print(df.shape)           # display the dimensions of the dataset (rows and columns)
print(df.columns.values)  #display columns names
df.info()                 # display data types and memory usage


# In[5]:


#Scatter plot : Plot the scatter plot of yearly average_co2_concentrations variable 

df.plot.scatter(x="year",y="average_co2_concentrations")
plt.xlabel('Year') 
plt.ylabel('Global Average CO2 Concentrations (ppm)') 
plt.title ('Yearly Global Average CO2 Concentrations in parts per million (ppm)')
plt.show()


# ## Linear Regression
# Let us try to fit a Line to the data. 
# Equation of a line is y = b0 + b1*x, where b0 is Y-intercept and b1 is the slope.

# In[8]:


# Use NumPy library to convert the DataFrame to NumPy Array which would be used in the further steps. 
x=[]
y=[]
x=df['year'].to_numpy()
y=df['average_co2_concentrations'].to_numpy()
n = np.size(x)  # number of observations/points 


# In[24]:


# Function: Calculate Regression Coefficients : b0 is Y-intercept and b1 is slope for a Regression Line b0 + b1*x  
def estimate_coef(x, y): 
    
   # mean of x and y vector 
   m_x, m_y = np.mean(x), np.mean(y) 
 
   # calculating cross-deviation and deviation about x 
   SS_xy = np.sum(y*x) - n*m_y*m_x 
   SS_xx = np.sum(x*x) - n*m_x*m_x 
 
  
   b_1 = SS_xy / SS_xx 
   b_0 = m_y - b_1*m_x 
 
   return(b_0, b_1) 
 
# Function: Plot the scatter plot and Regression Line as per the predicted coefficients
def plot_regression_line(x, y, b): 
   # plotting the actual points as scatter plot 
   plt.scatter(x, y, color = "m", 
              marker = "o", s = 30) 
 
   # predicted response vector 
   y_pred = b[0] + b[1]*x 
     
   # plot the regression line 
   plt.plot(x, y_pred, color = "g") 
 
   # prepare and render the scatter plot 
   plt.xlabel('Year') 
   plt.ylabel('Global Average CO2 Concentrations (ppm)') 
   plt.title ('Yearly Global Average CO2 Concentrations in parts per million (ppm) and Linear Regression')     
   plt.show() 

# Function: Calculate RMSE (Root Mean-Squared Error values)    
def rmse(b,y):
   predict=[]
   for i in range(0,n):
       predict.append(b[0]+b[1]*x[i])
   predict=np.array(predict)    
   mse = sklearn.metrics.mean_squared_error(y, predict)
   root_mse = math.sqrt(mse)                       # RMSE value
   nrmse = root_mse/statistics.mean(y)             # Normalized RMSE value
   return(root_mse,nrmse)

# Function: Call the functions in a particular order
def main(x,y): 
   # Estimate Regression Coefficients 
   b = estimate_coef(x, y) 
   print("Estimated coefficients of the line y = b0 + b1*x are:\nb0 = {}   \nb1 = {}".format(b[0], b[1])) 
     
   # Plot regression line 
   residual_error = rmse(b,y)
   print("RMSE VALUE is",residual_error[0])
   print("Normalized RMSE VALUE is",residual_error[1])
   plot_regression_line(x, y, b)

#Call the main function    
if __name__ == "__main__": 
        main(x,y)

#EoF


# Root Mean Square Error,RMSE, is the standard deviation of the residuals (prediction errors).
# Residuals are a measure of how far from the regression line data points are.
# RMSE is a measure of how spread out these residuals are. It tells us how concentrated the data is around the line of best fit.

# In[ ]: