# Import two modules from SciPy package from scipy import interpolate, stats # Also import other useful libraries import numpy as np import pandas as pd from datetime import datetime, timedelta import matplotlib.pyplot as plt # Sample noisy data with a trend x = np.arange(0,10,0.1) y = 5*x - 25 + np.random.normal(0,25,size=100) # Plot sample data plt.figure(figsize=(7,4)) # dpi=300 plt.scatter(x,y,c='k',zorder=2) plt.plot(x,y,c='k',zorder=3) plt.grid(zorder=1) plt.xlabel('x') plt.ylabel('y'); # Plot sample data with trend line plt.figure(figsize=(7,4)) plt.scatter(x,y,c='k',zorder=2) plt.plot(x,y,c='k',zorder=3) plt.plot(x,5*x - 25,c='r',zorder=4) plt.grid(zorder=1) plt.xlabel('x') plt.ylabel('y'); # Sample noisy data with a quadratic trend x = np.arange(0,10,0.1) y_quad = 4*(x-5)**2 + 5*x - 25 + np.random.normal(0,25,size=100) # Plot sample quadratic data with trend line plt.figure(figsize=(7,4)) plt.scatter(x,y_quad,c='k',zorder=2) plt.plot(x,y_quad,c='k',zorder=3) plt.plot(x,6*x,c='r',zorder=4) plt.grid(zorder=1) plt.xlabel('x') plt.ylabel('y'); # Sample noisy data with outliers and a trend x = np.arange(0,10,0.1) y_outlier = 25*x - 150 + np.random.normal(0,25,size=100) y_outlier[-10:] -= 200 # Plot sample data with trend line m, b = stats.linregress(x,y_outlier)[0:2] plt.figure(figsize=(7,4)) plt.scatter(x,y_outlier,c='k',zorder=2) plt.plot(x,y_outlier,c='k',zorder=3) plt.plot(x,m*x + b,c='r',zorder=4,label='Regression') plt.plot(x,25*x - 150,'r--',zorder=4,label='Regression without outliers') plt.legend() plt.grid(zorder=1) plt.xlabel('x') plt.ylabel('y'); # Linear regression on original noisy data slope, intercept, rvalue, pvalue, stderr = stats.linregress(x,y) print('The slope is',round(slope,2)) print('The y-intercept is',round(intercept,2)) print('The r-value is',round(rvalue,2)) print('The p-value is',pvalue) print('The standard error is',round(stderr,2)) # This is how you can make an array of Datetime objects useable in a regression import matplotlib.dates as mdates t = np.array([datetime(2020,1,1),datetime(2020,2,1),datetime(2020,3,1)]) t_as_numbers = mdates.date2num(t) print(t_as_numbers) # Monthly climatological high temperatures for Seattle mid_months = np.array([datetime(2020,mo,15) for mo in np.arange(1,13)]) # months 1 through 12 high_t = np.array([45,48,52,58,64,69,72,73,67,59,51,47]) # units: °F # Plot original data plt.figure(figsize=(7,4)) plt.plot(mid_months,high_t,c='k',lw=2,zorder=2) plt.scatter(mid_months,high_t,c='k',zorder=3) plt.ylabel('Temperature (°F)') plt.xlabel('Month') plt.title('Climatological high temperatures in Seattle'); plt.grid(alpha=0.5) plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b')) # Interpolation to 1st of each month start_months = np.array([datetime(2020,mo,1) for mo in np.arange(1,13)]) interp_func = interpolate.interp1d(mdates.date2num(mid_months),high_t, fill_value='extrapolate',bounds_error=False) high_t_interpolated = interp_func(mdates.date2num(start_months)) # Plot interpolated data plt.figure(figsize=(7,4)) plt.plot(mid_months,high_t,c='k',lw=2,zorder=2,label='Original') plt.scatter(mid_months,high_t,c='k',zorder=3) plt.plot(start_months,high_t_interpolated,c='green',ls='--',lw=2,zorder=4,label='Interpolated') plt.scatter(start_months,high_t_interpolated,c='green',zorder=5) plt.ylabel('Temperature (°F)') plt.xlabel('Month') plt.title('Climatological high temperatures in Seattle') plt.legend() plt.grid(alpha=0.5) plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%b'))