"""Exploratory plots for the Seoul bike-sharing CSV.

The script loads the data set once and then draws, in order: two
temperature-vs-count scatter plots (the second with the summer points
encircled), a pair plot, an area chart of the relative change in rented
bikes over the first 100 rows, a bar chart and a lollipop chart of the mean
count per season, density plots by season and by holiday flag, a box plot,
a pie chart of observations per season, an hourly mean with a 95% band and
a parallel-coordinates plot.
"""
# Import libraries
import warnings
# NOTE(review): D_FMT is not used anywhere in the visible code; kept only
# because other parts of the file may rely on it.
from locale import D_FMT

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import patches
from pandas.plotting import parallel_coordinates
from scipy.spatial import ConvexHull
from scipy.stats import sem

warnings.filterwarnings("ignore")

DATA_DIR = "/content/sample_data/SeoulBikeData.csv"
TEMP_COL = "Temperature(°C)"
COUNT_COL = "Rented Bike Count"
SEASON_COL = "Seasons"


def _category_colors(labels):
    """Return one tab10 colour per label, spaced evenly over the colormap."""
    # max(..., 1) avoids a 0/0 division when there is only one category.
    span = max(len(labels) - 1, 1)
    return [plt.cm.tab10(i / float(span)) for i in range(len(labels))]


def _scatter_by_season(frame, labels, palette, **scatter_kw):
    """Scatter temperature against rented bikes, one colour per season."""
    for colour, season in zip(palette, labels):
        # `color=` (not `c=`): a single RGBA tuple passed as `c` is ambiguous
        # to matplotlib, which may read it as four scalar values.
        plt.scatter(TEMP_COL, COUNT_COL,
                    data=frame.loc[frame[SEASON_COL] == season, :],
                    s=20, color=colour, label=str(season), **scatter_kw)


def _decorate_temp_vs_count(frame, title):
    """Apply limits, labels, tick sizes, title and legend to the scatter."""
    plt.gca().set(xlim=(frame[TEMP_COL].min(), frame[TEMP_COL].max()),
                  ylim=(frame[COUNT_COL].min(), frame[COUNT_COL].max()),
                  xlabel='Temperature(°C) ', ylabel='Rented Bike Count')
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title(title, fontsize=15)
    plt.legend(fontsize=12, loc="best")


def _mean_count_by_season(frame):
    """Return the mean rented-bike count per season, sorted ascending.

    Only the numeric count column is averaged; averaging the whole group
    frame (which still carries the string season column) raises on
    pandas >= 2.0.
    """
    return frame.groupby(SEASON_COL)[COUNT_COL].mean().sort_values().reset_index()


# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
data = pd.read_csv(DATA_DIR, encoding='unicode_escape')
# A bare `data.head()` only renders as the last expression of a notebook
# cell; print it so the preview also shows when run as a script.
print(data.head())

print("---------------------------------------------------------------------------------")
print(f"There are {data.shape[0]} observations and {data.shape[1]} features ")
print("---------------------------------------------------------------------------------")

data.info()

# ---------------------------------------------------------------------------
# Scatter plot: temperature vs rented bike count, coloured by season
# ---------------------------------------------------------------------------
# As many colours as there are unique seasons
categories = np.unique(data[SEASON_COL])
colors = _category_colors(categories)

plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
_scatter_by_season(data, categories, colors)
_decorate_temp_vs_count(
    data, "Scatterplot of Temperature(°C) vs Rented Bike Count")
plt.show()

# ---------------------------------------------------------------------------
# Same scatter with the summer observations encircled
# ---------------------------------------------------------------------------
fig = plt.figure(figsize=(10, 8), dpi=80, facecolor='w', edgecolor='k')
_scatter_by_season(data, categories, colors,
                   edgecolors='black', linewidths=.5)


# https://stackoverflow.com/questions/44575681/how-do-i-encircle-different-data-sets-in-scatter-plot
def encircle(x, y, ax=None, **kw):
    """Draw the convex hull of the points (x, y) as a polygon patch.

    Args:
        x, y: Equal-length coordinate sequences.
        ax: Axes to draw on; defaults to the current axes.
        **kw: Forwarded to ``plt.Polygon`` (edge/face colour, alpha, ...).
    """
    if ax is None:
        ax = plt.gca()
    points = np.c_[x, y]
    hull = ConvexHull(points)
    ax.add_patch(plt.Polygon(points[hull.vertices, :], **kw))


# Select data to be encircled
data_encircle = data.loc[data[SEASON_COL] == 'Summer', :]

# Draw the hull twice: a translucent fill, then a crisp outline on top
encircle(data_encircle[TEMP_COL], data_encircle[COUNT_COL],
         ec="k", fc="gold", alpha=0.1)
encircle(data_encircle[TEMP_COL], data_encircle[COUNT_COL],
         ec="firebrick", fc="none", linewidth=1.5)

_decorate_temp_vs_count(data, "Bubble Plot with Encircling")
plt.show()

# ---------------------------------------------------------------------------
# Pair plot
# ---------------------------------------------------------------------------
# sns.pairplot creates its own figure, so no plt.figure() call here
# (one would only leave an empty figure behind).
sns.pairplot(data[["Rented Bike Count", "Temperature(°C)", "Seasons", "Wind speed (m/s)"]],
             kind="scatter", hue="Seasons",
             plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
plt.show()

# ---------------------------------------------------------------------------
# Area chart: relative change of rented bikes over the first 100 rows
# ---------------------------------------------------------------------------
# NOTE(review): dayfirst=True assumes the CSV stores dates as dd/mm/yyyy;
# confirm against the file. Without it pandas guesses month-first.
data_df = pd.read_csv(DATA_DIR, encoding='unicode_escape',
                      parse_dates=['Date'], dayfirst=True).head(100)
x = np.arange(data_df.shape[0])
counts = data_df[COUNT_COL]
# Percentage change versus the previous row. A previous count of 0 yields
# +/-inf, which is mapped to 0 together with the undefined first row.
y_returns = ((counts.diff() / counts.shift(1))
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0) * 100)
returns_tail = y_returns.to_numpy()[1:]

# One label per row: day-month plus the value of the 'Hour' column.
xtickvals = [f"{d:%d-%b}".upper() + f" {int(h):02d}h"
             for d, h in zip(data_df.Date, data_df.Hour)]

plt.figure(figsize=(10, 8), dpi=80)
plt.fill_between(x[1:], returns_tail, 0, where=returns_tail >= 0,
                 facecolor='green', interpolate=True, alpha=0.7)
plt.fill_between(x[1:], returns_tail, 0, where=returns_tail <= 0,
                 facecolor='red', interpolate=True, alpha=0.7)

# Annotate the actual maximum instead of a hard-coded point and date
peak_val = 0.0
if returns_tail.size:
    peak_pos = int(np.argmax(returns_tail)) + 1
    peak_val = float(y_returns.iloc[peak_pos])
    # Put the text box on whichever side of the peak has more room.
    text_dx = -90 if peak_pos > len(x) / 2 else 40
    plt.annotate(f"Peak \n {xtickvals[peak_pos]}",
                 xy=(peak_pos, peak_val),
                 xytext=(text_dx, 20), textcoords='offset points',
                 bbox=dict(boxstyle='square', fc='firebrick'),
                 arrowprops=dict(facecolor='steelblue', shrink=0.05),
                 fontsize=15, color='white')

# Decorations
plt.gca().set_xticks(x[::6])
plt.gca().set_xticklabels(xtickvals[::6], rotation=90,
                          fontdict={'horizontalalignment': 'center',
                                    'verticalalignment': 'center_baseline'})
# Keep the original window but grow it if the peak would be cut off.
plt.ylim(-100, max(350, peak_val * 1.15))
plt.xlim(1, max(len(x), 2))
plt.title("Rented Bike Count Return", fontsize=15)
plt.ylabel('Hourly Rented Bike Count Return (%)')
plt.grid(alpha=0.5)
plt.show()

# ---------------------------------------------------------------------------
# Bar chart: mean rented bike count per season
# ---------------------------------------------------------------------------
df = _mean_count_by_season(data)

fig, ax = plt.subplots(figsize=(10, 8), facecolor='white', dpi=80)
# Very thick vlines stand in for bars
ax.vlines(x=df.index, ymin=0, ymax=df[COUNT_COL],
          color='firebrick', alpha=0.7, linewidth=20)

# Annotate Text
for i, rbc in enumerate(df[COUNT_COL]):
    ax.text(i, rbc + 10, round(rbc, 1),
            horizontalalignment='center', fontsize=10)

# Title, Label, Ticks and Ylim
ax.set_title('Bar Chart for Rented Bike Count', fontdict={'size': 15})
ax.set(ylabel='Rented Bike Count', ylim=(0, 1500))
plt.xticks(df.index, df.Seasons.str.upper(), rotation=60,
           horizontalalignment='right', fontsize=12)

# Add patches to color the X axis labels (figure coordinates)
p1 = patches.Rectangle((.57, -0.005), width=.33, height=.13, alpha=.1,
                       facecolor='green', transform=fig.transFigure)
p2 = patches.Rectangle((.124, -0.005), width=.446, height=.13, alpha=.1,
                       facecolor='red', transform=fig.transFigure)
fig.add_artist(p1)
fig.add_artist(p2)
plt.show()

# ---------------------------------------------------------------------------
# Lollipop chart: mean rented bike count per season
# ---------------------------------------------------------------------------
df = _mean_count_by_season(data)

fig, ax = plt.subplots(figsize=(10, 8), dpi=80)
ax.vlines(x=df.index, ymin=0, ymax=df[COUNT_COL],
          color='firebrick', alpha=0.7, linewidth=2)
ax.scatter(x=df.index, y=df[COUNT_COL], s=80, color='firebrick', alpha=0.7)

# Title, Label, Ticks and Ylim
ax.set_title('Lollipop Chart for Rented Bike Count', fontdict={'size': 15})
ax.set_ylabel('Rented Bike Count')
ax.set_xticks(df.index)
ax.set_xticklabels(df.Seasons.str.upper(), rotation=60,
                   fontdict={'horizontalalignment': 'right', 'size': 12})
ax.set_ylim(0, 1200)

# Annotate Text
for i, rbc in enumerate(df[COUNT_COL]):
    ax.text(i, rbc + 15, round(rbc, 1), horizontalalignment='center',
            verticalalignment='bottom', fontsize=12)
plt.show()

# ---------------------------------------------------------------------------
# Density plot of rented bike count by season
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 8), dpi=80)
season_palette = (("Winter", "g"), ("Spring", "deeppink"),
                  ("Summer", "dodgerblue"), ("Autumn", "orange"))
for season, colour in season_palette:
    # `fill=True` replaces the deprecated `shade=True`.
    sns.kdeplot(data.loc[data[SEASON_COL] == season, COUNT_COL],
                fill=True, color=colour, label=season, alpha=.7)

# Decoration
plt.title('Density Plot of Rented Bike Count by Seasons', fontsize=15)
plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Histogram + density of rented bike count by holiday flag
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 8), dpi=80)
for holiday_flag, colour in (("No Holiday", "dodgerblue"), ("Holiday", "orange")):
    # histplot(kde=True, stat="density") replaces the deprecated distplot.
    sns.histplot(data.loc[data['Holiday'] == holiday_flag, COUNT_COL],
                 stat="density", kde=True, color=colour, label=holiday_flag,
                 alpha=.7, line_kws={'linewidth': 3})
plt.ylim(0, 0.004)

# Decoration
plt.title('Density Plot of Rented Bike Count by Holiday', fontsize=15)
plt.legend()
plt.show()

# ---------------------------------------------------------------------------
# Box plot of rented bike count by season
# ---------------------------------------------------------------------------
plt.figure(figsize=(10, 8), dpi=80)
sns.boxplot(x='Seasons', y='Rented Bike Count', data=data, notch=False)


# Add N Obs inside boxplot (optional)
def add_n_obs(df, group_col, y):
    """Write the number of observations of each group just above its median.

    Must be called after the box plot is drawn: groups are taken from the
    current axes' x tick labels. Medians and counts are looked up by label,
    so the text matches the box regardless of the order the boxes appear in.

    Args:
        df: Frame that was plotted.
        group_col: Column used on the x axis.
        y: Column used on the y axis.
    """
    grouped = df.groupby(group_col)[y]
    medians = grouped.median()
    sizes = grouped.size()
    for pos, tick in enumerate(plt.gca().get_xticklabels()):
        group = tick.get_text()
        plt.text(pos, medians[group] * 1.01, "#obs : " + str(sizes[group]),
                 horizontalalignment='center', fontdict={'size': 14},
                 color='white')


add_n_obs(data, group_col='Seasons', y='Rented Bike Count')

# Decoration
plt.title('Box Plot of Rented Bike Count by Seasons', fontsize=15)
plt.ylim(-400, 3700)
plt.show()

# ---------------------------------------------------------------------------
# Pie chart: number of observations per season
# ---------------------------------------------------------------------------
fig, ax = plt.subplots(figsize=(12, 7), subplot_kw=dict(aspect="equal"), dpi=80)

# Prepare Data
df = data.groupby('Seasons').size().reset_index(name='counts')
data_df = df['counts']
categories = df['Seasons']
# Pull out the last wedge only, whatever the number of seasons is.
explode = [0.1 if i == len(data_df) - 1 else 0 for i in range(len(data_df))]


def func(pct, allvals):
    """Format a wedge label as 'percent% (absolute )'.

    The absolute count is rebuilt from the percentage, so it is rounded
    rather than truncated: float error could otherwise turn e.g. 2159.9999
    into 2159.
    """
    absolute = int(round(pct / 100. * np.sum(allvals)))
    return "{:.1f}% ({:d} )".format(pct, absolute)


wedges, texts, autotexts = ax.pie(data_df,
                                  autopct=lambda pct: func(pct, data_df),
                                  textprops=dict(color="w"),
                                  colors=plt.cm.Dark2.colors,
                                  startangle=140,
                                  explode=explode)

# Decoration
ax.legend(wedges, categories, title="Rented Bike Count by Seasons",
          loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.setp(autotexts, size=10, weight=700)
ax.set_title("Seasons: Pie Chart")
plt.show()

# ---------------------------------------------------------------------------
# Mean rented bike count per hour with a 95% band
# ---------------------------------------------------------------------------
data_mean = data.groupby('Hour')[COUNT_COL].mean()
# 1.96 standard errors on each side of the mean
data_se = data.groupby('Hour')[COUNT_COL].apply(sem).mul(1.96)

plt.figure(figsize=(10, 8), dpi=80)
plt.ylabel("# Rented Bike Count", fontsize=16)
x = data_mean.index
plt.plot(x, data_mean, color="white", lw=2)
plt.fill_between(x, data_mean - data_se, data_mean + data_se, color="#3F5D7D")

# Decorations
# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(1)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(1)
plt.xticks(x[::2], [str(d) for d in x[::2]], fontsize=12)
plt.title("Bike Rented by Hour of Day (95% confidence)", fontsize=15)
plt.xlabel("Hour of Day")

# Freeze the automatically chosen x limits
s, e = plt.gca().get_xlim()
plt.xlim(s, e)

# Horizontal guide lines at the y ticks. Fixed positions 8..18 only made
# sense for the data of the template this was copied from.
plt.grid(axis="y", color='black', alpha=0.5, linestyle="--", linewidth=0.5)
plt.show()

# ---------------------------------------------------------------------------
# Parallel coordinates of the numeric columns, coloured by season
# ---------------------------------------------------------------------------
plt.figure(figsize=(12, 9), dpi=80)
# Still dropped in place (later code may rely on it); errors="ignore" lets
# this section be re-run without a KeyError once the columns are gone.
data.drop(['Date', 'Holiday', 'Functioning Day'], axis=1, inplace=True,
          errors="ignore")
parallel_coordinates(data, 'Seasons', colormap='Dark2')

# Lighten borders
plt.gca().spines["top"].set_alpha(0)
plt.gca().spines["bottom"].set_alpha(.3)
plt.gca().spines["right"].set_alpha(0)
plt.gca().spines["left"].set_alpha(.3)

plt.title('Parallel Coordinates of Seoul Bike Data', fontsize=15)
plt.grid(alpha=0.3)
plt.xticks(fontsize=12, rotation=60)
plt.yticks(fontsize=12)
plt.show()