#!/usr/bin/env python
# coding: utf-8

# **Chapter 1 – The Machine Learning landscape**
#
# _This is the code used to generate some of the figures in chapter 1._
#
# **Warning**: this is the code for the 1st edition of the book. Please visit https://github.com/ageron/handson-ml2 for the 2nd edition code, with up-to-date notebooks using the latest library versions.

# # Setup

# First, let's make sure this notebook works well in both Python 2 and 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:

# In[1]:


# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "fundamentals"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)


# # Code example 1-1

# This function just merges the OECD's life satisfaction data and the IMF's GDP per capita data. It's a bit long and boring, and it's not specific to Machine Learning, which is why I left it out of the book.

# In[2]:


def prepare_country_stats(oecd_bli, gdp_per_capita):
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"]=="TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
    gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
    gdp_per_capita.set_index("Country", inplace=True)
    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    full_country_stats.sort_values(by="GDP per capita", inplace=True)
    remove_indices = [0, 1, 6, 8, 33, 34, 35]
    keep_indices = list(set(range(36)) - set(remove_indices))
    return full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]
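# The next cell is an added illustration, not from the book: it walks through the same pivot-and-merge steps as `prepare_country_stats()` on a tiny made-up dataset (the country names and values below are invented for demonstration only).

# In[ ]:


import pandas as pd

# Long-format OECD-style table: one row per (country, indicator) pair
toy_bli = pd.DataFrame({
    "Country": ["Atlantis", "Atlantis", "Utopia"],
    "Indicator": ["Life satisfaction", "Homicide rate", "Life satisfaction"],
    "Value": [7.3, 0.4, 6.1],
})
# Pivot: one row per country, one column per indicator
toy_bli = toy_bli.pivot(index="Country", columns="Indicator", values="Value")

# IMF-style table indexed by country
toy_gdp = pd.DataFrame({
    "Country": ["Atlantis", "Utopia"],
    "GDP per capita": [50000, 22000],
}).set_index("Country")

# Merge the two tables on their country index, as prepare_country_stats() does
toy_stats = pd.merge(left=toy_bli, right=toy_gdp, left_index=True, right_index=True)
print(toy_stats)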
# The code in the book expects the data files to be located in the current directory. I just tweaked it here to fetch the files from `datasets/lifesat` instead.

# In[3]:


import os
datapath = os.path.join("datasets", "lifesat", "")


# In[4]:


# Download the data
import urllib.request
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
os.makedirs(datapath, exist_ok=True)
for filename in ("oecd_bli_2015.csv", "gdp_per_capita.csv"):
    print("Downloading", filename)
    url = DOWNLOAD_ROOT + "datasets/lifesat/" + filename
    urllib.request.urlretrieve(url, datapath + filename)


# In[5]:


# Code example
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Load the data
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")

# Prepare the data
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Visualize the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()

# Select a linear model
model = sklearn.linear_model.LinearRegression()

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = [[22587]]  # Cyprus' GDP per capita
print(model.predict(X_new))  # outputs [[ 5.96242338]]


# # Note: you can ignore the rest of this notebook; it just generates many of the figures in chapter 1.

# # Load and prepare Life satisfaction data

# If you want, you can get fresh data from the OECD's website.
# Download the CSV from http://stats.oecd.org/index.aspx?DataSetCode=BLI
# and save it to `datasets/lifesat/`.

# In[6]:


oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
oecd_bli = oecd_bli[oecd_bli["INEQUALITY"]=="TOT"]
oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
oecd_bli.head(2)


# In[7]:


oecd_bli["Life satisfaction"].head()


# # Load and prepare GDP per capita data

# Just like above, you can update the GDP per capita data if you want. Just download the data from http://goo.gl/j1MSKe (=> imf.org) and save it to `datasets/lifesat/`.

# In[8]:


gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")
gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
gdp_per_capita.set_index("Country", inplace=True)
gdp_per_capita.head(2)


# In[9]:


full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                              left_index=True, right_index=True)
full_country_stats.sort_values(by="GDP per capita", inplace=True)
full_country_stats


# In[10]:


full_country_stats[["GDP per capita", 'Life satisfaction']].loc["United States"]


# In[11]:


remove_indices = [0, 1, 6, 8, 33, 34, 35]
keep_indices = list(set(range(36)) - set(remove_indices))

sample_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]
missing_data = full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[remove_indices]
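# Added sanity check, not from the original notebook: confirm how the full table was split into the partial sample used in the plots and the seven held-out countries.

# In[ ]:


print("Countries kept in sample_data:", len(sample_data))
print("Countries set aside in missing_data:", list(missing_data.index))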
# In[12]:


sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5,3))
plt.axis([0, 60000, 0, 10])
position_text = {
    "Hungary": (5000, 1),
    "Korea": (18000, 1.7),
    "France": (29000, 2.4),
    "Australia": (40000, 3.0),
    "United States": (52000, 3.8),
}
for country, pos_text in position_text.items():
    pos_data_x, pos_data_y = sample_data.loc[country]
    country = "U.S." if country == "United States" else country
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    plt.plot(pos_data_x, pos_data_y, "ro")
save_fig('money_happy_scatterplot')
plt.show()


# In[13]:


sample_data.to_csv(os.path.join("datasets", "lifesat", "lifesat.csv"))


# In[14]:


sample_data.loc[list(position_text.keys())]


# In[15]:


import numpy as np

sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5,3))
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 1000)
plt.plot(X, 2*X/100000, "r")
plt.text(40000, 2.7, r"$\theta_0 = 0$", fontsize=14, color="r")
plt.text(40000, 1.8, r"$\theta_1 = 2 \times 10^{-5}$", fontsize=14, color="r")
plt.plot(X, 8 - 5*X/100000, "g")
plt.text(5000, 9.1, r"$\theta_0 = 8$", fontsize=14, color="g")
plt.text(5000, 8.2, r"$\theta_1 = -5 \times 10^{-5}$", fontsize=14, color="g")
plt.plot(X, 4 + 5*X/100000, "b")
plt.text(5000, 3.5, r"$\theta_0 = 4$", fontsize=14, color="b")
plt.text(5000, 2.6, r"$\theta_1 = 5 \times 10^{-5}$", fontsize=14, color="b")
save_fig('tweaking_model_params_plot')
plt.show()


# In[16]:


from sklearn import linear_model

lin1 = linear_model.LinearRegression()
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
lin1.fit(Xsample, ysample)
t0, t1 = lin1.intercept_[0], lin1.coef_[0][0]
t0, t1


# In[17]:


sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5,3))
plt.axis([0, 60000, 0, 10])
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1*X, "b")
plt.text(5000, 3.1, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 2.2, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
save_fig('best_fit_model_plot')
plt.show()


# In[18]:


cyprus_gdp_per_capita = gdp_per_capita.loc["Cyprus"]["GDP per capita"]
print(cyprus_gdp_per_capita)
cyprus_predicted_life_satisfaction = lin1.predict([[cyprus_gdp_per_capita]])[0][0]
cyprus_predicted_life_satisfaction


# In[19]:


sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(5,3), s=1)
X = np.linspace(0, 60000, 1000)
plt.plot(X, t0 + t1*X, "b")
plt.axis([0, 60000, 0, 10])
plt.text(5000, 7.5, r"$\theta_0 = 4.85$", fontsize=14, color="b")
plt.text(5000, 6.6, r"$\theta_1 = 4.91 \times 10^{-5}$", fontsize=14, color="b")
plt.plot([cyprus_gdp_per_capita, cyprus_gdp_per_capita], [0, cyprus_predicted_life_satisfaction], "r--")
plt.text(25000, 5.0, r"Prediction = 5.96", fontsize=14, color="b")
plt.plot(cyprus_gdp_per_capita, cyprus_predicted_life_satisfaction, "ro")
save_fig('cyprus_prediction_plot')
plt.show()


# In[20]:


sample_data[7:10]


# In[21]:


(5.1+5.7+6.5)/3
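# Added aside, not from the original notebook: the hand-computed average above is the mean life satisfaction of the three sample countries closest to Cyprus in GDP per capita; pandas can compute the same value directly.

# In[ ]:


print(sample_data[7:10]["Life satisfaction"].mean())  # should match the value above (about 5.77)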
# In[22]:


backup = oecd_bli, gdp_per_capita

def prepare_country_stats(oecd_bli, gdp_per_capita):
    oecd_bli = oecd_bli[oecd_bli["INEQUALITY"]=="TOT"]
    oecd_bli = oecd_bli.pivot(index="Country", columns="Indicator", values="Value")
    gdp_per_capita.rename(columns={"2015": "GDP per capita"}, inplace=True)
    gdp_per_capita.set_index("Country", inplace=True)
    full_country_stats = pd.merge(left=oecd_bli, right=gdp_per_capita,
                                  left_index=True, right_index=True)
    full_country_stats.sort_values(by="GDP per capita", inplace=True)
    remove_indices = [0, 1, 6, 8, 33, 34, 35]
    keep_indices = list(set(range(36)) - set(remove_indices))
    return full_country_stats[["GDP per capita", 'Life satisfaction']].iloc[keep_indices]


# In[23]:


# Code example
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model

# Load the data
oecd_bli = pd.read_csv(datapath + "oecd_bli_2015.csv", thousands=',')
gdp_per_capita = pd.read_csv(datapath + "gdp_per_capita.csv", thousands=',', delimiter='\t',
                             encoding='latin1', na_values="n/a")

# Prepare the data
country_stats = prepare_country_stats(oecd_bli, gdp_per_capita)
X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Visualize the data
country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction')
plt.show()

# Select a linear model
model = sklearn.linear_model.LinearRegression()

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = [[22587]]  # Cyprus' GDP per capita
print(model.predict(X_new))  # outputs [[ 5.96242338]]


# In[24]:


oecd_bli, gdp_per_capita = backup


# In[25]:


missing_data


# In[26]:


position_text2 = {
    "Brazil": (1000, 9.0),
    "Mexico": (11000, 9.0),
    "Chile": (25000, 9.0),
    "Czech Republic": (35000, 9.0),
    "Norway": (60000, 3),
    "Switzerland": (72000, 3.0),
    "Luxembourg": (90000, 3.0),
}


# In[27]:


sample_data.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8,3))
plt.axis([0, 110000, 0, 10])

for country, pos_text in position_text2.items():
    pos_data_x, pos_data_y = missing_data.loc[country]
    plt.annotate(country, xy=(pos_data_x, pos_data_y), xytext=pos_text,
                 arrowprops=dict(facecolor='black', width=0.5, shrink=0.1, headwidth=5))
    plt.plot(pos_data_x, pos_data_y, "rs")

X = np.linspace(0, 110000, 1000)
plt.plot(X, t0 + t1*X, "b:")

lin_reg_full = linear_model.LinearRegression()
Xfull = np.c_[full_country_stats["GDP per capita"]]
yfull = np.c_[full_country_stats["Life satisfaction"]]
lin_reg_full.fit(Xfull, yfull)

t0full, t1full = lin_reg_full.intercept_[0], lin_reg_full.coef_[0][0]
X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "k")

save_fig('representative_training_data_scatterplot')
plt.show()


# In[28]:


full_country_stats.plot(kind='scatter', x="GDP per capita", y='Life satisfaction', figsize=(8,3))
plt.axis([0, 110000, 0, 10])

from sklearn import preprocessing
from sklearn import pipeline

poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)
scaler = preprocessing.StandardScaler()
lin_reg2 = linear_model.LinearRegression()

pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler), ('lin', lin_reg2)])
pipeline_reg.fit(Xfull, yfull)
curve = pipeline_reg.predict(X[:, np.newaxis])
plt.plot(X, curve)
save_fig('overfitting_model_plot')
plt.show()


# In[29]:


full_country_stats.loc[[c for c in full_country_stats.index if "W" in c.upper()]]["Life satisfaction"]


# In[30]:


gdp_per_capita.loc[[c for c in gdp_per_capita.index if "W" in c.upper()]].head()


# In[31]:


plt.figure(figsize=(8,3))

plt.xlabel("GDP per capita")
plt.ylabel('Life satisfaction')

plt.plot(list(sample_data["GDP per capita"]), list(sample_data["Life satisfaction"]), "bo")
plt.plot(list(missing_data["GDP per capita"]), list(missing_data["Life satisfaction"]), "rs")

X = np.linspace(0, 110000, 1000)
plt.plot(X, t0full + t1full * X, "r--", label="Linear model on all data")
plt.plot(X, t0 + t1*X, "b:", label="Linear model on partial data")

ridge = linear_model.Ridge(alpha=10**9.5)
Xsample = np.c_[sample_data["GDP per capita"]]
ysample = np.c_[sample_data["Life satisfaction"]]
ridge.fit(Xsample, ysample)
t0ridge, t1ridge = ridge.intercept_[0], ridge.coef_[0][0]
plt.plot(X, t0ridge + t1ridge * X, "b", label="Regularized linear model on partial data")

plt.legend(loc="lower right")
plt.axis([0, 110000, 0, 10])
save_fig('ridge_model_plot')
plt.show()
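# Added illustration, not from the original notebook: the Ridge hyperparameter alpha controls how strongly the slope is pulled toward zero, which is why the regularized line above is flatter than the unregularized one.

# In[ ]:


for alpha in (1, 10**5, 10**9.5):
    ridge_tmp = linear_model.Ridge(alpha=alpha)
    ridge_tmp.fit(Xsample, ysample)
    print("alpha = %g  ->  slope = %.3e" % (alpha, ridge_tmp.coef_[0][0]))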
# In[32]:


backup = oecd_bli, gdp_per_capita

def prepare_country_stats(oecd_bli, gdp_per_capita):
    return sample_data


# In[33]:


# Replace this linear model:
import sklearn.linear_model
model = sklearn.linear_model.LinearRegression()


# In[34]:


# with this k-neighbors regression model:
import sklearn.neighbors
model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=3)


# In[35]:


X = np.c_[country_stats["GDP per capita"]]
y = np.c_[country_stats["Life satisfaction"]]

# Train the model
model.fit(X, y)

# Make a prediction for Cyprus
X_new = np.array([[22587.0]])  # Cyprus' GDP per capita
print(model.predict(X_new))  # outputs [[ 5.76666667]]
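# Added follow-up, not from the original notebook: the prediction of a k-nearest-neighbors regressor depends on k, the number of closest countries whose life satisfaction is averaged; with k=3 it reproduces the hand-computed average from earlier.

# In[ ]:


import sklearn.neighbors

for k in (1, 3, 5):
    knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=k)
    knn.fit(X, y)
    print("k = %d  ->  prediction for Cyprus:" % k, knn.predict(X_new))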