In [2]:
# %load /Users/facai/Study/book_notes/preconfig.py
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

from IPython.display import SVG

def show_image(filename, figsize=None):
    """Render an image file inline in the notebook.

    Bug fix: the original body only created a figure when ``figsize`` was
    given and never touched ``filename``, so nothing was ever displayed.

    Parameters
    ----------
    filename : str
        Path to a raster image readable by ``plt.imread`` (e.g. PNG).
    figsize : tuple of (width, height) in inches, optional
        If given, a new figure of this size is created first.
    """
    if figsize:
        plt.figure(figsize=figsize)
    # Actually load and show the image; hide the axes since this is an
    # illustration, not a data plot.
    plt.imshow(plt.imread(filename))
    plt.axis('off')

# xgboost的基本原理与实现简介¶

xgboost是个很棒的工具，其优点很多：运行速度快，支持正则，直接用损失函数指导树的生成等等，不一而足。相较spark自带的gbdt，xgboost更适合工程应用。

~/W/xgboost ❯❯❯ git log -n 1
commit 74db1e8867eb9ecfacf07311131c2724dbc3fdbd
Author: Nan Zhu <[email protected]>
Date:   Sun Aug 28 21:25:49 2016 -0400

[jvm-packages] remove APIs with DMatrix from xgboost-spark  (#1519)

* test consistency of prediction functions between DMatrix and RDD

* remove APIs with DMatrix from xgboost-spark

* fix compilation error in xgboost4j-example

* fix test cases


### 0. 大纲¶

xgboost的贡献，既有理论上的拓展，也有工程上的性能优化。本文只关心其在理论上的改进，主要是将正则（Regularized）引入损失函数，并且用损失函数直接指导树的生成。我们用传统GBDT做引，一步步走到xgboost。

### 1. GBDT框架¶

$$f(x) = h \left (x; \{b_j, R_j\}^J_1 \right ) = \displaystyle \sum_{j=1}^J b_j \mathbf{1}(x \in R_j)$$

$$\hat{y}_i = \phi(x_i) = \displaystyle \sum_{k=1}^K f_k(x_i)$$

\begin{align} f_m &= \displaystyle \operatorname{arg \, min}_f \sum_{i=1}^n L \left ( y_i, \hat{y}_i + f(x_i) \right ) \\ &= \operatorname{arg \, min} \mathcal{L}(f) \end{align}

[1]: Friedman - Greedy function approximation: A gradient boosting machine

### 2. xgboost¶

#### 2.0 正则¶

xgboost将正则项 $\Omega(f)$ 引入损失函数 $\mathcal{L}(f)$，控制了树的叶子数$\|R_j\|$和叶子值$b_j$，表示如下：

\begin{align} \mathcal{L}(f) &= \displaystyle \sum_{i=1}^{n} L(y_i, \hat{y}_i + f(x_i)) + \Omega(f) \\ \Omega(f) &= \gamma \|R_j\| + \frac{1}{2} \lambda \|b_j\|^2 \end{align}

#### 2.1 泰勒展开¶

\begin{align} \mathcal{L} &\approx \sum_{i=1}^n \left [ L(y_i, \hat{y}_i) + g_i f(x_i) + \frac{1}{2} h_i f^2(x_i) \right ] + \Omega(f) \\ g_i &= \frac{\partial \, L(y_i, \hat{y}_i)}{\partial \hat{y}_i} \\ h_i &= \frac{\partial^2 \, L(y_i, \hat{y}_i)}{\partial \hat{y}^2_i} \\ \end{align}

\begin{align} L(t) &\approx L(\hat{y}_i) + {\frac {L'(\hat{y}_i)}{1!}}(t-\hat{y}_i)+{\frac {L''(\hat{y}_i)}{2!}}(t-\hat{y}_i)^{2} \\ &= L(\hat{y}_i) + L'(\hat{y}_i) f(x_i) + \frac{1}{2}L''(\hat{y}_i) f^{2}(x_i) \\ &= L(y_i, \hat{y}_i) + L'(\hat{y}_i) f(x_i) + \frac{1}{2}L''(\hat{y}_i) f^{2}(x_i) \\ &= L(y_i, \hat{y}_i) + g_i f(x_i) + \frac{1}{2} h_i f^2(x_i) \end{align}

#### 2.2 引入树模型¶

$$\mathcal{L} \approx \sum_{i=1}^n \left [ L(y_i, \hat{y}_i) + g_i f(x_i) + \frac{1}{2} h_i f^2(x_i) \right ] + \Omega(f)$$

去掉与$f$无关的常数项$L(y_i, \hat{y}_i)$后，得：

$$\mathcal{L} = \sum_{i=1}^n \left [ g_i f(x_i) + \frac{1}{2} h_i f^2(x_i) \right ] + \Omega(f)$$

1. 将决策树的数学模型$f(x)$代入外部损失函数$\mathcal{L}$。
2. 固定$J$，得到最优的$b_j$解析式。
3. 反代$b_j$回$\mathcal{L}$，将它作为树生成的评价函数（内部损失函数）。

##### 2.2.0 代入树模型¶

\begin{align} \mathcal{L} &= \sum_{i=1}^n \left [ g_i f(x_i) + \frac{1}{2} h_i f^2(x_i) \right ] + \Omega(f) \\ &= \sum_{i=1}^n \left [ g_i \sum_{j=1}^J b_j \mathbf{1}(x_i \in R_j) + \frac{1}{2} h_i (\sum_{j=1}^J \color{red}{b_j} \mathbf{1}(x_i \in R_j))^{\color{red}{2}} \right ] + \Omega(f) \\ &= \sum_{i=1}^n \left [ g_i \sum_{j=1}^J b_j \mathbf{1}(x_i \in R_j) + \frac{1}{2} h_i \sum_{j=1}^J \color{red}{b_j^2} \mathbf{1}(x_i \in R_j) \right ] + \Omega(f) \quad \text{因为$x_i$只属于一个$R_j$} \\ &= \sum_{j=1}^J \color{blue}{\sum_{i=1}^n \mathbf{1}(x_i \in R_j)} g_i b_j + \frac{1}{2} \color{blue}{\sum_{j=1}^J \sum_{i=1}^n \mathbf{1}(x_i \in R_j)} h_i b_j^2 + \Omega(f) \quad \text{乘法交换} \\ &\text{令} I_j = \{ i \, | \, x_i \in R_j \} \quad \text{即属于$R_j$的全部下标$i$} \\ &= \sum_{j=1}^J \color{blue}{\sum_{i \in I_j}} g_i b_j + \frac{1}{2} \sum_{j=1}^J \color{blue}{\sum_{i \in I_j}} h_i b_j^2 + \Omega(f) \\ &= \sum_{j=1}^J ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} \sum_{j=1}^J (\sum_{i \in I_j} h_i) b_j^2 + \Omega(f) \quad \text{乘法分配律}\\ &= \sum_{j=1}^J ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} \sum_{j=1}^J (\sum_{i \in I_j} h_i) b_j^2 + \gamma \|R_j\| + \frac{1}{2} \lambda \|b_j\|^2 \quad \text{代入正则$\Omega(f)$}\\ &= \sum_{j=1}^J ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} \sum_{j=1}^J (\sum_{i \in I_j} h_i) b_j^2 + \gamma \|R_j\| + \frac{1}{2} \lambda \sum_{j=1}^J b_j^2 \\ &= \sum_{j=1}^J \left ( ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} (\sum_{i \in I_j} h_i) b_j^2 + \frac{1}{2} \lambda b_j^2 \right ) + \gamma \|R_j\| \\ &= \sum_{j=1}^J \left ( ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} (\sum_{i \in I_j} h_i + \lambda) b_j^2 \right ) + \gamma \|R_j\| \\ \end{align}
##### 2.2.1 $b_j$最优解析式¶
\begin{align} b_j &= \operatorname{arg \, min}_{b_j} \mathcal{L} \\ &= \operatorname{arg \, min}_{b_j} \sum_{j=1}^J \left ( ( \sum_{i \in I_j} g_i ) b_j + \frac{1}{2} (\sum_{i \in I_j} h_i + \lambda) b_j^2 \right ) + \gamma \|R_j\| \\ &\approx \sum_{j=1}^J \operatorname{arg \, min}_{b_j} \left ( ( \sum_{i \in I_j} g_i ) \color{red}{b_j} + \frac{1}{2} (\sum_{i \in I_j} h_i + \lambda) \color{red}{b_j^2} \right ) + \gamma \|R_j\| \end{align}

\begin{align} b^*_j &= - \frac{\sum_{i \in I_j} g_i}{\sum_{i \in I_j} h_i + \lambda} \\ \end{align}
##### 2.2.2 树生长的评价函数¶

\begin{align} \mathcal{L} &= - \frac{1}{2} \sum_{j=1}^J \frac{(\sum_{i \in I_j} g_i)^2}{\sum_{i \in I_j} h_i + \lambda} + \gamma \|R_j\| \\ &= - \frac{1}{2} H + \gamma T \end{align}

\begin{align} \mathcal{L}_{\text{split}} &= \mathcal{L} - \mathcal{L}_L - \mathcal{L}_R \\ &= \frac{1}{2} (H_L + H_R - H) + \gamma (T - T_L - T_R) \\ &= \frac{1}{2} (H_L + H_R - H) - \gamma \\ \end{align}

### 3 工程实现¶

#### 3.0 训练¶

1. 老模型的预测值$\hat{y}_i$；
2. 计算出一阶导数$g_i$和二阶导数$h_i$；
3. 将导数信息传入，指导决策树生长。

288   void UpdateOneIter(int iter, DMatrix* train) override {
289     CHECK(ModelInitialized())
290         << "Always call InitModel or LoadModel before update";
291     if (tparam.seed_per_iteration || rabit::IsDistributed()) {
292       common::GlobalRandom().seed(tparam.seed * kRandSeedMagic + iter);
293     }
294     this->LazyInitDMatrix(train);
295     this->PredictRaw(train, &preds_);
296     obj_->GetGradient(preds_, train->info(), iter, &gpair_);
297     gbm_->DoBoost(train, this->FindBufferOffset(train), &gpair_);
298   }


In [3]:
SVG("./res/Main.svg")

Out[3]:
In [ ]: