输入 \begin{align*} & x=\left( x_{1},x_{2},\ldots ,x_{j},\ldots ,x_{n}\right) ^{T} \end{align*}
输入层(层 1) \begin{align*} & a^{1}=\left( a_{1}^{1},a_{2}^{1},\ldots ,a_{j}^{1},\ldots \ldots ,a_{n}^{1}\right) ^{T}\\ & a_{j}^{1}=x_{j}\quad\left( j=1,2,\ldots ,n\right) \end{align*}
隐藏层(层 2) \begin{align*} & a^{2}=\left( a_{1}^{2},a_{2}^{2},\ldots ,a_{j}^{2},\ldots \ldots ,a_{m}^{2}\right) ^{T}\\ & a_{j}^{2}=\sigma \left( z_{j}^{2}\right) \\ & z_{j}^{2}= \sum _{k}w_{jk}^{2}\cdot a_{k}^{1}+b_{j}^{2}\quad\left( j=1,2,\ldots ,m\right) \end{align*}
输出层(层 3) \begin{align*} & a^{3}=\left( a_{1}^{3},a_{2}^{3},\ldots ,a_{j}^{3},\ldots \ldots ,a_{p}^{3}\right) ^{T}\\ & a_{j}^{3}=\sigma \left( z_{j}^{3}\right) \\ & z_{j}^{3}= \sum _{k}w_{jk}^{3}\cdot a_{k}^{2}+b_{j}^{3}\quad\left( j=1,2,\ldots ,p\right) \end{align*}
预测输出 \begin{align*} & \hat y=\left( \hat y_{1},\hat y_{2},\ldots ,\hat y_{j},\ldots ,\hat y_{p}\right) ^{T}\\ & \hat y_{j}=a_{j}^{3}\quad\left( j=1,2,\ldots ,p\right)\end{align*}
实际输出 \begin{align*} & y=\left( y_{1},y_{2},\ldots ,y_{j},\ldots ,y_{p}\right) ^{T} \quad\left( j=1,2,\ldots ,p\right) \end{align*}
单个样本x的损失\begin{align*} & C_{x}=\dfrac {1} {2}\left\| y-\widehat {y}\right\| ^{2}=\dfrac {1} {2}\sum _{j}\left( y_{j}-\widehat {y}_{j}\right) ^{2} \end{align*}
经验损失 \begin{align*} & C=\dfrac {1} {N}\sum _{x}C_{x} \end{align*}
第l层的第j个神经元上的误差 \begin{align*} & \delta _{j}^{l}\equiv \dfrac {\partial C_{x}} {\partial z_{j}^{l}} \quad\left( l=2,3\right)\end{align*}
输出层误差 \begin{align*} & \delta _{j}^{3}=\dfrac {\partial C_{x}} {\partial z_{j}^{3}} \\ & =\dfrac {\partial C_{x}} {\partial a_{j}^{3}}\cdot\dfrac {\partial a_{j}^{3}} {\partial z_{j}^{3}} \\ & =\dfrac {\partial C_{x}} {\partial a_{j}^{3}}\cdot \sigma '\left( z_{j}^{3}\right) \\& =\dfrac {\partial \left( \dfrac {1} {2}\sum _{k}\left( y_{k}-\widehat {y}_{k}\right) ^{2}\right) } {\partial a_{j}^{3}}\cdot \sigma'\left( z_{j}^{3}\right) \\& = \left(a_{j}^3-y_{j} \right) \cdot \sigma'\left( z_{j}^{3} \right)\quad\left( j=1,2,\ldots ,p\right)\quad\left( k=1,2,\ldots ,p\right)\end{align*}
隐藏层误差\begin{align*} & \delta _{j}^{2}=\dfrac {\partial C_{x}} {\partial z_{j}^{2}}\\ & =\sum _{k}\dfrac {\partial C_{x}} {\partial z_{k}^{3}}\cdot \dfrac {\partial z_{k}^{3}} {\partial z_{j}^{2}} \\ & = \sum _{k} \dfrac {\partial z_{k}^{3}} {\partial z_{j}^{2}}\cdot\delta _{k}^{3}\\ & = \sum _{k} \dfrac {\partial \left( \sum _{i}w_{ki}^{3}\cdot a_{i}^{2}+b_{k}^{3}\right)} {\partial z_{j}^{2}}\cdot\delta _{k}^{3}\\ & = \sum _{k} \dfrac {\partial \left( \sum _{i}w_{ki}^{3}\cdot \sigma \left( z_{i}^{2}\right)+b_{k}^{3}\right)} {\partial z_{j}^{2}}\cdot\delta _{k}^{3}\\ & = \sum _{k} w_{kj}^{3}\cdot \sigma '\left( z_{j}^{2}\right) \cdot\delta _{k}^{3} \\ & = \sigma '\left( z_{j}^{2}\right) \cdot\sum _{k} w_{kj}^{3} \delta _{k}^{3} \quad\left( j=1,2,\ldots ,m\right)\quad\left( k=1,2,\ldots ,p\right)\end{align*}
单个样本x的损失在隐藏层(层2)/输出层(层3)关于偏置的梯度 \begin{align*} & \dfrac {\partial C_{x}} {\partial b_{j}^{l}}=\dfrac {\partial C_{x}} {\partial z_{j}^{l}}\cdot \dfrac {\partial z_{j}^{l}} {\partial b_{j}^{l}}=\delta _{j}^{l}\cdot \dfrac {\partial \left( \sum _{k}w_{jk}^{l}a_{k}^{l-1}+b_{j}^{l}\right) } {\partial b_{j}^{l}}=\delta _{j}^{l}\quad\left( l=2,3\right)\end{align*}
单个样本x的损失在隐藏层(层2)/输出层(层3)关于权值的梯度\begin{align*} & \dfrac {\partial C_{x}} {\partial w_{jk}^{l}}=\dfrac {\partial C_{x}} {\partial z_{j}^{l}}\cdot \dfrac {\partial z_{j}^{l}} {\partial w_{jk}^{l}}=\delta _{j}^{l}\cdot \dfrac {\partial \left( \sum _{i}w_{ji}^{l}a_{i}^{l-1}+b_{j}^{l}\right) } {\partial w_{jk}^{l}}=\delta _{j}^{l}\cdot a_{k}^{l-1}\quad\left( l=2,3\right)\end{align*}
误差反向传播算法: