Q: Code demo of gradient explosion?
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed
set_seed(1)  # set random seed

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight.data)  # normal: mean=0, std=1

layer_nums = 100   # a 100-layer network
neural_nums = 256  # 256 neurons per layer
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()
inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1
output = net(inputs)
print(output)
layer:0, std:15.959932327270508 layer:1, std:256.6237487792969 layer:2, std:4107.24560546875 layer:3, std:65576.8125 layer:4, std:1045011.875 layer:5, std:17110408.0 layer:6, std:275461408.0 layer:7, std:4402537984.0 layer:8, std:71323615232.0 layer:9, std:1148104736768.0 layer:10, std:17911758454784.0 layer:11, std:283574846619648.0 layer:12, std:4480599809064960.0 layer:13, std:7.196814275405414e+16 layer:14, std:1.1507761512626258e+18 layer:15, std:1.853110740188555e+19 layer:16, std:2.9677725826641455e+20 layer:17, std:4.780376223769898e+21 layer:18, std:7.613223480799065e+22 layer:19, std:1.2092652108825478e+24 layer:20, std:1.923257075956356e+25 layer:21, std:3.134467063655912e+26 layer:22, std:5.014437766285408e+27 layer:23, std:8.066615144249704e+28 layer:24, std:1.2392661553516338e+30 layer:25, std:1.9455688099759845e+31 layer:26, std:3.0238180658999113e+32 layer:27, std:4.950357571077011e+33 layer:28, std:8.150925520353362e+34 layer:29, std:1.322983152787379e+36 layer:30, std:2.0786820453988485e+37 layer:31, std:nan output is nan in 31 layers tensor([[ inf, -2.6817e+38, inf, ..., inf, inf, inf], [ -inf, -inf, 1.4387e+38, ..., -1.3409e+38, -1.9659e+38, -inf], [-1.5873e+37, inf, -inf, ..., inf, -inf, 1.1484e+38], ..., [ 2.7754e+38, -1.6783e+38, -1.5531e+38, ..., inf, -9.9440e+37, -2.5132e+38], [-7.7184e+37, -inf, inf, ..., -2.6505e+38, inf, inf], [ inf, inf, -inf, ..., -inf, inf, 1.7432e+38]], grad_fn=<MmBackward>)
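The explosion rate follows from the variance of a matrix-vector product: for y = Wx with independent zero-mean entries, Var(y_j) = n · Var(w) · Var(x), so with n = 256 and unit-variance weights the std grows by about sqrt(256) = 16 per layer, matching the printout (std ≈ 16 at layer 0, then roughly ×16 per layer, hitting nan at layer 31). A minimal standalone sketch of this one-layer scaling (the names n, x, w, y are illustrative, not part of the original script):
import torch
n = 256
x = torch.randn(16, n)  # std ~ 1, like the network input
w = torch.randn(n, n)   # std ~ 1, like nn.init.normal_ above
y = x @ w.t()           # one bias-free linear layer
print(y.std())          # ~ sqrt(n) = 16, matching "layer:0, std:15.96" above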
Q: Fix the gradient explosion of the fully connected network (no activation function) by changing the initial weights?
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed
set_seed(1)  # set random seed

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # note the std: sqrt(1/n) keeps each layer's output std near 1
                nn.init.normal_(m.weight.data, std=np.sqrt(1/self.neural_num))  # normal: mean=0, std=sqrt(1/n)

layer_nums = 100   # a 100-layer network
neural_nums = 256  # 256 neurons per layer
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()
inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1
output = net(inputs)
print(output)
layer:0, std:0.9974957704544067 layer:1, std:1.0024365186691284 layer:2, std:1.002745509147644 layer:3, std:1.0006227493286133 layer:4, std:0.9966009855270386 layer:5, std:1.019859790802002 layer:6, std:1.026173710823059 layer:7, std:1.0250457525253296 layer:8, std:1.0378952026367188 layer:9, std:1.0441951751708984 layer:10, std:1.0181655883789062 layer:11, std:1.0074602365493774 layer:12, std:0.9948930144309998 layer:13, std:0.9987586140632629 layer:14, std:0.9981392025947571 layer:15, std:1.0045733451843262 layer:16, std:1.0055204629898071 layer:17, std:1.0122840404510498 layer:18, std:1.0076017379760742 layer:19, std:1.000280737876892 layer:20, std:0.9943006038665771 layer:21, std:1.012800931930542 layer:22, std:1.012657642364502 layer:23, std:1.018149971961975 layer:24, std:0.9776086211204529 layer:25, std:0.9592394828796387 layer:26, std:0.9317858815193176 layer:27, std:0.9534041881561279 layer:28, std:0.9811319708824158 layer:29, std:0.9953019022941589 layer:30, std:0.9773916006088257 layer:31, std:0.9655940532684326 layer:32, std:0.9270440936088562 layer:33, std:0.9329946637153625 layer:34, std:0.9311841726303101 layer:35, std:0.9354336261749268 layer:36, std:0.9492132067680359 layer:37, std:0.9679954648017883 layer:38, std:0.9849981665611267 layer:39, std:0.9982335567474365 layer:40, std:0.9616852402687073 layer:41, std:0.9439758658409119 layer:42, std:0.9631161093711853 layer:43, std:0.958673894405365 layer:44, std:0.9675614237785339 layer:45, std:0.9837557077407837 layer:46, std:0.9867278337478638 layer:47, std:0.9920817017555237 layer:48, std:0.9650403261184692 layer:49, std:0.9991624355316162 layer:50, std:0.9946174025535583 layer:51, std:0.9662044048309326 layer:52, std:0.9827387928962708 layer:53, std:0.9887880086898804 layer:54, std:0.9932605624198914 layer:55, std:1.0237400531768799 layer:56, std:0.9702046513557434 layer:57, std:1.0045380592346191 layer:58, std:0.9943899512290955 layer:59, std:0.9900636076927185 layer:60, std:0.99446702003479 layer:61, std:0.9768352508544922 layer:62, std:0.9797843098640442 layer:63, std:0.9951220750808716 layer:64, std:0.9980446696281433 layer:65, std:1.0086933374404907 layer:66, std:1.0276142358779907 layer:67, std:1.0429234504699707 layer:68, std:1.0197855234146118 layer:69, std:1.0319130420684814 layer:70, std:1.0540012121200562 layer:71, std:1.026781439781189 layer:72, std:1.0331352949142456 layer:73, std:1.0666675567626953 layer:74, std:1.0413838624954224 layer:75, std:1.0733673572540283 layer:76, std:1.0404183864593506 layer:77, std:1.0344083309173584 layer:78, std:1.0022705793380737 layer:79, std:0.99835205078125 layer:80, std:0.9732587337493896 layer:81, std:0.9777462482452393 layer:82, std:0.9753198623657227 layer:83, std:0.9938382506370544 layer:84, std:0.9472599029541016 layer:85, std:0.9511011242866516 layer:86, std:0.9737769961357117 layer:87, std:1.005651831626892 layer:88, std:1.0043526887893677 layer:89, std:0.9889539480209351 layer:90, std:1.0130352973937988 layer:91, std:1.0030947923660278 layer:92, std:0.9993206262588501 layer:93, std:1.0342745780944824 layer:94, std:1.031973123550415 layer:95, std:1.0413124561309814 layer:96, std:1.0817031860351562 layer:97, std:1.128799557685852 layer:98, std:1.1617802381515503 layer:99, std:1.2215303182601929 tensor([[-1.0696, -1.1373, 0.5047, ..., -0.4766, 1.5904, -0.1076], [ 0.4572, 1.6211, 1.9659, ..., -0.3558, -1.1235, 0.0979], [ 0.3908, -0.9998, -0.8680, ..., -2.4161, 0.5035, 0.2814], ..., [ 0.1876, 0.7971, -0.5918, ..., 0.5395, -0.8932, 0.1211], [-0.0102, -1.5027, -2.6860, ..., 
0.6954, -0.1858, -0.8027], [-0.5871, -1.3739, -2.9027, ..., 1.6734, 0.5094, -0.9986]], grad_fn=<MmBackward>)
Q: Code example of gradient vanishing after adding tanh activations to the fully connected network
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed
set_seed(1)  # set random seed

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # tanh activation added here
            x = torch.tanh(x)
            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # note the std here
                nn.init.normal_(m.weight.data, std=np.sqrt(1/self.neural_num))  # normal: mean=0, std=sqrt(1/n)
                # alternatives used in later examples (Xavier by hand / built-in, Kaiming):
                # a = np.sqrt(6 / (self.neural_num + self.neural_num))
                # tanh_gain = nn.init.calculate_gain('tanh')
                # a *= tanh_gain
                # nn.init.uniform_(m.weight.data, -a, a)
                # nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)
                # nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
                # nn.init.kaiming_normal_(m.weight.data)

layer_nums = 100   # a 100-layer network
neural_nums = 256  # 256 neurons per layer
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()
inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1
output = net(inputs)
print(output)
layer:0, std:0.6273701786994934 layer:1, std:0.48910173773765564 layer:2, std:0.4099564850330353 layer:3, std:0.35637012124061584 layer:4, std:0.32117360830307007 layer:5, std:0.2981105148792267 layer:6, std:0.27730831503868103 layer:7, std:0.2589356303215027 layer:8, std:0.2468511462211609 layer:9, std:0.23721906542778015 layer:10, std:0.22171513736248016 layer:11, std:0.21079954504966736 layer:12, std:0.19820132851600647 layer:13, std:0.19069305062294006 layer:14, std:0.18555502593517303 layer:15, std:0.17953835427761078 layer:16, std:0.17485806345939636 layer:17, std:0.1702701896429062 layer:18, std:0.16508983075618744 layer:19, std:0.1591130942106247 layer:20, std:0.15480300784111023 layer:21, std:0.15263864398002625 layer:22, std:0.148549422621727 layer:23, std:0.14617665112018585 layer:24, std:0.13876432180404663 layer:25, std:0.13316625356674194 layer:26, std:0.12660598754882812 layer:27, std:0.12537942826747894 layer:28, std:0.12535445392131805 layer:29, std:0.12589804828166962 layer:30, std:0.11994210630655289 layer:31, std:0.11700887233018875 layer:32, std:0.11137297749519348 layer:33, std:0.11154612898826599 layer:34, std:0.10991233587265015 layer:35, std:0.10996390879154205 layer:36, std:0.10969001054763794 layer:37, std:0.10975216329097748 layer:38, std:0.11063200235366821 layer:39, std:0.11021336913108826 layer:40, std:0.10465587675571442 layer:41, std:0.10141163319349289 layer:42, std:0.1026025265455246 layer:43, std:0.10079070925712585 layer:44, std:0.10096712410449982 layer:45, std:0.10117629915475845 layer:46, std:0.10145658999681473 layer:47, std:0.09987485408782959 layer:48, std:0.09677786380052567 layer:49, std:0.099615179002285 layer:50, std:0.09867013245820999 layer:51, std:0.09398546814918518 layer:52, std:0.09388342499732971 layer:53, std:0.09352942556142807 layer:54, std:0.09336657077074051 layer:55, std:0.0948176234960556 layer:56, std:0.08856320381164551 layer:57, std:0.09024856984615326 layer:58, std:0.088644839823246 layer:59, std:0.08766943216323853 layer:60, std:0.08726289123296738 layer:61, std:0.08623495697975159 layer:62, std:0.08549778908491135 layer:63, std:0.0855521708726883 layer:64, std:0.0853666365146637 layer:65, std:0.08462794870138168 layer:66, std:0.0852193832397461 layer:67, std:0.08562126755714417 layer:68, std:0.08368431031703949 layer:69, std:0.08476374298334122 layer:70, std:0.0853630006313324 layer:71, std:0.08237560093402863 layer:72, std:0.08133518695831299 layer:73, std:0.08416958898305893 layer:74, std:0.08226992189884186 layer:75, std:0.08379074186086655 layer:76, std:0.08003697544336319 layer:77, std:0.07888862490653992 layer:78, std:0.07618380337953568 layer:79, std:0.07458437979221344 layer:80, std:0.07207276672124863 layer:81, std:0.07079190015792847 layer:82, std:0.0712786465883255 layer:83, std:0.07165777683258057 layer:84, std:0.06893909722566605 layer:85, std:0.0690247192978859 layer:86, std:0.07030878216028214 layer:87, std:0.07283661514520645 layer:88, std:0.07280214875936508 layer:89, std:0.07130246609449387 layer:90, std:0.07225215435028076 layer:91, std:0.0712454542517662 layer:92, std:0.07088854163885117 layer:93, std:0.0730612576007843 layer:94, std:0.07276967912912369 layer:95, std:0.07259567081928253 layer:96, std:0.07586522400379181 layer:97, std:0.07769150286912918 layer:98, std:0.07842090725898743 layer:99, std:0.08206238597631454 tensor([[-0.1103, -0.0739, 0.1278, ..., -0.0508, 0.1544, -0.0107], [ 0.0807, 0.1208, 0.0030, ..., -0.0385, -0.1887, -0.0294], [ 0.0321, -0.0833, -0.1482, ..., -0.1133, 0.0206, 0.0155], 
..., [ 0.0108, 0.0560, -0.1099, ..., 0.0459, -0.0961, -0.0124], [ 0.0398, -0.0874, -0.2312, ..., 0.0294, -0.0562, -0.0556], [-0.0234, -0.0297, -0.1155, ..., 0.1143, 0.0083, -0.0675]], grad_fn=<TanhBackward>)
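The starting point of the decay can be checked by hand: with std = sqrt(1/n) weights each pre-activation still has std ≈ 1, and tanh of a unit normal has std ≈ 0.63, exactly the layer-0 value above; every subsequent layer then shrinks the signal a bit more. A quick sanity check (standalone sketch):
import torch
x = torch.randn(100000)
print(torch.tanh(x).std())  # ~ 0.63, matching "layer:0, std:0.6274" above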
Q: Code example of using Xavier initialization to fix gradient vanishing in the tanh fully connected network?
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed
set_seed(1)  # set random seed

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # tanh activation added here
            x = torch.tanh(x)
            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Xavier computed by hand
                a = np.sqrt(6 / (self.neural_num + self.neural_num))
                tanh_gain = nn.init.calculate_gain('tanh')
                a *= tanh_gain
                nn.init.uniform_(m.weight.data, -a, a)
                # PyTorch's built-in Xavier initialization, equivalent to the above:
                # tanh_gain = nn.init.calculate_gain('tanh')
                # nn.init.xavier_uniform_(m.weight.data, gain=tanh_gain)
                # nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
                # nn.init.kaiming_normal_(m.weight.data)

layer_nums = 100   # a 100-layer network
neural_nums = 256  # 256 neurons per layer
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()
inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1
output = net(inputs)
print(output)
layer:0, std:0.7571136355400085 layer:1, std:0.6924336552619934 layer:2, std:0.6677976846694946 layer:3, std:0.6551960110664368 layer:4, std:0.655646800994873 layer:5, std:0.6536089777946472 layer:6, std:0.6500504612922668 layer:7, std:0.6465446949005127 layer:8, std:0.6456685662269592 layer:9, std:0.6414617896080017 layer:10, std:0.6423627734184265 layer:11, std:0.6509683728218079 layer:12, std:0.6584846377372742 layer:13, std:0.6530249118804932 layer:14, std:0.6528729796409607 layer:15, std:0.6523412466049194 layer:16, std:0.6534921526908875 layer:17, std:0.6540238261222839 layer:18, std:0.6477403044700623 layer:19, std:0.6469652652740479 layer:20, std:0.6441705822944641 layer:21, std:0.6484488248825073 layer:22, std:0.6512865424156189 layer:23, std:0.6525684595108032 layer:24, std:0.6531476378440857 layer:25, std:0.6488809585571289 layer:26, std:0.6533839702606201 layer:27, std:0.6482065320014954 layer:28, std:0.6471589803695679 layer:29, std:0.6553042531013489 layer:30, std:0.6560811400413513 layer:31, std:0.6522760987281799 layer:32, std:0.6499098539352417 layer:33, std:0.6568747758865356 layer:34, std:0.6544532179832458 layer:35, std:0.6535674929618835 layer:36, std:0.6508696675300598 layer:37, std:0.6428772807121277 layer:38, std:0.6495102643966675 layer:39, std:0.6479291319847107 layer:40, std:0.6470604538917542 layer:41, std:0.6513484716415405 layer:42, std:0.6503545045852661 layer:43, std:0.6458993554115295 layer:44, std:0.6517387628555298 layer:45, std:0.6520006060600281 layer:46, std:0.6539937257766724 layer:47, std:0.6537032723426819 layer:48, std:0.6516646146774292 layer:49, std:0.6535552740097046 layer:50, std:0.6464877724647522 layer:51, std:0.6491119265556335 layer:52, std:0.6455202102661133 layer:53, std:0.6520237326622009 layer:54, std:0.6531855463981628 layer:55, std:0.6627183556556702 layer:56, std:0.6544181108474731 layer:57, std:0.6501768827438354 layer:58, std:0.6510448455810547 layer:59, std:0.6549468040466309 layer:60, std:0.6529951691627502 layer:61, std:0.6515748500823975 layer:62, std:0.6453633904457092 layer:63, std:0.644793689250946 layer:64, std:0.6489539742469788 layer:65, std:0.6553947925567627 layer:66, std:0.6535270810127258 layer:67, std:0.6528791785240173 layer:68, std:0.6492816209793091 layer:69, std:0.6596571207046509 layer:70, std:0.6536712646484375 layer:71, std:0.6498764157295227 layer:72, std:0.6538681387901306 layer:73, std:0.64595627784729 layer:74, std:0.6543275117874146 layer:75, std:0.6525828838348389 layer:76, std:0.6462088227272034 layer:77, std:0.6534948945045471 layer:78, std:0.6461930871009827 layer:79, std:0.6457878947257996 layer:80, std:0.6481245160102844 layer:81, std:0.6496317386627197 layer:82, std:0.6516988277435303 layer:83, std:0.6485154032707214 layer:84, std:0.6395408511161804 layer:85, std:0.6498249173164368 layer:86, std:0.6510564088821411 layer:87, std:0.6505221724510193 layer:88, std:0.6573457717895508 layer:89, std:0.6529723405838013 layer:90, std:0.6536353230476379 layer:91, std:0.6497699022293091 layer:92, std:0.6459059715270996 layer:93, std:0.6459072232246399 layer:94, std:0.6530925631523132 layer:95, std:0.6515892148017883 layer:96, std:0.6434286832809448 layer:97, std:0.6425578594207764 layer:98, std:0.6407340168952942 layer:99, std:0.6442393660545349 tensor([[ 0.1133, 0.1239, 0.8211, ..., 0.9411, -0.6334, 0.5155], [-0.9585, -0.2371, 0.8548, ..., -0.2339, 0.9326, 0.0114], [ 0.9487, -0.2279, 0.8735, ..., -0.9593, 0.7922, 0.6263], ..., [ 0.7257, 0.0800, -0.4440, ..., -0.9589, 0.2604, 0.5402], [-0.9572, 0.5179, 
-0.8041, ..., -0.4298, -0.6087, 0.9679], [ 0.6105, 0.3994, 0.1072, ..., 0.3904, -0.5274, 0.0776]], grad_fn=<TanhBackward>)
Q: Code example of using He (Kaiming) initialization with a ReLU fully connected network to fix gradient vanishing?
import torch
import random
import numpy as np
import torch.nn as nn
from utils.common_tools import set_seed
set_seed(1)  # set random seed

class MLP(nn.Module):
    def __init__(self, neural_num, layers):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        self.neural_num = neural_num

    def forward(self, x):
        for (i, linear) in enumerate(self.linears):
            x = linear(x)
            # relu activation added here
            x = torch.relu(x)
            print("layer:{}, std:{}".format(i, x.std()))
            if torch.isnan(x.std()):
                print("output is nan in {} layers".format(i))
                break
        return x

    def initialize(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # Kaiming initialization computed by hand; ReLU zeroes half the activations, hence the factor 2
                nn.init.normal_(m.weight.data, std=np.sqrt(2 / self.neural_num))
                # PyTorch's built-in Kaiming initialization:
                # nn.init.kaiming_normal_(m.weight.data)

layer_nums = 100   # a 100-layer network
neural_nums = 256  # 256 neurons per layer
batch_size = 16

net = MLP(neural_nums, layer_nums)
net.initialize()
inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1
output = net(inputs)
print(output)
layer:0, std:0.826629638671875 layer:1, std:0.8786815404891968 layer:2, std:0.9134422540664673 layer:3, std:0.8892471194267273 layer:4, std:0.834428071975708 layer:5, std:0.874537467956543 layer:6, std:0.7926971316337585 layer:7, std:0.7806458473205566 layer:8, std:0.8684563636779785 layer:9, std:0.9434137344360352 layer:10, std:0.964215874671936 layer:11, std:0.8896796107292175 layer:12, std:0.8287257552146912 layer:13, std:0.8519769906997681 layer:14, std:0.8354345560073853 layer:15, std:0.802306056022644 layer:16, std:0.8613607287406921 layer:17, std:0.7583686709403992 layer:18, std:0.8120225071907043 layer:19, std:0.791111171245575 layer:20, std:0.7164372801780701 layer:21, std:0.778393030166626 layer:22, std:0.8672043085098267 layer:23, std:0.874812662601471 layer:24, std:0.9020991325378418 layer:25, std:0.8585715889930725 layer:26, std:0.7824353575706482 layer:27, std:0.7968912720680237 layer:28, std:0.8984369039535522 layer:29, std:0.8704465627670288 layer:30, std:0.9860473275184631 layer:31, std:0.9080777168273926 layer:32, std:0.9140636920928955 layer:33, std:1.009956955909729 layer:34, std:0.9909380674362183 layer:35, std:1.0253208875656128 layer:36, std:0.849043607711792 layer:37, std:0.703953742980957 layer:38, std:0.7186155319213867 layer:39, std:0.7250635027885437 layer:40, std:0.7030817270278931 layer:41, std:0.6325559020042419 layer:42, std:0.6623690724372864 layer:43, std:0.6960875988006592 layer:44, std:0.7140733003616333 layer:45, std:0.632905125617981 layer:46, std:0.6458898186683655 layer:47, std:0.7354375720024109 layer:48, std:0.6710687279701233 layer:49, std:0.6939153671264648 layer:50, std:0.6889258027076721 layer:51, std:0.6331773996353149 layer:52, std:0.6029313206672668 layer:53, std:0.6145528554916382 layer:54, std:0.6636686325073242 layer:55, std:0.7440094947814941 layer:56, std:0.7972175478935242 layer:57, std:0.7606149911880493 layer:58, std:0.696868360042572 layer:59, std:0.7306802272796631 layer:60, std:0.6875627636909485 layer:61, std:0.7171440720558167 layer:62, std:0.7646605372428894 layer:63, std:0.7965086698532104 layer:64, std:0.8833740949630737 layer:65, std:0.8592952489852905 layer:66, std:0.8092936873435974 layer:67, std:0.806481122970581 layer:68, std:0.6792410612106323 layer:69, std:0.6583346128463745 layer:70, std:0.5702278017997742 layer:71, std:0.5084435939788818 layer:72, std:0.4869326055049896 layer:73, std:0.46350404620170593 layer:74, std:0.4796811640262604 layer:75, std:0.47372108697891235 layer:76, std:0.45414549112319946 layer:77, std:0.4971912205219269 layer:78, std:0.492794930934906 layer:79, std:0.4422350823879242 layer:80, std:0.4802998900413513 layer:81, std:0.5579248666763306 layer:82, std:0.5283755660057068 layer:83, std:0.5451980829238892 layer:84, std:0.6203726530075073 layer:85, std:0.6571893095970154 layer:86, std:0.703682005405426 layer:87, std:0.7321067452430725 layer:88, std:0.6924356818199158 layer:89, std:0.6652532815933228 layer:90, std:0.6728308796882629 layer:91, std:0.6606621742248535 layer:92, std:0.6094604730606079 layer:93, std:0.6019102334976196 layer:94, std:0.595421552658081 layer:95, std:0.6624555587768555 layer:96, std:0.6377885341644287 layer:97, std:0.6079285740852356 layer:98, std:0.6579315066337585 layer:99, std:0.6668476462364197 tensor([[0.0000, 1.3437, 0.0000, ..., 0.0000, 0.6444, 1.1867], [0.0000, 0.9757, 0.0000, ..., 0.0000, 0.4645, 0.8594], [0.0000, 1.0023, 0.0000, ..., 0.0000, 0.5148, 0.9196], ..., [0.0000, 1.2873, 0.0000, ..., 0.0000, 0.6454, 1.1411], [0.0000, 1.3589, 0.0000, ..., 0.0000, 0.6749, 
1.2438], [0.0000, 1.1807, 0.0000, ..., 0.0000, 0.5668, 1.0600]], grad_fn=<ReluBackward0>)
Q: How to compute the variance scaling factor (gain) of an activation function?
torch.nn.init.calculate_gain(nonlinearity, param=None)
Q: Code example of calculate_gain?
import torch
import torch.nn as nn
# data
x = torch.randn(10000)
out = torch.tanh(x)
# scale change of the standard deviation, i.e. the gain
gain = x.std() / out.std()
print('gain:{}'.format(gain))
# PyTorch's gain for tanh (defined as 5/3)
tanh_gain = nn.init.calculate_gain('tanh')
print('tanh_gain in PyTorch:', tanh_gain)
gain:1.5827221870422363 tanh_gain in PyTorch: 1.6666666666666667
Q: How to compute cross-entropy loss?
torch.nn.CrossEntropyLoss(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean')
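nn.CrossEntropyLoss combines nn.LogSoftmax and nn.NLLLoss in one class, so it takes raw, un-normalized logits. A minimal sketch verifying the equivalence (toy tensors made up for illustration):
import torch
import torch.nn as nn
import torch.nn.functional as F
logits = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
loss_ce = nn.CrossEntropyLoss(reduction='none')(logits, target)
loss_nll = nn.NLLLoss(reduction='none')(F.log_softmax(logits, dim=1), target)
print(torch.allclose(loss_ce, loss_nll))  # True: CrossEntropyLoss == LogSoftmax + NLLLoss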
Q: Code demo of the RMB (banknote) classification model
"""
# @file name : 1_split_dataset.py
# @author : TingsongYu https://github.com/TingsongYu
# @date : 2020-07-24 10:08:00
# @brief : split the dataset into train / valid / test sets
"""
import os
import random
import shutil

def makedir(new_dir):
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)

if __name__ == '__main__':
    dataset_dir = os.path.abspath(os.path.join("data", "RMB_data"))
    split_dir = os.path.abspath(os.path.join("data", "rmb_split"))
    train_dir = os.path.join(split_dir, "train")
    valid_dir = os.path.join(split_dir, "valid")
    test_dir = os.path.join(split_dir, "test")
    if not os.path.exists(dataset_dir):
        raise Exception("\n{} does not exist; please download 02-01-数据-RMB_data.rar into\n{} and unpack it".format(
            dataset_dir, os.path.dirname(dataset_dir)))
    train_pct = 0.8
    valid_pct = 0.1
    test_pct = 0.1
    for root, dirs, files in os.walk(dataset_dir):
        for sub_dir in dirs:
            imgs = os.listdir(os.path.join(root, sub_dir))
            imgs = list(filter(lambda x: x.endswith('.jpg'), imgs))
            random.shuffle(imgs)
            img_count = len(imgs)
            train_point = int(img_count * train_pct)
            valid_point = int(img_count * (train_pct + valid_pct))
            for i in range(img_count):
                if i < train_point:
                    out_dir = os.path.join(train_dir, sub_dir)
                elif i < valid_point:
                    out_dir = os.path.join(valid_dir, sub_dir)
                else:
                    out_dir = os.path.join(test_dir, sub_dir)
                makedir(out_dir)
                target_path = os.path.join(out_dir, imgs[i])
                src_path = os.path.join(dataset_dir, sub_dir, imgs[i])
                shutil.copy(src_path, target_path)
            print('Class:{}, train:{}, valid:{}, test:{}'.format(sub_dir, train_point, valid_point-train_point,
                                                                 img_count-valid_point))
            print("Split data created at {}\n".format(out_dir))
Class:1, train:80, valid:10, test:10
Split data created at /Volumes/code/GitHub/Learn-AI/pytorch_deepshare/data/rmb_split/test/1
Class:100, train:80, valid:10, test:10
Split data created at /Volumes/code/GitHub/Learn-AI/pytorch_deepshare/data/rmb_split/test/100
"""
# @file name : ce_loss.py
# @author : TingsongYu https://github.com/TingsongYu
# @date : 2019-10-07 10:08:00
# @brief : 人民币分类模型训练
"""
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.optim as optim
from PIL import Image
from matplotlib import pyplot as plt
from utils.lenet import LeNet
from utils.my_dataset import RMBDataset
from utils.common_tools import transform_invert, set_seed

set_seed(1)  # set random seed
rmb_label = {"1": 0, "100": 1}

# hyperparameters
MAX_EPOCH = 10
BATCH_SIZE = 16
LR = 0.01
log_interval = 10
val_interval = 1

# ============================ step 1/5 data ============================
split_dir = os.path.abspath(os.path.join("data", "rmb_split"))
if not os.path.exists(split_dir):
    raise Exception(r"Data {} does not exist; go back to lesson-06\1_split_dataset.py to generate it".format(split_dir))
train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "valid")

norm_mean = [0.485, 0.456, 0.406]
norm_std = [0.229, 0.224, 0.225]
train_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomGrayscale(p=0.8),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])
valid_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
])

# build MyDataset instances
train_data = RMBDataset(data_dir=train_dir, transform=train_transform)
valid_data = RMBDataset(data_dir=valid_dir, transform=valid_transform)

# build DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)

# ============================ step 2/5 model ============================
net = LeNet(classes=2)
net.initialize_weights()

# ============================ step 3/5 loss function ============================
loss_function = nn.CrossEntropyLoss()  # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay policy

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()
for epoch in range(MAX_EPOCH):
    loss_mean = 0.
    correct = 0.
    total = 0.
    net.train()
    for i, data in enumerate(train_loader):
        # forward
        inputs, labels = data
        outputs = net(inputs)
        # backward
        optimizer.zero_grad()
        loss = loss_function(outputs, labels)
        loss.backward()
        # update weights
        optimizer.step()
        # classification statistics
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).squeeze().sum().numpy()
        # log training info
        loss_mean += loss.item()
        train_curve.append(loss.item())
        if (i+1) % log_interval == 0:
            loss_mean = loss_mean / log_interval
            print("Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, i+1, len(train_loader), loss_mean, correct / total))
            loss_mean = 0.
    scheduler.step()  # update the learning rate
    # validate the model
    if (epoch+1) % val_interval == 0:
        correct_val = 0.
        total_val = 0.
        loss_val = 0.
        net.eval()
        with torch.no_grad():
            for j, data in enumerate(valid_loader):
                inputs, labels = data
                outputs = net(inputs)
                loss = loss_function(outputs, labels)
                _, predicted = torch.max(outputs.data, 1)
                total_val += labels.size(0)
                correct_val += (predicted == labels).squeeze().sum().numpy()
                loss_val += loss.item()
            valid_curve.append(loss_val)
            print("Valid:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, j+1, len(valid_loader), loss_val, correct_val / total_val))

train_x = range(len(train_curve))
train_y = train_curve
train_iters = len(train_loader)
valid_x = np.arange(1, len(valid_curve)+1) * train_iters * val_interval  # valid records one loss per epoch, so map epochs to iteration counts
valid_y = valid_curve
plt.plot(train_x, train_y, label='Train')
plt.plot(valid_x, valid_y, label='Valid')
plt.legend(loc='upper right')
plt.ylabel('loss value')
plt.xlabel('Iteration')
plt.show()
# ============================ inference ============================
test_dir = "test_data"
test_data = RMBDataset(data_dir=test_dir, transform=valid_transform)
valid_loader = DataLoader(dataset=test_data, batch_size=1)
for i, data in enumerate(valid_loader):
    # forward
    inputs, labels = data
    outputs = net(inputs)
    _, predicted = torch.max(outputs.data, 1)
    rmb = 1 if predicted.numpy()[0] == 0 else 100
    img_tensor = inputs[0, ...]  # C H W
    img = transform_invert(img_tensor, train_transform)
    plt.imshow(img)
    plt.title("LeNet got {} Yuan".format(rmb))
    plt.show()
    plt.pause(0.5)
    plt.close()
Training:Epoch[000/010] Iteration[010/010] Loss: 0.6654 Acc:58.75%
Valid: Epoch[000/010] Iteration[002/002] Loss: 0.8840 Acc:58.75%
Training:Epoch[001/010] Iteration[010/010] Loss: 0.4679 Acc:84.38%
Valid: Epoch[001/010] Iteration[002/002] Loss: 0.2392 Acc:84.38%
Training:Epoch[002/010] Iteration[010/010] Loss: 0.3967 Acc:80.62%
Valid: Epoch[002/010] Iteration[002/002] Loss: 0.1648 Acc:80.62%
Training:Epoch[003/010] Iteration[010/010] Loss: 0.1178 Acc:96.88%
Valid: Epoch[003/010] Iteration[002/002] Loss: 0.0284 Acc:96.88%
Training:Epoch[004/010] Iteration[010/010] Loss: 0.0138 Acc:100.00%
Valid: Epoch[004/010] Iteration[002/002] Loss: 0.1566 Acc:100.00%
Training:Epoch[005/010] Iteration[010/010] Loss: 0.0511 Acc:98.75%
Valid: Epoch[005/010] Iteration[002/002] Loss: 0.0001 Acc:98.75%
Training:Epoch[006/010] Iteration[010/010] Loss: 0.0033 Acc:100.00%
Valid: Epoch[006/010] Iteration[002/002] Loss: 0.0002 Acc:100.00%
Training:Epoch[007/010] Iteration[010/010] Loss: 0.0440 Acc:98.12%
Valid: Epoch[007/010] Iteration[002/002] Loss: 0.0002 Acc:98.12%
Training:Epoch[008/010] Iteration[010/010] Loss: 0.0173 Acc:99.38%
Valid: Epoch[008/010] Iteration[002/002] Loss: 0.0004 Acc:99.38%
Training:Epoch[009/010] Iteration[010/010] Loss: 0.0228 Acc:99.38%
Valid: Epoch[009/010] Iteration[002/002] Loss: 0.0006 Acc:99.38%
Q: Code demo of cross-entropy loss?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
# ----------------------------------- CrossEntropy loss: reduction -----------------------------------
# def loss function
loss_f_none = nn.CrossEntropyLoss(weight=None, reduction='none')
loss_f_sum = nn.CrossEntropyLoss(weight=None, reduction='sum')
loss_f_mean = nn.CrossEntropyLoss(weight=None, reduction='mean')
# forward
loss_none = loss_f_none(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)
# view
print("Cross Entropy Loss:\n ", loss_none, loss_sum, loss_mean)
Cross Entropy Loss: tensor([1.3133, 0.1269, 0.1269]) tensor(1.5671) tensor(0.5224)
Q: Verify PyTorch's cross-entropy result by hand?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
# --------------------------------- compute by hand
idx = 0
input_1 = inputs.detach().numpy()[idx]  # [1, 2]
target_1 = target.numpy()[idx]  # 0
# first term: the logit of the target class
x_class = input_1[target_1]
# second term: log of the sum of exponentials
sigma_exp_x = np.sum(list(map(np.exp, input_1)))
log_sigma_exp_x = np.log(sigma_exp_x)
# loss = -x_class + log(sum(exp(x)))
loss_1 = -x_class + log_sigma_exp_x
print("loss of the first sample: ", loss_1)
loss of the first sample: 1.3132617
Q: Code example showing the effect of cross-entropy's weight argument?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
# ----------------------------------- weight -----------------------------------
# def loss function
weights = torch.tensor([1, 2], dtype=torch.float)
# weights = torch.tensor([0.7, 0.3], dtype=torch.float)
loss_f_none_w = nn.CrossEntropyLoss(weight=weights, reduction='none')
loss_f_sum = nn.CrossEntropyLoss(weight=weights, reduction='sum')
loss_f_mean = nn.CrossEntropyLoss(weight=weights, reduction='mean')
# forward
loss_none_w = loss_f_none_w(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)
# view
print("weights: ", weights)
print(loss_none_w, loss_sum, loss_mean)
weights: tensor([1., 2.]) tensor([1.3133, 0.2539, 0.2539]) tensor(1.8210) tensor(0.3642)
Q: Code example verifying cross-entropy's weight argument by hand?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
weights = torch.tensor([1, 2], dtype=torch.float)
# per-sample loss WITHOUT class weights; the weights are applied by hand below
loss_none = nn.CrossEntropyLoss(weight=None, reduction='none')(inputs, target)
weights_all = np.sum(list(map(lambda x: weights.numpy()[x], target.numpy())))  # targets [0, 1, 1] -> weights [1, 2, 2], sum = 5
mean = 0
loss_sep = loss_none.detach().numpy()
for i in range(target.shape[0]):
    x_class = target.numpy()[i]
    tmp = loss_sep[i] * (weights.numpy()[x_class] / weights_all)
    mean += tmp
print(mean)
0.3641947731375694
Q: How to implement the negation step of the negative log-likelihood loss?
torch.nn.NLLLoss(weight: Optional[torch.Tensor] = None, size_average=None, ignore_index: int = -100, reduce=None, reduction: str = 'mean')
Q: Code example of NLLLoss?
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
# fake data
inputs = torch.tensor([[1, 2], [1, 3], [1, 3]], dtype=torch.float)
target = torch.tensor([0, 1, 1], dtype=torch.long)
# ----------------------------------- 2 NLLLoss -----------------------------------
weights = torch.tensor([1, 1], dtype=torch.float)
loss_f_none_w = nn.NLLLoss(weight=weights, reduction='none')
loss_f_sum = nn.NLLLoss(weight=weights, reduction='sum')
loss_f_mean = nn.NLLLoss(weight=weights, reduction='mean')
# forward
loss_none_w = loss_f_none_w(inputs, target)
loss_sum = loss_f_sum(inputs, target)
loss_mean = loss_f_mean(inputs, target)
# view
print("weights: ", weights)
print("NLL Loss", loss_none_w, loss_sum, loss_mean)
weights: tensor([1., 1.]) NLL Loss tensor([-1., -3., -3.]) tensor(-7.) tensor(-2.3333)
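The values -1, -3, -3 follow directly from the definition: NLLLoss applies no log or softmax, it simply returns -weight[class] · input[class] for each sample. A quick hedged check reusing inputs/target from the snippet above:
picked = inputs[torch.arange(len(target)), target]  # tensor([1., 3., 3.]) -- the target-class entries
print(-picked)                                      # tensor([-1., -3., -3.]), same as reduction='none' above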
Q: What is binary cross-entropy?
torch.nn.BCELoss(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean')
Q: Code example of BCELoss?
import torch
import torch.nn as nn
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
target_bce = target
# BCELoss expects probabilities in (0, 1), so apply sigmoid to the inputs first
inputs = torch.sigmoid(inputs)
weights = torch.tensor([1, 1], dtype=torch.float)
loss_f_none_w = nn.BCELoss(weight=weights, reduction='none')
loss_f_sum = nn.BCELoss(weight=weights, reduction='sum')
loss_f_mean = nn.BCELoss(weight=weights, reduction='mean')
# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)
# view
print("weights: ", weights)
print("BCE Loss", loss_none_w, loss_sum, loss_mean)
weights: tensor([1., 1.]) BCE Loss tensor([[0.3133, 2.1269], [0.1269, 2.1269], [3.0486, 0.0181], [4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)
Q: Code example verifying BCELoss by hand?
import torch
import torch.nn as nn
import numpy as np
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
inputs = torch.sigmoid(inputs)
idx = 0
x_i = inputs.detach().numpy()[idx, idx]
y_i = target.numpy()[idx, idx]
# loss: l_i = -[ y_i * log(x_i) + (1 - y_i) * log(1 - x_i) ]
# split into two branches so the term with coefficient 0 never evaluates np.log(0) = -inf
l_i = -y_i * np.log(x_i) if y_i else -(1-y_i) * np.log(1-x_i)
# print the loss
print("BCE inputs: ", inputs)
print("first loss: ", l_i)
BCE inputs: tensor([[0.7311, 0.8808], [0.8808, 0.8808], [0.9526, 0.9820], [0.9820, 0.9933]]) first loss: 0.31326166
Q: How to combine sigmoid with binary cross-entropy?
torch.nn.BCEWithLogitsLoss(weight: Optional[torch.Tensor] = None, size_average=None, reduce=None, reduction: str = 'mean', pos_weight: Optional[torch.Tensor] = None)
Q: Code example of BCEWithLogitsLoss?
import torch
import torch.nn as nn
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
target_bce = target
# no manual sigmoid needed; BCEWithLogitsLoss applies it internally
# inputs = torch.sigmoid(inputs)
weights = torch.tensor([1, 1], dtype=torch.float)
loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none')
loss_f_sum = nn.BCEWithLogitsLoss(weight=weights, reduction='sum')
loss_f_mean = nn.BCEWithLogitsLoss(weight=weights, reduction='mean')
# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)
# view
print("weights: ", weights)
print(loss_none_w, loss_sum, loss_mean)
weights: tensor([1., 1.]) tensor([[0.3133, 2.1269], [0.1269, 2.1269], [3.0486, 0.0181], [4.0181, 0.0067]]) tensor(11.7856) tensor(1.4732)
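As the identical numbers show, BCEWithLogitsLoss on raw logits equals BCELoss on sigmoid outputs; the fused form exists because it is numerically more stable. A one-line hedged check (reusing target from above; the name raw is illustrative):
raw = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
print(torch.allclose(nn.BCEWithLogitsLoss(reduction='none')(raw, target),
                     nn.BCELoss(reduction='none')(torch.sigmoid(raw), target)))  # True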
Q: Code example of BCEWithLogitsLoss's pos_weight?
import torch
import torch.nn as nn
inputs = torch.tensor([[1, 2], [2, 2], [3, 4], [4, 5]], dtype=torch.float)
target = torch.tensor([[1, 0], [1, 0], [0, 1], [0, 1]], dtype=torch.float)
target_bce = target
# raw logits; no manual sigmoid
# inputs = torch.sigmoid(inputs)
weights = torch.tensor([1], dtype=torch.float)
pos_w = torch.tensor([3], dtype=torch.float)  # weight applied to positive (target == 1) terms
loss_f_none_w = nn.BCEWithLogitsLoss(weight=weights, reduction='none', pos_weight=pos_w)
loss_f_sum = nn.BCEWithLogitsLoss(weight=weights, reduction='sum', pos_weight=pos_w)
loss_f_mean = nn.BCEWithLogitsLoss(weight=weights, reduction='mean', pos_weight=pos_w)
# forward
loss_none_w = loss_f_none_w(inputs, target_bce)
loss_sum = loss_f_sum(inputs, target_bce)
loss_mean = loss_f_mean(inputs, target_bce)
# view
print("pos_weights: ", pos_w)
print(loss_none_w, loss_sum, loss_mean)
pos_weights: tensor([3.]) tensor([[0.9398, 2.1269], [0.3808, 2.1269], [3.0486, 0.0544], [4.0181, 0.0201]]) tensor(12.7158) tensor(1.5895)
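pos_weight multiplies only the loss terms whose target is 1: l_i = -[pos_weight · y_i · log σ(x_i) + (1 - y_i) · log(1 - σ(x_i))]. Comparing with the unweighted output above, every label-1 entry is tripled (0.3133 → 0.9398) while label-0 entries are unchanged. A hand check of the first element (input 1, target 1):
import math
sigma = 1 / (1 + math.exp(-1.0))  # sigmoid(1) ~ 0.7311
print(-3 * math.log(sigma))       # ~ 0.9398, matching the first entry above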
Q: How to compute the element-wise absolute difference between inputs and target?
torch.nn.L1Loss(size_average=None, reduce=None, reduction: str = 'mean')
Q: How to compute the element-wise squared difference between inputs and target?
torch.nn.MSELoss(size_average=None, reduce=None, reduction: str = 'mean')
Q: Code example of L1Loss and MSELoss?
import torch
import torch.nn as nn
import numpy as np
from utils.common_tools import set_seed
set_seed(1)  # set random seed
inputs = torch.ones((2, 2))
target = torch.ones((2, 2)) * 3
loss_f = nn.L1Loss(reduction='none')
loss = loss_f(inputs, target)
print("input:{}\ntarget:{}\nL1 loss:{}".format(inputs, target, loss))
loss_f_mse = nn.MSELoss(reduction='none')
loss_mse = loss_f_mse(inputs, target)
print("MSE loss:{}".format(loss_mse))
input:tensor([[1., 1.], [1., 1.]]) target:tensor([[3., 3.], [3., 3.]]) L1 loss:tensor([[2., 2.], [2., 2.]]) MSE loss:tensor([[4., 4.], [4., 4.]])
Q: What is the smooth L1 loss?
$$\operatorname{loss}\left(x_{i}, y_{i}\right)=\left\{\begin{array}{ll} 0.5\left(x_{i}-y_{i}\right)^{2}, & \text { if }\left|x_{i}-y_{i}\right|<1 \\ \left|x_{i}-y_{i}\right|-0.5, & \text { otherwise } \end{array}\right.$$
torch.nn.SmoothL1Loss(size_average=None, reduce=None, reduction: str = 'mean')
Q: Code example of SmoothL1Loss
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from utils.common_tools import set_seed
set_seed(1)  # set random seed
inputs = torch.linspace(-3, 3, steps=500)
target = torch.zeros_like(inputs)
loss_f = nn.SmoothL1Loss(reduction='none')
loss_smooth = loss_f(inputs, target)
loss_l1 = np.abs(inputs.numpy())
plt.plot(inputs.numpy(), loss_smooth.numpy(), label='Smooth L1 Loss')
plt.plot(inputs.numpy(), loss_l1, label='L1 loss')
plt.xlabel('x_i - y_i')
plt.ylabel('loss value')
plt.legend()
plt.grid()
plt.show()
Q: What is the negative log-likelihood loss for a Poisson distribution?
torch.nn.PoissonNLLLoss(log_input: bool = True, full: bool = False, size_average=None, eps: float = 1e-08, reduce=None, reduction: str = 'mean')
Q: Code example of PoissonNLLLoss with hand verification
import torch
import torch.nn as nn
inputs = torch.randn((2, 2))
target = torch.randn((2, 2))
loss_f = nn.PoissonNLLLoss(log_input=True, full=False, reduction='none')
loss = loss_f(inputs, target)
print("input:{}\ntarget:{}\nPoisson NLL loss:{}".format(inputs, target, loss))
idx = 0
# with log_input=True, loss = exp(input) - target * input
loss_1 = torch.exp(inputs[idx, idx]) - target[idx, idx]*inputs[idx, idx]
print("loss of the first element:", loss_1)
input:tensor([[-1.0276, -0.5631], [-0.8923, -0.0583]]) target:tensor([[-0.1955, -0.9656], [ 0.4224, 0.2673]]) Poisson NLL loss:tensor([[0.1570, 0.0258], [0.7866, 0.9590]]) loss of the first element: tensor(0.1570)
Q: What does the Optimizer class in PyTorch look like?
class Optimizer(object):
    def __init__(self, params, defaults):
        self.defaults = defaults
        self.state = defaultdict(dict)
        self.param_groups = []
        # the incoming params are eventually wrapped into a list of parameter groups:
        self.param_groups = [{'params': param_groups}]
- defaults: optimizer hyperparameters
- state: per-parameter cache, e.g. momentum buffers
- param_groups: the parameter groups being managed
- _step_count: number of update steps performed, used for learning-rate scheduling
Q: What methods do PyTorch optimizers provide?
- zero_grad(): clear the gradients of all managed parameters
- step(): perform a single update step
- add_param_group(): add a parameter group
- state_dict(): return a dict holding the optimizer's current state
- load_state_dict(): load an optimizer state dict
Q: Code example of the optimizer's step and zero_grad
import torch
import torch.optim as optim
from utils.common_tools import set_seed
set_seed(1)  # set random seed
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))
optimizer = optim.SGD([weight], lr=0.1)
print("weight before step:{}".format(weight.data))
optimizer.step()  # try lr=1 vs lr=0.1 and observe the result
print("weight after step:{}".format(weight.data))
print("weight in optimizer:{}\nweight in weight:{}\n".format(id(optimizer.param_groups[0]['params'][0]), id(weight)))
print("weight.grad is {}\n".format(weight.grad))
optimizer.zero_grad()
print("after optimizer.zero_grad(), weight.grad is\n{}".format(weight.grad))
weight before step:tensor([[0.6614, 0.2669], [0.0617, 0.6213]]) weight after step:tensor([[ 0.5614, 0.1669], [-0.0383, 0.5213]]) weight in optimizer:140728466157072 weight in weight:140728466157072 weight.grad is tensor([[1., 1.], [1., 1.]]) after optimizer.zero_grad(), weight.grad is tensor([[0., 0.], [0., 0.]])
Q: Code example of the optimizer's add_param_group
import torch
import torch.optim as optim
from utils.common_tools import set_seed
set_seed(1)  # set random seed
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))
optimizer = optim.SGD([weight], lr=0.1)
print("optimizer.param_groups is\n{}".format(optimizer.param_groups))
w2 = torch.randn((3, 3), requires_grad=True)
optimizer.add_param_group({"params": w2, 'lr': 0.0001})
print("\noptimizer.param_groups is\n{}".format(optimizer.param_groups))
optimizer.param_groups is [{'params': [tensor([[0.6614, 0.2669], [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}] optimizer.param_groups is [{'params': [tensor([[0.6614, 0.2669], [0.0617, 0.6213]], requires_grad=True)], 'lr': 0.1, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}, {'params': [tensor([[-0.4519, -0.1661, -1.5228], [ 0.3817, -1.0276, -0.5631], [-0.8923, -0.0583, -0.1955]], requires_grad=True)], 'lr': 0.0001, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False}]
Q: Code example of the optimizer's state_dict and load_state_dict
import torch
import torch.optim as optim
from utils.common_tools import set_seed
set_seed(1)  # set random seed
weight = torch.randn((2, 2), requires_grad=True)
weight.grad = torch.ones((2, 2))
# ----------------------------------- state_dict -----------------------------------
optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
opt_state_dict = optimizer.state_dict()
print("state_dict before step:\n", opt_state_dict)
for i in range(10):
optimizer.step()
print("state_dict after step:\n", optimizer.state_dict())
torch.save(optimizer.state_dict(), "optimizer_state_dict.pkl")
# -----------------------------------load state_dict -----------------------------------
optimizer = optim.SGD([weight], lr=0.1, momentum=0.9)
state_dict = torch.load("optimizer_state_dict.pkl")
print("\nstate_dict before load state:\n", optimizer.state_dict())
optimizer.load_state_dict(state_dict)
print("state_dict after load state:\n", optimizer.state_dict())
state_dict before step: {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]} state_dict after step: {'state': {140728466498736: {'momentum_buffer': tensor([[6.5132, 6.5132], [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]} state_dict before load state: {'state': {}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]} state_dict after load state: {'state': {140728466498736: {'momentum_buffer': tensor([[6.5132, 6.5132], [6.5132, 6.5132]])}}, 'param_groups': [{'lr': 0.1, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'params': [140728466498736]}]}
Q: What is optim.SGD?
torch.optim.SGD(params, lr=<required parameter>, momentum=0, dampening=0, weight_decay=0, nesterov=False)
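With momentum (and dampening 0), each parameter keeps a momentum_buffer updated as v ← momentum · v + grad before the weight step p ← p - lr · v. The buffer value 6.5132 in the state_dict example above can be reproduced by hand, since the gradient there was held at 1 for 10 steps. A minimal sketch of that recurrence (not the library's actual implementation):
v = 0.0
for _ in range(10):
    v = 0.9 * v + 1.0  # momentum update with a constant gradient of 1
print(v)               # 6.5132..., matching momentum_buffer above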