import d2l
from mxnet import autograd, gluon, init, np, npx
from mxnet.gluon import nn
# Switch MXNet into its NumPy-compatible mode before any `np`/`npx` arrays
# are created below.
npx.set_np()
在多个 GPU 上初始化模型参数。
# Build the ResNet-18 model from d2l. The argument 10 is presumably the number
# of output classes (the sample output further down has 10 values per row);
# TODO confirm against d2l.resnet18.
net = d2l.resnet18(10)
# Device list returned by d2l.try_all_gpus(); the parameters are placed on
# every entry.
ctx = d2l.try_all_gpus()
# Draw the initial parameters from Normal(sigma=0.01) on all devices in `ctx`.
net.initialize(init=init.Normal(sigma=0.01), ctx=ctx)
验证:将一个小批量数据划分到各个 GPU 上,并分别进行前向计算。
# A random batch of 4 samples, each of shape (1, 28, 28).
x = np.random.uniform(size=(4, 1, 28, 28))
# Split the batch into one shard per device in `ctx` and copy each shard to
# its device.
gpu_x = gluon.utils.split_and_load(x, ctx)
# Forward pass on the first two shards; indexing gpu_x[1] requires at least
# two devices. In a notebook this tuple is displayed as the cell output.
net(gpu_x[0]), net(gpu_x[1])
(array([[ 9.4988764e-07, 4.0808845e-06, -5.1063816e-06, -4.9375967e-06, 1.1718329e-06, -5.6178824e-06, -4.8232919e-06, 1.9737163e-06, -7.3709026e-07, 2.2256274e-06], [ 7.7096996e-07, 4.2829342e-06, -6.1890505e-06, -5.4664861e-06, 1.2786281e-06, -5.2085825e-06, -4.6386904e-06, 2.0427817e-06, -1.0129007e-06, 2.0370280e-06]], ctx=gpu(0)), array([[ 2.4921033e-07, 3.8222056e-06, -5.5915402e-06, -5.4971724e-06, 1.4587372e-06, -4.5317338e-06, -4.8936981e-06, 2.3227499e-06, -3.8662023e-07, 1.8324375e-06], [-8.7117598e-08, 3.6717909e-06, -5.0221552e-06, -5.0705357e-06, 2.1382066e-06, -4.9615883e-06, -4.7389462e-06, 2.3168993e-06, -4.3993109e-07, 2.1564051e-06]], ctx=gpu(1)))
在多 GPU 上计算精度。
def evaluate_accuracy_gpus(net, data_iter):
    """Return the classification accuracy of `net` over `data_iter`.

    Each batch is split across the devices that hold the network's
    parameters, a forward pass is run on every shard, and the per-shard
    results of `d2l.accuracy` are summed.

    Args:
        net: Network whose parameters are already initialized on one or
            more devices.
        data_iter: Iterable yielding `(features, labels)` batches.

    Returns:
        The accumulated `d2l.accuracy` total divided by the accumulated
        `labels.size` total.
    """
    # Look up the devices in use from the first parameter's context list.
    first_param = list(net.collect_params().values())[0]
    devices = first_param.list_ctx()
    # Slot 0: number of correctly classified examples; slot 1: total examples.
    totals = d2l.Accumulator(2)
    for features, labels in data_iter:
        X_shards, y_shards = d2l.split_batch(features, labels, devices)
        # Issue the forward pass for every shard before reading any result,
        # so the shards can be processed in parallel.
        shard_preds = [net(X_shard) for X_shard in X_shards]
        num_correct = sum(
            float(d2l.accuracy(pred, y_shard))
            for pred, y_shard in zip(shard_preds, y_shards))
        totals.add(num_correct, labels.size)
    return totals[0] / totals[1]
训练函数。
def train(num_gpus, batch_size, lr):
    """Train the module-level `net` on Fashion-MNIST using `num_gpus` devices.

    The network is re-initialized on the requested devices, trained with SGD
    for 10 epochs, and evaluated on the test set after every epoch. When
    training finishes, the last test accuracy, the average time per epoch and
    the device list are printed.

    Args:
        num_gpus: Number of devices to request via `d2l.try_gpu(i)`.
        batch_size: Batch size used for the data loaders and passed to
            `trainer.step`.
        lr: Learning rate for the SGD trainer.
    """
    train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
    devices = [d2l.try_gpu(i) for i in range(num_gpus)]
    # `net` is a module-level object that may already have been initialized,
    # so force a fresh initialization on the devices used for this run.
    net.initialize(
        init=init.Normal(sigma=0.01), ctx=devices, force_reinit=True)
    optimizer_params = {'learning_rate': lr}
    trainer = gluon.Trainer(net.collect_params(), 'sgd', optimizer_params)
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    timer = d2l.Timer()
    num_epochs = 10
    animator = d2l.Animator('epoch', 'test acc', xlim=[1, num_epochs])
    for epoch in range(num_epochs):
        timer.start()
        for features, labels in train_iter:
            X_shards, y_shards = d2l.split_batch(features, labels, devices)
            # Record the forward pass of every shard so that gradients can be
            # computed for each per-device loss.
            with autograd.record():
                shard_losses = [
                    loss(net(X_shard), y_shard)
                    for X_shard, y_shard in zip(X_shards, y_shards)]
            for shard_loss in shard_losses:
                shard_loss.backward()
            trainer.step(batch_size)
        # Wait for all outstanding asynchronous work before stopping the
        # timer, so the measured epoch time covers the actual computation.
        npx.waitall()
        timer.stop()
        test_acc = evaluate_accuracy_gpus(net, test_iter)
        animator.add(epoch + 1, (test_acc,))
    print('test acc: %.2f, %.1f sec/epoch on %s' % (
        animator.Y[0][-1], timer.avg(), devices))
使用一个 GPU。
# Baseline run on a single GPU.
train(num_gpus=1, batch_size=256, lr=0.1)
test acc: 0.93, 13.2 sec/epoch on [gpu(0)]
使用两个 GPU。
# Two-GPU run: both the batch size and the learning rate are doubled relative
# to the single-GPU call (512 vs. 256, 0.2 vs. 0.1).
train(num_gpus=2, batch_size=512, lr=0.2)
test acc: 0.92, 6.8 sec/epoch on [gpu(0), gpu(1)]