```python
for i, (image, label) in enumerate(train_loader):
    pred = model(image)            # 1
    loss = criterion(pred, label)  # 2
    optimizer.zero_grad()          # 3
    loss.backward()                # 4
    optimizer.step()               # 5
```
1. Forward pass through the network.
2. Compute the loss from `pred` and `label` via the loss function.
3. Clear the gradients stored on the network parameters.
4. Backpropagate to compute the current gradients.
5. Update the network parameters using those gradients.
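Note that PyTorch accumulates gradients into each parameter's `.grad` attribute rather than overwriting them, which is why step 3 must clear them on every iteration. Gradient accumulation, shown next, exploits exactly this behavior by deferring both `zero_grad()` and `step()`.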
The gradient-accumulation version of this loop looks like the following:
```python
for i, (image, label) in enumerate(train_loader):
    # 1. forward pass
    pred = model(image)
    loss = criterion(pred, label)

    # 2.1 scale the loss by the number of accumulation steps
    loss = loss / accumulation_steps
    # 2.2 back propagation (gradients accumulate in .grad)
    loss.backward()

    # 3. update parameters of net every accumulation_steps batches
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()       # update parameters of net
        optimizer.zero_grad()  # reset gradients
```
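For intuition: scaling each micro-batch loss by `1 / accumulation_steps` makes the accumulated gradient equal to the gradient of one large batch, as long as the loss uses mean reduction. A minimal, self-contained sketch verifying this, using a hypothetical toy linear model:

```python
# Toy model and data are hypothetical; this only checks that accumulated,
# scaled gradients match the gradient of a single large batch.
import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Linear(4, 1)
criterion = nn.MSELoss()  # default reduction='mean'
data, target = torch.randn(8, 4), torch.randn(8, 1)

# Gradient of the full batch of 8 samples
model.zero_grad()
criterion(model(data), target).backward()
full_grad = model.weight.grad.clone()

# The same gradient, accumulated over 4 micro-batches of 2 samples
accumulation_steps = 4
model.zero_grad()
for x, y in zip(data.chunk(accumulation_steps), target.chunk(accumulation_steps)):
    loss = criterion(model(x), y) / accumulation_steps
    loss.backward()  # adds into .grad without overwriting

print(torch.allclose(full_grad, model.weight.grad, atol=1e-6))  # True
```

The check passes because the sum of the scaled per-micro-batch mean losses equals the mean loss over the full batch.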
```python
# One gradient-accumulation cycle (repeated over the training loop):
optimizer.zero_grad()
# For the first K-1 steps, skip gradient synchronization;
# each GPU accumulates its gradients locally.
for _ in range(K - 1):
    with model.no_sync():
        prediction = model(data)
        loss = loss_fn(prediction, label) / K
        loss.backward()  # accumulate gradients, no cross-GPU sync
# K-th step: backward outside no_sync triggers gradient synchronization
prediction = model(data)
loss = loss_fn(prediction, label) / K
loss.backward()  # gradients are all-reduced across GPUs here
optimizer.step()
```
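The snippet above assumes the model is already wrapped in DistributedDataParallel: under DDP, every `loss.backward()` normally triggers an all-reduce of gradients across processes, and `model.no_sync()` suppresses that communication so it happens only once per optimizer step. A minimal sketch of the setup this assumes (launched with `torchrun`; `MyModel` is a placeholder for your network):

```python
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")     # torchrun provides the rendezvous env vars
local_rank = int(os.environ["LOCAL_RANK"])  # set by torchrun for each process
torch.cuda.set_device(local_rank)
model = DDP(MyModel().to(local_rank), device_ids=[local_rank])  # MyModel: placeholder
```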
A more elegant way to write this:
```python
from contextlib import nullcontext
# If your Python version is below 3.7, comment out the line above and use this instead:
# from contextlib import suppress as nullcontext
from torch.nn.parallel import DistributedDataParallel as DDP

if local_rank != -1:
    model = DDP(model)

optimizer.zero_grad()
for i, (data, label) in enumerate(dataloader):
    # Use no_sync only in DDP mode, and only on steps that are not a multiple of K
    my_context = model.no_sync if local_rank != -1 and (i + 1) % K != 0 else nullcontext
    with my_context():
        prediction = model(data)
        loss = loss_fn(prediction, label) / K
        loss.backward()  # accumulate gradients without applying an update
    if (i + 1) % K == 0:
        optimizer.step()
        optimizer.zero_grad()
```
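This works because `model.no_sync` and `nullcontext` are both callables that return a context manager, so `my_context()` can be invoked uniformly; on a single GPU (`local_rank == -1`) the loop degrades gracefully to plain gradient accumulation.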