A Detailed Derivation of Matrix Derivatives in PyTorch

@TOC

This post uses least squares (simple linear regression) to walk through exactly how PyTorch carries out one step of gradient descent.

The matrix differentiation of least squares was already derived in an earlier post: link

I. Mathematical Form

The earlier post showed where the matrix derivative comes from, but how PyTorch actually computes it was still unclear, so here I work it out by hand and verify it against PyTorch. Plain SGD is used, and to keep the arithmetic simple, batch_size = 3 and the model is univariate linear regression, so $w$ is one-dimensional and there is a bias term $b$.

The model in matrix/vector form is

$$\hat{y} = \boldsymbol{w}^\top \boldsymbol{x} + b$$

Note that, to keep the verification below simple, univariate linear regression is used, i.e. the parameters are a scalar weight $w$ and a bias $b$:

$$\hat{y} = w x + b$$

In the original derivation $\boldsymbol{w}$ was $n$-dimensional; with a single feature it becomes a scalar, so the transpose effectively changes nothing.

Next come the matrix-form and the element-wise (non-matrix) gradients. We take batch_size = 3, i.e. one batch consists of the three samples $(x_1, y_1), (x_2, y_2), (x_3, y_3)$.

The loss function is the mean squared error:

$$L = \frac{1}{3}\sum_{i=1}^{3}(\hat{y}_i - y_i)^2 = \frac{1}{3}\sum_{i=1}^{3}(w x_i + b - y_i)^2$$

1. Matrix-form gradient

The matrix-form gradient below does not include the bias; the element-wise derivation does. Since it is the element-wise formulas that are used to verify what PyTorch does, this makes no difference.

Substituting the loss into the matrix result from the earlier post gives (with $m = 3$ samples and $X$ the $m \times 1$ input matrix):

$$\frac{\partial L}{\partial w} = \frac{2}{m} X^\top (Xw - y)$$
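As a quick illustration (my addition, not part of the original derivation), here is a minimal NumPy sketch of this matrix form. The values of X, y, and w are borrowed from the batch analysed in the verification section below; because the bias is omitted here, the number differs from the -150.28 obtained later.

```python
import numpy as np

# Sketch of dL/dw = (2/m) * X^T (X w - y) for the bias-free matrix form.
X = np.array([[4.6667], [8.2222], [2.8889]])   # shape (m, 1): three samples, one feature
y = np.array([[9.7349], [17.2363], [6.3046]])
w = np.array([[-0.3449]])

m = X.shape[0]
grad_w = (2 / m) * X.T @ (X @ w - y)
print(grad_w)   # about -159.4 here; not -150.28, because the bias term is missing
```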

2. Element-wise gradient

Substituting and differentiating term by term gives

$$\frac{\partial L}{\partial w} = \frac{2}{3}\sum_{i=1}^{3} x_i (w x_i + b - y_i)$$

That is the derivative with respect to $w$; the derivation for $b$ follows exactly the same steps, so the result is given directly:

$$\frac{\partial L}{\partial b} = \frac{2}{3}\sum_{i=1}^{3} (w x_i + b - y_i)$$
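To sanity-check these two formulas before moving on to PyTorch, here is a small finite-difference sketch (my addition); the x, y, w, b values are the same ones that appear in the verification section below.

```python
import numpy as np

# Finite-difference check of dL/dw and dL/db for L = (1/3) * sum((w*x_i + b - y_i)^2).
x = np.array([4.6667, 8.2222, 2.8889])
y = np.array([9.7349, 17.2363, 6.3046])
w, b, eps = -0.3449, 0.8645, 1e-6

def L(w, b):
    return np.mean((w * x + b - y) ** 2)

analytic_w = (2 / 3) * np.sum(x * (w * x + b - y))
analytic_b = (2 / 3) * np.sum(w * x + b - y)
numeric_w = (L(w + eps, b) - L(w - eps, b)) / (2 * eps)
numeric_b = (L(w, b + eps) - L(w, b - eps)) / (2 * eps)
print(analytic_w, numeric_w)   # both roughly -150.29
print(analytic_b, numeric_b)   # both roughly -24.08
```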

The verification below covers only univariate linear regression with a bias, and only the gradient and update of $w$ are checked.

II. PyTorch

Here batch_size = 3. For the batch examined below, the inputs happen to be

$$x = [4.6667,\; 8.2222,\; 2.8889]$$

and the targets are

$$y = [9.7349,\; 17.2363,\; 6.3046]$$

With the current parameters $w = -0.3449$, $b = 0.8645$, the model's predictions are $\hat{y} = [-0.7449,\; -1.9711,\; -0.1318]$, and PyTorch reports the loss as 173.3926.

The gradient of $w$ is -150.28.

The weight printed before and after the update is $w = -0.3449$ and $w \approx -0.1946$ respectively.

With a learning rate of 0.001, this matches the SGD update rule exactly:

$$w \leftarrow w - \eta \frac{\partial L}{\partial w} = -0.3449 - 0.001 \times (-150.28) \approx -0.1946$$
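A quick way to see these numbers without running the full training script (my own minimal sketch, not the original code): pin the parameters at the values printed above and let autograd differentiate the same MSE loss on this batch.

```python
import torch

# Reproduce the gradient above with autograd: fix w and b at the values printed
# before the update and run one forward/backward pass on the same batch.
x = torch.tensor([[4.6667], [8.2222], [2.8889]])
y = torch.tensor([[9.7349], [17.2363], [6.3046]])

w = torch.tensor([[-0.3449]], requires_grad=True)
b = torch.tensor([0.8645], requires_grad=True)

y_pred = x @ w + b                                  # what nn.Linear(1, 1) computes
loss = torch.nn.functional.mse_loss(y_pred, y)
loss.backward()

print(loss.item())     # roughly 173.39
print(w.grad.item())   # roughly -150.29
print(b.grad.item())   # bias gradient (not checked in the text)
```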

Now let's redo the calculation by hand (in NumPy) and check that it matches the output above.

import numpy as np

# The batch from the PyTorch run above
x = np.array([4.6667, 8.2222, 2.8889])
y = np.array([9.7349, 17.2363, 6.3046])
# Predictions of the model with w = -0.3449, b = 0.8645
y_ = np.array([-0.7449, -1.9711, -0.1318])

# Compute the loss: mean of the squared residuals
f = y_ - y
l = np.power(f, 2)
print(l)
l_ = np.sum(l)
print(l_/3)
'''
[109.82620804 368.92421476 41.42724496]
173.39255591999998
'''


# Compute the intermediate sums needed for the gradient
print(np.sum(x))                 # sum(x_i)
print(np.power(x, 2))
print(x*y)
print(np.sum(np.power(x, 2)))    # sum(x_i^2)
print(np.sum(x*y))               # sum(x_i * y_i)
'''
15.7778
[21.77808889 67.60457284 8.34574321]
[ 45.42985783 141.72030586 18.21335894]
97.72840494
205.36352263000003
'''


# Gradient of w: (2/3) * (w * sum(x^2) - (sum(x*y) - b * sum(x))),
# with w = -0.3449 and b = 0.8645 taken from the parameters printed before the update
gw = (2/3)*((-0.3449 * 97.7284)-(205.3635-15.7778*0.8645))
print(gw)
'''
-150.28674470666664
'''

As you can see, the hand computation agrees with what PyTorch printed.
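The same kind of check can be extended to the bias gradient and to the parameter update itself. The following sketch is my addition (the post above only verifies the gradient of w); it reuses the same batch and the printed values w = -0.3449, b = 0.8645, lr = 0.001.

```python
import numpy as np

# Extend the check: bias gradient and the SGD update  param <- param - lr * grad.
x = np.array([4.6667, 8.2222, 2.8889])
y = np.array([9.7349, 17.2363, 6.3046])
w, b, lr = -0.3449, 0.8645, 0.001

gw = (2 / 3) * np.sum(x * (w * x + b - y))   # roughly -150.29, as above
gb = (2 / 3) * np.sum(w * x + b - y)         # roughly -24.08
print(w - lr * gw)   # roughly -0.1946, the weight printed after optimizer.step()
print(b - lr * gb)   # roughly 0.8886, the expected updated bias
```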

III. Full Code

'''
torch.optim.SGD
'''

import os
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader


# 1. Create the dataset
def create_linear_data(nums_data, if_plot=False):
    """
    Create data for a linear model.
    Args:
        nums_data: how many data points are wanted
    Returns:
        datax, datay: tensors of shape (nums_data, 1)
    """
    x = torch.linspace(2, 10, nums_data)
    x = torch.unsqueeze(x, dim=1)
    k = 2
    y = k * x + torch.rand(x.size())

    if if_plot:
        plt.scatter(x.numpy(), y.numpy(), c=x.numpy())
        plt.show()
    # data = torch.cat([x, y], dim=1)
    datax = x
    datay = y
    return datax, datay


datax, datay = create_linear_data(10, if_plot=False)
length = len(datax)


# 2. Dummy Dataset
class LinearDataset(Dataset):

    def __init__(self, length, datax, datay):
        self.len = length
        self.datax = datax
        self.datay = datay

    def __getitem__(self, index):
        datax = self.datax[index]
        datay = self.datay[index]
        return {'x': datax, 'y': datay}

    def __len__(self):
        return self.len


# 3. Parameters and DataLoader
batch_size = 3
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
linear_loader = DataLoader(dataset=LinearDataset(length, datax, datay),
                           batch_size=batch_size, shuffle=True)


# 4. Model
class Model(torch.nn.Module):
    """
    Linear regression module; the input and output
    features both default to 1.
    """
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, input):
        output = self.linear(input)
        print("In Model: #############################################")
        print("input: ", input.size(), "output: ", output.size())
        return output


model = Model()
# model = torch.nn.DataParallel(model, device_ids=device_ids)
model.to(device)


# 5. Loss && optimizer
loss = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)


# 6. Train
Linear_loss = []
for i in range(10):
    # print("epoch:", i)
    for data in linear_loader:
        x, y = data['x'], data['y']
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        lloss = loss(y_pred, y)
        optimizer.zero_grad()
        lloss.backward()  # after this call, each param.grad holds dL/dw or dL/db
        for param in model.parameters():  # parameters before optimizer.step()
            print(param.item())
        optimizer.step()
        for param in model.parameters():  # parameters after optimizer.step()
            print(param.item())
        Linear_loss.append(lloss.item())
        print("Out--->input size:", x.size(), "output_size:", y_pred.size())


# 7. Plot the loss curve
loss_len = len(Linear_loss)
axis_x = range(loss_len)
plt.plot(axis_x, Linear_loss)
plt.show()
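If you also want to see the gradients that SGD is about to apply, rather than only the parameter values before and after the step, they can be read from param.grad after backward() and before step(). Below is a minimal, self-contained variant of one training step (my sketch, using nn.Linear directly instead of the Model class above, on the same batch as in section II).

```python
import torch

# One training step with the gradients printed: param.grad is populated by
# backward() and consumed by optimizer.step().
model = torch.nn.Linear(1, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = torch.nn.MSELoss()

x = torch.tensor([[4.6667], [8.2222], [2.8889]])
y = torch.tensor([[9.7349], [17.2363], [6.3046]])

y_pred = model(x)
loss = criterion(y_pred, y)
optimizer.zero_grad()
loss.backward()
for name, param in model.named_parameters():
    print(name, param.item(), param.grad.item())   # value and gradient of weight/bias
optimizer.step()
```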