权值共享
import torch
import torch.nn as nn
class model(nn.Module):
    """Two-input network whose inputs are encoded by one shared LSTM.

    The same ``self.lstm`` module is called on both inputs, so both
    branches use (and update) a single set of LSTM weights.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=5)
        # nn.Linear takes `in_features`, not `input_features`.
        self.linear = nn.Linear(in_features=5, out_features=2)

    def forward(self, inputdata1, inputdata2):
        """Encode both inputs with the shared LSTM and classify their sum.

        Args:
            inputdata1: First input tensor; its last dimension must be 10.
            inputdata2: Second input tensor with the same shape as the first.

        Returns:
            Tensor whose last dimension is 2 (output of the linear layer).
        """
        # nn.LSTM returns (output, (h_n, c_n)); only the per-step output
        # tensor is needed, so the final states are discarded.
        lstm_result1, _ = self.lstm(inputdata1)
        lstm_result2, _ = self.lstm(inputdata2)
        output = self.linear(lstm_result1 + lstm_result2)
        return output
注释:在神经网络的训练过程中,经常需要让两层网络共享权值。在上述代码片段中,定义神经网络时只定义了一个lstm层和一个全连接层,在前向计算中对同一个lstm层调用两次,相当于模型中有两个lstm层,即分别处理inputdata1和inputdata2的两个lstm共享同一组权值。
参考:
某些层参数不更新
查阅此类资料时,在一篇博客中看到"为模型层设置requires_grad = False后参数仍会被训练"的问题,博主也给出了相应的解决方法,这里记录其中两个使用起来比较方便的方法。
更多内容参考:
import torch
import torch.nn as nn
class model(nn.Module):
    """Shared-LSTM network whose LSTM weights are frozen.

    The freezing loop runs right after the LSTM is created and before the
    linear layer exists, so only the LSTM parameters get
    ``requires_grad = False``; the linear layer stays trainable.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=5)
        # At this point self.parameters() yields only the LSTM parameters.
        for p in self.parameters():
            p.requires_grad = False
        # nn.Linear takes `in_features`, not `input_features`.
        self.linear = nn.Linear(in_features=5, out_features=2)

    def forward(self, inputdata1, inputdata2):
        """Encode both inputs with the shared LSTM and classify their sum.

        Args:
            inputdata1: First input tensor; its last dimension must be 10.
            inputdata2: Second input tensor with the same shape as the first.

        Returns:
            Tensor whose last dimension is 2 (output of the linear layer).
        """
        # nn.LSTM returns (output, (h_n, c_n)); keep only the output tensor.
        lstm_result1, _ = self.lstm(inputdata1)
        lstm_result2, _ = self.lstm(inputdata2)
        output = self.linear(lstm_result1 + lstm_result2)
        return output
注释:在不需要参数更新的层后边添加如下代码行:
for p in self.parameters():
p.requires_grad = False
但是上述方法只适用于模型最初几层都不需要训练、只有顶层需要训练的情况;如果需要训练的层和不需要训练的层交替出现,上述方法就无法使用。博主给出了适用范围更广的方法:
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
class model(nn.Module):
    """Stack of a trainable LSTM, a frozen LSTM, and a trainable linear layer.

    ``nn.LSTM`` and ``nn.Linear`` do not accept a ``requires_grad``
    constructor argument, so the flag is set on the parameters of the
    layer that must stay fixed (``lstm2``). Parameters of the other layers
    keep the default ``requires_grad=True``.
    """

    def __init__(self):
        super().__init__()
        self.lstm1 = nn.LSTM(input_size=10, hidden_size=10)
        self.lstm2 = nn.LSTM(input_size=10, hidden_size=5)
        # Freeze only the middle layer.
        for p in self.lstm2.parameters():
            p.requires_grad = False
        # nn.Linear takes `in_features`, not `input_features`.
        self.linear = nn.Linear(in_features=5, out_features=2)

    def forward(self, inputdata):
        """Run the input through lstm1 -> lstm2 -> linear.

        Args:
            inputdata: Input tensor; its last dimension must be 10.

        Returns:
            Tensor whose last dimension is 2 (output of the linear layer).
        """
        # nn.LSTM returns (output, (h_n, c_n)); keep only the output tensor.
        lstm_result1, _ = self.lstm1(inputdata)
        lstm_result2, _ = self.lstm2(lstm_result1)
        # lstm1 emits 10 features and lstm2 emits 5, so the two outputs
        # cannot be summed; the linear layer consumes lstm2's output.
        output = self.linear(lstm_result2)
        return output
# Instantiate the network. NOTE: this rebinds the name `model` from the
# class to the instance, so the class cannot be instantiated again afterwards.
model = model()
# Build a synthetic input and its ground-truth label.
# [1, 10] is a single row holding a 10-dimensional feature vector; the
# input is expected to be 2-D here.
input_data = torch.randn([1, 10])
# One class index per input row: a [5, 10] input would need five labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)
# Forward pass and backpropagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss compares the model output with the labels, not the raw input.
loss = loss_fc(result, target)
loss.backward()
# Give the optimizer only the parameters that still require gradients, so
# frozen layers are never updated, then apply one update step.
optimizer = torch.optim.SGD(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=0.01,
)
optimizer.step()
注释:上述代码片段在构造优化器时对参数进行过滤,只选取requires_grad = True的参数进行优化更新。
为不同的层赋予不同的学习率
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
class model(nn.Module):
    """LSTM encoder followed by a linear classification layer.

    ``nn.LSTM`` and ``nn.Linear`` take no ``requires_grad`` argument; their
    parameters are created with ``requires_grad=True`` by default, so
    nothing extra is needed to make both layers trainable.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=10)
        # The linear layer's input size must equal the LSTM hidden size (10).
        self.linear = nn.Linear(in_features=10, out_features=2)

    def forward(self, inputdata):
        """Encode the input with the LSTM and project it to 2 outputs.

        Args:
            inputdata: Input tensor; its last dimension must be 10.

        Returns:
            Tensor whose last dimension is 2 (output of the linear layer).
        """
        # nn.LSTM returns (output, (h_n, c_n)); keep only the output tensor.
        lstm_result, _ = self.lstm(inputdata)
        output = self.linear(lstm_result)
        return output
# Instantiate the network. NOTE: this rebinds the name `model` from the
# class to the instance, so the class cannot be instantiated again afterwards.
model = model()
# Build a synthetic input and its ground-truth label.
# [1, 10] is a single row holding a 10-dimensional feature vector; the
# input is expected to be 2-D here.
input_data = torch.randn([1, 10])
# One class index per input row: a [5, 10] input would need five labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)
# Forward pass and backpropagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss compares the model output with the labels, not the raw input.
loss = loss_fc(result, target)
loss.backward()
# Give each layer its own learning rate by passing the optimizer one
# parameter group per layer.
param_lstm = list(model.lstm.parameters())
param_linear = list(model.linear.parameters())
params = [
    {'params': param_lstm, 'lr': 0.1},
    {'params': param_linear, 'lr': 0.01},
]
# The optimizers live in `torch.optim`; `torch.optimizer` does not exist.
optimizer = torch.optim.SGD(params)
optimizer.step()
将两个模型参数的平均值赋予第三个模型
import torch
import torch.nn as nn
from collections import OrderedDict
# Create the two source models.
model1 = nn.Linear(10, 10)
model2 = nn.Linear(10, 10)
# Average the two models parameter by parameter.
# `state_dict` is a method: calling it returns an OrderedDict that maps
# parameter names to tensors.
state1 = model1.state_dict()
state2 = model2.state_dict()
# Keep the key order of model1 so the result can be loaded directly.
param_dict = OrderedDict(
    (key, (state1[key] + state2[key]) / 2) for key in state1
)
# Load the averaged parameters into a third model; all three models must
# share the same architecture.
model3 = nn.Linear(10, 10)
model3.load_state_dict(param_dict)
输出模型中每个层的梯度
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
class model(nn.Module):
    """LSTM encoder followed by a linear classification layer.

    ``nn.LSTM`` and ``nn.Linear`` take no ``requires_grad`` argument; their
    parameters are created with ``requires_grad=True`` by default, so
    gradients are computed for both layers during backpropagation.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=10, hidden_size=10)
        # The linear layer's input size must equal the LSTM hidden size (10).
        self.linear = nn.Linear(in_features=10, out_features=2)

    def forward(self, inputdata):
        """Encode the input with the LSTM and project it to 2 outputs.

        Args:
            inputdata: Input tensor; its last dimension must be 10.

        Returns:
            Tensor whose last dimension is 2 (output of the linear layer).
        """
        # nn.LSTM returns (output, (h_n, c_n)); keep only the output tensor.
        lstm_result, _ = self.lstm(inputdata)
        output = self.linear(lstm_result)
        return output
# Instantiate the network. NOTE: this rebinds the name `model` from the
# class to the instance, so the class cannot be instantiated again afterwards.
model = model()
# Build a synthetic input and its ground-truth label.
# [1, 10] is a single row holding a 10-dimensional feature vector; the
# input is expected to be 2-D here.
input_data = torch.randn([1, 10])
# One class index per input row: a [5, 10] input would need five labels,
# e.g. torch.tensor([0, 1, 1, 1, 0]).
target = torch.tensor([1], dtype=torch.long)
# Forward pass and backpropagation.
result = model(input_data)
loss_fc = CrossEntropyLoss()
# The loss compares the model output with the labels, not the raw input.
loss = loss_fc(result, target)
loss.backward()
# Print the gradients layer by layer. An nn.Module has no `.grad`
# attribute; gradients are stored on each parameter tensor, so walk the
# named parameters of every layer.
for name, param in model.lstm.named_parameters():
    print(name, param.grad)
for name, param in model.linear.named_parameters():
    print(name, param.grad)
# Print weight and bias gradients individually. A single-layer nn.LSTM
# names its parameters weight_ih_l0 / weight_hh_l0 / bias_ih_l0 /
# bias_hh_l0 rather than `weight` / `bias`.
print(model.lstm.weight_ih_l0.grad)
print(model.lstm.weight_hh_l0.grad)
print(model.lstm.bias_ih_l0.grad)
print(model.lstm.bias_hh_l0.grad)
print(model.linear.weight.grad)
print(model.linear.bias.grad)
查看模型梯度参考:https://zhuanlan.zhihu.com/p/36121066