# IEBN: Instance Enhancement Batch Normalization, an Adaptive Regulator of Batch Noise

## Abstract

Batch Normalization (BN) normalizes the features of an input image using statistics computed over a batch of images, so BN injects noise into the gradient of the training loss. Prior work has shown that noise has a significant influence on the optimization and generalization of deep neural networks, but excessive noise harms performance. This article presents a new viewpoint: a self-attention mechanism can regulate the noise by enhancing instance-specific information, yielding a better regularization effect. Accordingly, an attention-based BN called Instance Enhancement Batch Normalization (IEBN) is proposed, which recalibrates each channel's information with a simple linear transformation. Even under two types of noise attack injected during training, IEBN retains its ability to regulate noise and stabilize training, improving generalization. Finally, across different network architectures and benchmark datasets, IEBN outperforms BN on image classification with only a slight increase in parameters.
## 1. IEBN

This article focuses on two kinds of noise introduced by SGD and BN:

- Estimation noise: BN uses the mean and variance of a batch of data to stand in for the statistics of the whole dataset, and SGD uses the batch loss to approximate the loss over the full dataset.
- Batch noise: in the forward pass, BN injects batch information into the data, and the gradients in the backward pass are likewise perturbed by batch information.

As earlier work has shown, a model needs a suitable amount of noise: moderate estimation noise regularizes the parameters and helps the model avoid local minima and saddle points, but excessive noise makes the loss fluctuate sharply and hampers optimization. This article therefore proposes IEBN to regulate the noise adaptively. The operation has three steps (formalized right after this list):

- Global pooling: squeeze each channel to a single scalar with global average pooling.
- Feature processing: transform that scalar with two learnable parameters followed by a Sigmoid activation.
- Instance embedding: use the result to modulate BN's scaling coefficient.
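In symbols, a minimal sketch of the per-channel IEBN transform, written to match the reproduction code in Section 2.4.1 ($\tilde{w}_c$ and $\tilde{b}_c$ correspond to `weight_readjust` and `bias_readjust`; the gate is computed from the pooled raw input):

$$
\hat{x}_c = \frac{x_c - \mu_{\mathcal{B},c}}{\sqrt{\sigma_{\mathcal{B},c}^{2} + \epsilon}},\qquad
\delta_c = \mathrm{Sigmoid}\!\left(\tilde{w}_c \cdot \mathrm{avgpool}(x_c) + \tilde{b}_c\right),\qquad
y_c = \gamma_c\,\delta_c\,\hat{x}_c + \beta_c
$$

The instance-specific gate $\delta_c \in (0, 1)$ rescales BN's affine gain $\gamma_c$ per sample and per channel, which is how IEBN modulates the batch noise.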
## 2. Code Reproduction

### 2.1 Download and Import the Required Packages
%matplotlib inline
import os

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import paddle
from paddle import nn
import paddle.nn.functional as F
import paddle.vision.transforms as transforms
from paddle.vision.datasets import Cifar10
from paddle.io import DataLoader
from paddle.nn.layer.norm import _BatchNormBase
### 2.2 Create the Dataset
train_tfm = transforms.Compose([
transforms.Resize((130, 130)),
transforms.RandomResizedCrop(128),
transforms.RandomHorizontalFlip(0.5),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])
test_tfm = transforms.Compose([
transforms.Resize((128, 128)),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])
paddle.vision.set_image_backend('cv2')
# Use the CIFAR-10 dataset
train_dataset = Cifar10(data_file='data/data152754/cifar-10-python.tar.gz', mode='train', transform=train_tfm)
val_dataset = Cifar10(data_file='data/data152754/cifar-10-python.tar.gz', mode='test', transform=test_tfm)
print("train_dataset: %d" % len(train_dataset))
print("val_dataset: %d" % len(val_dataset))
train_dataset: 50000
val_dataset: 10000
batch_size=256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False, num_workers=4)
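As a quick sanity check (a sketch, assuming the loaders above), pull one batch and inspect its shape:

```python
# One training batch: images [256, 3, 128, 128] after the transforms, labels [256]
x, y = next(iter(train_loader))
print(x.shape, y.shape)
```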
### 2.3 Label Smoothing
class LabelSmoothingCrossEntropy(nn.Layer):
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        confidence = 1. - self.smoothing
        log_probs = F.log_softmax(pred, axis=-1)
        # Negative log-likelihood of the target class for each sample
        idx = paddle.stack([paddle.arange(log_probs.shape[0]), target], axis=1)
        nll_loss = paddle.gather_nd(-log_probs, index=idx)
        # Uniform component: mean negative log-probability over all classes
        smooth_loss = paddle.mean(-log_probs, axis=-1)
        loss = confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()
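A minimal usage sketch with random logits (purely illustrative values):

```python
# With smoothing=0.1, each target distribution keeps 0.9 on the true class and
# spreads the remaining 0.1 uniformly over all classes (the true one included)
criterion = LabelSmoothingCrossEntropy(smoothing=0.1)
logits = paddle.randn([4, 10])
target = paddle.to_tensor([1, 3, 5, 7])
print(criterion(logits, target))  # a scalar loss tensor
```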
### 2.4 ResNet-IEBN

#### 2.4.1 IEBN
class IEBN(_BatchNormBase):
    def __init__(self, num_features):
        super().__init__(num_features)
        self.num_features = num_features
        self.pooling = nn.AdaptiveAvgPool2D(1)
        self.sigmoid = nn.Sigmoid()
        # BN's affine parameters, shaped [1, C, 1, 1] so they broadcast over N, H, W
        self.weight = self.create_parameter([1, num_features, 1, 1],
            default_initializer=nn.initializer.Assign(paddle.ones([1, num_features, 1, 1])))
        self.bias = self.create_parameter([1, num_features, 1, 1],
            default_initializer=nn.initializer.Assign(paddle.zeros([1, num_features, 1, 1])))
        # Gate parameters of the instance embedding, initialized to (0, -1) so
        # the gate starts at sigmoid(-1) ~ 0.269 for every channel
        self.weight_readjust = self.create_parameter([1, num_features, 1, 1],
            default_initializer=nn.initializer.Assign(paddle.zeros([1, num_features, 1, 1])))
        self.bias_readjust = self.create_parameter([1, num_features, 1, 1],
            default_initializer=nn.initializer.Assign(-1 * paddle.ones([1, num_features, 1, 1])))

    def forward(self, x):
        # Instance embedding: per-channel gate computed from the globally pooled raw input
        attn = self.pooling(x)
        attn = self.sigmoid(self.weight_readjust * attn + self.bias_readjust)
        # Normalize with an identity affine; the real scale/shift is applied below
        weight = paddle.ones([self.num_features])
        bias = paddle.zeros([self.num_features])
        out = F.batch_norm(x, self._mean, self._variance, weight, bias, self.training)
        # Rescale BN's gain by the instance-specific gate, then shift
        out = out * self.weight * attn + self.bias
        return out
model = IEBN(64)
paddle.summary(model, (1, 64, 224, 224))
-------------------------------------------------------------------------------
Layer (type) Input Shape Output Shape Param #
===============================================================================
AdaptiveAvgPool2D-1 [[1, 64, 224, 224]] [1, 64, 1, 1] 0
Sigmoid-1 [[1, 64, 1, 1]] [1, 64, 1, 1] 0
===============================================================================
Total params: 0
Trainable params: 0
Non-trainable params: 0
-------------------------------------------------------------------------------
Input size (MB): 12.25
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 12.25
-------------------------------------------------------------------------------
{'total_params': 0, 'trainable_params': 0}
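Note that the summary reports zero parameters: `paddle.summary` tallies the sub-layers it lists (the pooling and the sigmoid), not the four per-channel tensors created directly on IEBN itself, which hold 4 × num_features learnable values plus the running statistics. As a quick sanity check of the initialization (a sketch, using the fact that the gate starts at sigmoid(-1) for every channel):

```python
iebn = IEBN(8)
bn = nn.BatchNorm2D(8)
iebn.eval()
bn.eval()  # both layers now normalize with their (identical) running statistics
x = paddle.randn([2, 8, 4, 4])
gate = F.sigmoid(paddle.to_tensor(-1.0))
# At init IEBN should equal plain BN scaled by the constant gate value
print(paddle.allclose(iebn(x), bn(x) * gate, atol=1e-5))  # expected: True
```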
#### 2.4.2 ResNet-IEBN
class BasicBlock(nn.Layer):
expansion = 1
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2D
if dilation > 1:
raise NotImplementedError(
"Dilation > 1 not supported in BasicBlock"
)
self.conv1 = nn.Conv2D(
inplanes, planes, 3, padding=1, stride=stride, bias_attr=False
)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU()
self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class BottleneckBlock(nn.Layer):
expansion = 4
def __init__(
self,
inplanes,
planes,
stride=1,
downsample=None,
groups=1,
base_width=64,
dilation=1,
norm_layer=None,
):
super().__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2D
width = int(planes * (base_width / 64.0)) * groups
self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False)
self.bn1 = norm_layer(width)
self.conv2 = nn.Conv2D(
width,
width,
3,
padding=dilation,
stride=stride,
groups=groups,
dilation=dilation,
bias_attr=False,
)
self.bn2 = norm_layer(width)
self.conv3 = nn.Conv2D(
width, planes * self.expansion, 1, bias_attr=False
)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
out += identity
out = self.relu(out)
return out
class ResNet(nn.Layer):
"""ResNet model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
Block (BasicBlock|BottleneckBlock): Block module of model.
depth (int, optional): Layers of ResNet, Default: 50.
width (int, optional): Base width per convolution group for each convolution block, Default: 64.
num_classes (int, optional): Output num_features of last fc layer. If num_classes <= 0, last fc layer
will not be defined. Default: 1000.
with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.
groups (int, optional): Number of groups for each convolution block, Default: 1.
Returns:
:ref:`api_paddle_nn_Layer`. An instance of ResNet model.
Examples:
.. code-block:: python
import paddle
from paddle.vision.models import ResNet
from paddle.vision.models.resnet import BottleneckBlock, BasicBlock
# build ResNet with 18 layers
resnet18 = ResNet(BasicBlock, 18)
# build ResNet with 50 layers
resnet50 = ResNet(BottleneckBlock, 50)
# build Wide ResNet model
wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2)
# build ResNeXt model
resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32)
x = paddle.rand([1, 3, 224, 224])
out = resnet18(x)
print(out.shape)
# [1, 1000]
"""
def __init__(
self,
block,
depth=50,
width=64,
num_classes=1000,
with_pool=True,
groups=1,
):
super().__init__()
layer_cfg = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}
layers = layer_cfg[depth]
self.groups = groups
self.base_width = width
self.num_classes = num_classes
self.with_pool = with_pool
        # Every normalization layer in this network is replaced with IEBN
        self._norm_layer = IEBN
self.inplanes = 64
self.dilation = 1
self.conv1 = nn.Conv2D(
3,
self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias_attr=False,
)
self.bn1 = self._norm_layer(self.inplanes)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
if with_pool:
self.avgpool = nn.AdaptiveAvgPool2D((1, 1))
if num_classes > 0:
self.fc = nn.Linear(512 * block.expansion, num_classes)
def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
1,
stride=stride,
bias_attr=False,
),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(
block(
self.inplanes,
planes,
stride,
downsample,
self.groups,
self.base_width,
previous_dilation,
norm_layer,
)
)
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(
self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
norm_layer=norm_layer,
)
)
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
if self.with_pool:
x = self.avgpool(x)
if self.num_classes > 0:
x = paddle.flatten(x, 1)
x = self.fc(x)
return x
model = ResNet(BasicBlock, depth=18, num_classes=10)
paddle.summary(model, (1, 3, 128, 128))
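To cross-check the parameter counts quoted in Section 4, the totals can also be computed directly. A minimal sketch (`count_params` is a hypothetical helper; in Paddle, `parameters()` also yields BN's running mean/variance, which are stored as non-trainable parameters, so the totals sit slightly above the trainable count):

```python
def count_params(m):
    # Sum element counts over every parameter tensor, including the
    # non-trainable running statistics that Paddle stores as parameters
    return sum(int(np.prod(p.shape)) for p in m.parameters())

# Expected to match Section 4: 11200842 (w/ IEBN) vs. 11191242 (plain BN)
print(count_params(ResNet(BasicBlock, depth=18, num_classes=10)))
print(count_params(paddle.vision.models.resnet18(num_classes=10)))
```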
### 2.5 Training
learning_rate = 0.1
n_epochs = 100
paddle.seed(42)
np.random.seed(42)
work_path = 'work/model'
model = ResNet(BasicBlock, depth=18, num_classes=10)
criterion = LabelSmoothingCrossEntropy()
scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=learning_rate, milestones=[30, 60, 80], verbose=False)
optimizer = paddle.optimizer.Momentum(parameters=model.parameters(), learning_rate=scheduler, weight_decay=1e-5)
best_acc = 0.0
val_acc = 0.0
loss_record = {'train': {'loss': [], 'iter': []}, 'val': {'loss': [], 'iter': []}} # for recording loss
acc_record = {'train': {'acc': [], 'iter': []}, 'val': {'acc': [], 'iter': []}} # for recording accuracy
loss_iter = 0
acc_iter = 0
for epoch in range(n_epochs):
# ---------- Training ----------
model.train()
train_num = 0.0
train_loss = 0.0
val_num = 0.0
val_loss = 0.0
accuracy_manager = paddle.metric.Accuracy()
val_accuracy_manager = paddle.metric.Accuracy()
print("#===epoch: {}, lr={:.10f}===#".format(epoch, optimizer.get_lr()))
for batch_id, data in enumerate(train_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
logits = model(x_data)
loss = criterion(logits, y_data)
acc = paddle.metric.accuracy(logits, labels)
accuracy_manager.update(acc)
if batch_id % 10 == 0:
loss_record['train']['loss'].append(loss.numpy())
loss_record['train']['iter'].append(loss_iter)
loss_iter += 1
loss.backward()
optimizer.step()
optimizer.clear_grad()
train_loss += loss
train_num += len(y_data)
scheduler.step()
total_train_loss = (train_loss / train_num) * batch_size
train_acc = accuracy_manager.accumulate()
acc_record['train']['acc'].append(train_acc)
acc_record['train']['iter'].append(acc_iter)
acc_iter += 1
# Print the information.
print("#===epoch: {}, train loss is: {}, train acc is: {:2.2f}%===#".format(epoch, total_train_loss.numpy(), train_acc*100))
# ---------- Validation ----------
model.eval()
for batch_id, data in enumerate(val_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
with paddle.no_grad():
logits = model(x_data)
loss = criterion(logits, y_data)
acc = paddle.metric.accuracy(logits, labels)
val_accuracy_manager.update(acc)
val_loss += loss
val_num += len(y_data)
total_val_loss = (val_loss / val_num) * batch_size
loss_record['val']['loss'].append(total_val_loss.numpy())
loss_record['val']['iter'].append(loss_iter)
val_acc = val_accuracy_manager.accumulate()
acc_record['val']['acc'].append(val_acc)
acc_record['val']['iter'].append(acc_iter)
print("#===epoch: {}, val loss is: {}, val acc is: {:2.2f}%===#".format(epoch, total_val_loss.numpy(), val_acc*100))
# ===================save====================
if val_acc > best_acc:
best_acc = val_acc
paddle.save(model.state_dict(), os.path.join(work_path, 'best_model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(work_path, 'best_optimizer.pdopt'))
print(best_acc)
paddle.save(model.state_dict(), os.path.join(work_path, 'final_model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(work_path, 'final_optimizer.pdopt'))
### 2.6 Results
def plot_learning_curve(record, title='loss', ylabel='CE Loss'):
''' Plot learning curve of your CNN '''
maxtrain = max(map(float, record['train'][title]))
maxval = max(map(float, record['val'][title]))
ymax = max(maxtrain, maxval) * 1.1
mintrain = min(map(float, record['train'][title]))
minval = min(map(float, record['val'][title]))
ymin = min(mintrain, minval) * 0.9
x_1 = list(map(int, record['train']['iter']))
x_2 = list(map(int, record['val']['iter']))
figure(figsize=(10, 6))
plt.plot(x_1, record['train'][title], c='tab:red', label='train')
plt.plot(x_2, record['val'][title], c='tab:cyan', label='val')
plt.ylim(ymin, ymax)
plt.xlabel('Training steps')
plt.ylabel(ylabel)
plt.title('Learning curve of {}'.format(title))
plt.legend()
plt.show()
plot_learning_curve(loss_record, title='loss', ylabel='CE Loss')
(Figure: training/validation loss curves of ResNet18 with IEBN)
plot_learning_curve(acc_record, title='acc', ylabel='Accuracy')
(Figure: training/validation accuracy curves of ResNet18 with IEBN)
import time
work_path = 'work/model'
model = ResNet(BasicBlock, depth=18, num_classes=10)
model_state_dict = paddle.load(os.path.join(work_path, 'best_model.pdparams'))
model.set_state_dict(model_state_dict)
model.eval()
aa = time.time()
for batch_id, data in enumerate(val_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
with paddle.no_grad():
logits = model(x_data)
bb = time.time()
print("Throughout:{}".format(int(len(val_dataset)//(bb - aa))))
Throughout:2447
## 3. ResNet

### 3.1 ResNet
model = paddle.vision.models.resnet18(num_classes=10)
paddle.summary(model, (1, 3, 128, 128))
### 3.2 Training
learning_rate = 0.1
n_epochs = 100
paddle.seed(42)
np.random.seed(42)
work_path = 'work/model1'
model = paddle.vision.models.resnet18(num_classes=10)
criterion = LabelSmoothingCrossEntropy()
scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=learning_rate, milestones=[30, 60, 80], verbose=False)
optimizer = paddle.optimizer.Momentum(parameters=model.parameters(), learning_rate=scheduler, weight_decay=1e-5)
best_acc = 0.0
val_acc = 0.0
loss_record1 = {'train': {'loss': [], 'iter': []}, 'val': {'loss': [], 'iter': []}} # for recording loss
acc_record1 = {'train': {'acc': [], 'iter': []}, 'val': {'acc': [], 'iter': []}} # for recording accuracy
loss_iter = 0
acc_iter = 0
for epoch in range(n_epochs):
# ---------- Training ----------
model.train()
train_num = 0.0
train_loss = 0.0
val_num = 0.0
val_loss = 0.0
accuracy_manager = paddle.metric.Accuracy()
val_accuracy_manager = paddle.metric.Accuracy()
print("#===epoch: {}, lr={:.10f}===#".format(epoch, optimizer.get_lr()))
for batch_id, data in enumerate(train_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
logits = model(x_data)
loss = criterion(logits, y_data)
acc = paddle.metric.accuracy(logits, labels)
accuracy_manager.update(acc)
if batch_id % 10 == 0:
loss_record1['train']['loss'].append(loss.numpy())
loss_record1['train']['iter'].append(loss_iter)
loss_iter += 1
loss.backward()
optimizer.step()
optimizer.clear_grad()
train_loss += loss
train_num += len(y_data)
scheduler.step()
total_train_loss = (train_loss / train_num) * batch_size
train_acc = accuracy_manager.accumulate()
acc_record1['train']['acc'].append(train_acc)
acc_record1['train']['iter'].append(acc_iter)
acc_iter += 1
# Print the information.
print("#===epoch: {}, train loss is: {}, train acc is: {:2.2f}%===#".format(epoch, total_train_loss.numpy(), train_acc*100))
# ---------- Validation ----------
model.eval()
for batch_id, data in enumerate(val_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
with paddle.no_grad():
logits = model(x_data)
loss = criterion(logits, y_data)
acc = paddle.metric.accuracy(logits, labels)
val_accuracy_manager.update(acc)
val_loss += loss
val_num += len(y_data)
total_val_loss = (val_loss / val_num) * batch_size
loss_record1['val']['loss'].append(total_val_loss.numpy())
loss_record1['val']['iter'].append(loss_iter)
val_acc = val_accuracy_manager.accumulate()
acc_record1['val']['acc'].append(val_acc)
acc_record1['val']['iter'].append(acc_iter)
print("#===epoch: {}, val loss is: {}, val acc is: {:2.2f}%===#".format(epoch, total_val_loss.numpy(), val_acc*100))
# ===================save====================
if val_acc > best_acc:
best_acc = val_acc
paddle.save(model.state_dict(), os.path.join(work_path, 'best_model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(work_path, 'best_optimizer.pdopt'))
print(best_acc)
paddle.save(model.state_dict(), os.path.join(work_path, 'final_model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(work_path, 'final_optimizer.pdopt'))
### 3.3 Results
plot_learning_curve(loss_record1, title='loss', ylabel='CE Loss')
(Figure: training/validation loss curves of the plain ResNet18 baseline)
plot_learning_curve(acc_record1, title='acc', ylabel='Accuracy')
(Figure: training/validation accuracy curves of the plain ResNet18 baseline)
import time
work_path = 'work/model1'
model = paddle.vision.models.resnet18(num_classes=10)
model_state_dict = paddle.load(os.path.join(work_path, 'best_model.pdparams'))
model.set_state_dict(model_state_dict)
model.eval()
aa = time.time()
for batch_id, data in enumerate(val_loader):
x_data, y_data = data
labels = paddle.unsqueeze(y_data, axis=1)
with paddle.no_grad():
logits = model(x_data)
bb = time.time()
print("Throughout:{}".format(int(len(val_dataset)//(bb - aa))))
Throughout:2653
## 4. Comparison of Results

| Model | Train Acc | Val Acc | Params |
| -------- | -------- | -------- | -------- |
| ResNet18 w/ IEBN | 0.9040 | 0.9118 | 11200842 |
| ResNet18 w/o IEBN | 0.8718 | 0.8870 | 11191242 |
## Summary

Starting from the idea of regulating batch noise, this article reproduced IEBN, an attention-based batch normalization that adapts the noise level automatically. On CIFAR-10, ResNet18 with IEBN raises validation accuracy from 88.70% to 91.18% while adding only 9,600 parameters (under 0.1% overhead).
## References

- Paper: [Instance Enhancement Batch Normalization: an Adaptive Regulator of Batch Noise](https://arxiv.org/abs/1908.04008)
- Code: [gbup-group/IEBN](https://github.com/gbup-group/IEBN)