The first step in training a model is loading the data. In PyTorch this takes two steps: build a Dataset and wrap it in a DataLoader iterator. To load data for SSD training, you can call torchvision.datasets.VOCDetection or torchvision.datasets.CocoDetection directly; all that is required is to lay out the data as those classes expect.
Here, in order to write the subsequent image-augmentation operations myself, I define a custom Dataset class for reading the data. The custom class inherits from torch.utils.data.dataset.Dataset and must implement __init__(self, ...) and __getitem__(self, index); by convention __len__(self) is overridden as well.
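A minimal sketch of that contract (a toy example, not the detection dataset itself):
from torch.utils.data import Dataset

class ToyDataset(Dataset):
    def __init__(self, records):
        self.records = records      ## any indexable collection
    def __getitem__(self, index):
        return self.records[index]  ## one sample per index
    def __len__(self):
        return len(self.records)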
Below is my rewritten Dataset class. For convenient loading, a txt file stores each image (jpg format) together with its corresponding annotation file (xml format), one pair of paths per line.
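A minimal sketch for generating this txt file, assuming (hypothetically) that each jpg and its xml share a directory and a filename stem:
import os

data_dir = '/home/cat/data'          ## hypothetical dataset directory
record_path = '/home/cat/train.txt'

with open(record_path, 'w') as fp:
    for name in sorted(os.listdir(data_dir)):
        if not name.endswith('.jpg'):
            continue
        img_path = os.path.join(data_dir, name)
        xml_path = os.path.join(data_dir, name.replace('.jpg', '.xml'))
        if os.path.exists(xml_path):
            ## one sample per line: "<img_path> <xml_path>"
            fp.write(img_path + ' ' + xml_path + '\n')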
Here is the rewritten Dataset class. Since the xml annotation files need parsing, a load_xml function is defined to read them. And because data loading is also needed at test time, an is_train flag selects the loading mode: for training data, image and annotation label are loaded together; otherwise only the image is loaded, for testing.
import re
import torch
from PIL import Image
import cv2
import numpy as np
from torch.utils.data import dataset
from torchvision import transforms
## imgdeal and datadeal are this project's own helper modules: imgdeal holds
## the augmentation code shown later in this post, datadeal the default-box
## assignment used in pull_item
import imgdeal
import datadeal
## parse a VOC-style xml annotation file
def load_xml(filepath):
    ## regex patterns for the bounding-box block and the class name
    pattern1 = re.compile('<bndbox>')
    pattern2 = re.compile('<name>(.*)</name>')
    with open(filepath) as fi:
        content = fi.readlines()
    tag_list = []
    bndbox_list = []
    ## walk the file with a 5-line window: <bndbox> is followed by
    ## xmin / ymin / xmax / ymax on the next four lines
    for line, line1, line2, line3, line4 in zip(content, content[1:], content[2:], content[3:], content[4:]):
        res = pattern2.search(line)
        if res is not None:
            ## class name
            tag_list.append(re.findall(pattern2, line))
        ## bounding-box coordinates
        res = pattern1.search(line)
        if res is not None:
            xmin = int(re.findall(r'\d+', line1)[0])
            ymin = int(re.findall(r'\d+', line2)[0])
            xmax = int(re.findall(r'\d+', line3)[0])
            ymax = int(re.findall(r'\d+', line4)[0])
            bndbox_list.append([xmin, ymin, xmax, ymax])
    return bndbox_list, tag_list
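A quick sanity check of the parser on a hand-written VOC-style snippet (the file content here is made up for illustration):
sample = '''<annotation>
    <object>
        <name>cat1</name>
        <bndbox>
            <xmin>48</xmin>
            <ymin>240</ymin>
            <xmax>195</xmax>
            <ymax>371</ymax>
        </bndbox>
    </object>
</annotation>'''
with open('sample.xml', 'w') as f:
    f.write(sample)
print(load_xml('sample.xml'))  ## ([[48, 240, 195, 371]], [['cat1']])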
## the rewritten Dataset class for loading SSD data
class DetectionDataset(dataset.Dataset):
    ## record_path is the txt file generated above; is_train flags whether we
    ## load training data (image + label) or test data (image only); Classes
    ## is the label list (note that background must be its first entry); the
    ## remaining arguments control data augmentation and can be ignored for now
    def __init__(self, record_path, img_shape, Classes=[], is_mixup=False, is_mosaic=False, is_train=True, backbone_name='resnet50'):
        self.data = []
        self.img_shape = img_shape
        self.Classes = Classes
        self.is_train = is_train
        self.backbone_name = backbone_name
        ## use only one of mixup / mosaic; mosaic takes precedence
        if is_mosaic:
            self.mix_num = 4
            self.mosaic = imgdeal.MosaicDeal()
        elif is_mixup:
            self.mix_num = 2
        else:
            self.mix_num = 1
        ## training data: read image path and annotation path together
        if self.is_train:
            with open(record_path) as fp:
                for line in fp.readlines():
                    if line == '\n':
                        break
                    else:
                        tmp = line.strip('\n').split(' ')
                        ## tmp[0]: image path, tmp[1]: annotation path
                        self.data.append([tmp[0], tmp[1]])
        else:
            with open(record_path) as fp:
                for line in fp.readlines():
                    if line == '\n':
                        break
                    else:
                        tmp = line.strip('\n').split(' ')
                        ## tmp[0]: image path
                        self.data.append([tmp[0]])
        self.transformations = transforms.Compose([transforms.ToTensor()])
    # get the dataset size
    def __len__(self):
        return len(self.data)
    # fetch one sample
    def __getitem__(self, index):
        ## test mode: return the image only
        if not self.is_train:
            im = self.pull_image(index)
            return im
        ## training mode: return image and label together
        im, gt = self.pull_item(index)
        return im, gt
    ## load a training image and its annotations
    def pull_item(self, index):
        out_img = np.zeros([self.img_shape, self.img_shape, 3])
        out_target = []
        ## random mosaic cut point, kept away from the image borders
        min_offset = 0.2
        cut_x = np.random.randint(int(self.img_shape * min_offset), int(self.img_shape * (1 - min_offset)))
        cut_y = np.random.randint(int(self.img_shape * min_offset), int(self.img_shape * (1 - min_offset)))
        for i in range(self.mix_num):
            if i == 0:
                img = Image.open(self.data[index][0]).convert('RGB')
                bndbox, label = load_xml(self.data[index][1])
            else:
                ## mixup / mosaic pull in extra random samples
                random_index = np.random.randint(0, len(self.data))
                img = Image.open(self.data[random_index][0]).convert('RGB')
                bndbox, label = load_xml(self.data[random_index][1])
            ## map each tag to its index in the class list
            label_index = []
            for tag in label:
                label_index.append([self.Classes.index(tag[0])])
            ## augment the image (defined later in this post)
            img, bndbox, label_index = imgdeal.data_augmentation(img, bndbox, label_index, self.img_shape)
            ## merge bounding boxes and tags
            target = np.hstack((bndbox, label_index))
            if self.mix_num == 2:
                ## mixup: blend two images with equal weights
                if i == 0:
                    old_img = img.copy()
                    old_truth = target.copy()
                else:
                    out_img = cv2.addWeighted(img, 0.5, old_img, 0.5, 0)
                    out_target = np.concatenate([old_truth, target], axis=0)
            elif self.mix_num == 4:
                ## mosaic: paste one randomly shifted crop per quadrant
                oh, ow, oc = img.shape
                dh, dw, dc = np.array(np.array([oh, ow, oc]) * 0.3, dtype=int)
                pleft = np.random.randint(-dw, dw)
                pright = np.random.randint(-dw, dw)
                ptop = np.random.randint(-dh, dh)
                pbot = np.random.randint(-dh, dh)
                swidth = ow - pleft - pright
                sheight = oh - ptop - pbot
                left_shift = int(min(cut_x, max(0, (-int(pleft) * self.img_shape / swidth))))
                top_shift = int(min(cut_y, max(0, (-int(ptop) * self.img_shape / sheight))))
                right_shift = int(min((self.img_shape - cut_x), max(0, (-int(pright) * self.img_shape / swidth))))
                bot_shift = int(min(self.img_shape - cut_y, max(0, (-int(pbot) * self.img_shape / sheight))))
                out_img, target = self.mosaic(out_img, img.copy(), target.copy(), self.img_shape, self.img_shape,
                                              cut_x, cut_y, i, left_shift, right_shift, top_shift, bot_shift)
                if i == 0:
                    out_target = target
                else:
                    ## a crop may contain no boxes at all, so guard the concat
                    if len(out_target) == 0:
                        out_target = target
                    elif len(target) == 0:
                        pass
                    else:
                        out_target = np.concatenate([out_target, target], axis=0)
            else:
                out_img = img.copy()
                out_target = target.copy()
        ## assign the ground truth to the SSD default boxes
        pre_deal = datadeal.AssignGTtoDefaultBox(image_size=self.img_shape, backbone_name=self.backbone_name)
        out_img, out_target = pre_deal(out_img, out_target)
        return out_img, out_target
    ## load a test image
    def pull_image(self, index):
        img = Image.open(self.data[index][0]).resize((self.img_shape, self.img_shape)).convert('RGB')
        out_img = np.array(img) / 255
        return out_img
Once the training data is wrapped in a Dataset, a DataLoader iterator is needed to load it in batches for training. The DataLoader parameters are easy to look up, but one point is worth stressing: because the rewritten Dataset returns image and label together for training data, the collate_fn parameter must be given a function that defines how samples are batched.
from torch.utils.data import DataLoader
## how the DataLoader batches samples when loading training data
def detection_collate(batch):
    targets = []
    imgs = []
    for sample in batch:
        imgs.append(sample[0])
        targets.append(torch.FloatTensor(sample[1]))
    return imgs, targets
## how the DataLoader batches samples when loading test data
def detection_collate_test(batch):
    imgs = []
    for sample in batch:
        imgs.append(sample)
    return imgs
data_path = '/home/cat/train.txt'
Classes_type = ['__background__', 'cat1', 'cat2']
img_size = 300
batch_size = 36
num_workers = 10
backbone_name = 'resnet50'  ## used when converting labels to default boxes after augmentation
dataset = DetectionDataset(data_path, img_size, Classes=Classes_type, backbone_name=backbone_name)
data_loader = DataLoader(dataset, batch_size=batch_size, num_workers=num_workers,
                         shuffle=True, drop_last=True, collate_fn=detection_collate, pin_memory=True)
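A quick smoke test of the iterator (what each element looks like depends on what AssignGTtoDefaultBox returns):
for imgs, targets in data_loader:
    print(len(imgs), len(targets))  ## both equal batch_size
    break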
With that, data loading is complete.
Although the data is now wrapped in a DataLoader, one very important part of the Dataset step has not yet been covered: data augmentation. Augmentation matters a great deal for model training; a good augmentation scheme can substantially improve the result (in my view because it effectively enlarges the number of training samples).
The most basic augmentations are horizontal/vertical flips, rotation, scaling, cropping, shearing, translation, and contrast changes. The implementation below, using OpenCV, follows augmentation code I found online (apologies, I have forgotten the source; I will add it if I find it again).
The cutout operation masks out a block of the image and fills it with black. Since mixup and Mosaic are introduced later, I felt that applying all three at once might hurt my particular training data (two of my classes look identical if cropped badly), so only one of the three is applied at a time. Also note that any shift or crop operation must be followed by a correction of the bounding boxes.
import numpy as np
import random
from numpy import random as rd
import warnings
import math
import cv2
import torch
## color-space conversion
class ConvertColor(object):
    def __init__(self, current='BGR', transform='HSV'):
        self.transform = transform
        self.current = current
    def __call__(self, image, boxes=None, labels=None):
        if self.current == 'BGR' and self.transform == 'HSV':
            image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        elif self.current == 'HSV' and self.transform == 'BGR':
            image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
        else:
            raise NotImplementedError
        return image, boxes, labels
## random saturation change (expects an HSV image)
class RandomSaturation(object):
    def __init__(self, lower=0.5, upper=1.5, p=0.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "saturation upper must be >= lower."
        assert self.lower >= 0, "saturation lower must be non-negative."
        self.p = p
    def __call__(self, image, boxes=None, labels=None):
        if random.random() > self.p:
            image[:, :, 1] *= random.uniform(self.lower, self.upper)
        return image, boxes, labels
## random contrast change
class RandomContrast(object):
    def __init__(self, lower=0.5, upper=1.5, p=0.5):
        self.lower = lower
        self.upper = upper
        assert self.upper >= self.lower, "contrast upper must be >= lower."
        assert self.lower >= 0, "contrast lower must be non-negative."
        self.p = p
    # expects a uint8 image
    def __call__(self, image, boxes=None, labels=None):
        if random.random() > self.p:
            alpha = random.uniform(self.lower, self.upper)
            image = image.astype(np.float32)
            image *= alpha
            ## clip before casting back so values above 255 do not wrap
            image = np.clip(image, 0, 255).astype(np.uint8)
        return image, boxes, labels
## random brightness change
class RandomBrightness(object):
    def __init__(self, delta=32, p=0.5):
        assert delta >= 0.0
        assert delta <= 255.0
        self.delta = delta
        self.p = p
    def __call__(self, image, boxes=None, labels=None):
        if random.random() > self.p:
            delta = random.randint(-self.delta, self.delta)
            ## cast before adding so uint8 arithmetic cannot wrap around
            image = np.clip(image.astype(np.int32) + delta, 0, 255).astype(np.uint8)
        return image, boxes, labels
## random hue change (expects an HSV image with H in [0, 360])
class RandomHue(object):
    def __init__(self, delta=18.0, p=0.5):
        assert delta >= 0.0 and delta <= 360.0
        self.delta = delta
        self.p = p
    def __call__(self, image, boxes=None, labels=None):
        if random.random() > self.p:
            image[:, :, 0] += random.uniform(-self.delta, self.delta)
            image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
            image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
        return image, boxes, labels
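The hue and saturation ops expect a float HSV image; they are defined here but not wired into data_augmentation below. A hypothetical way to chain them:
img = cv2.imread('sample.jpg').astype(np.float32) / 255  ## BGR in [0, 1]
img, _, _ = ConvertColor(current='BGR', transform='HSV')(img)  ## H in [0, 360]
img, _, _ = RandomHue()(img)
img, _, _ = RandomSaturation()(img)
img, _, _ = ConvertColor(current='HSV', transform='BGR')(img)
img = np.clip(img * 255, 0, 255).astype(np.uint8)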
## normalize the input image and convert boxes to relative coordinates
class NormalizeImg(object):
    def __init__(self, mean=0, std=255):
        self.mean = mean
        self.std = std
    def __call__(self, img, bndbox):
        img = (img - self.mean) / self.std
        ## boxes become fractions of the image size (the image is square here)
        bndbox = (np.round(np.array(bndbox) / img.shape[0], 3)).tolist()
        return img, bndbox
## convert the data to tensor format (HWC -> CHW)
class ToTensor(object):
    def __call__(self, cvimage, boxes=None, labels=None):
        return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels
## resize the image and rescale the bounding boxes accordingly
class AllResize(object):
    def __init__(self, size=300):
        self.size = size
    def __call__(self, image, boxes, labels=None):
        image_h, image_w, image_c = image.shape
        image_h_ratio = self.size / image_h
        image_w_ratio = self.size / image_w
        image = cv2.resize(image, (self.size, self.size))
        for box in boxes:
            box[0] = int(box[0] * image_w_ratio)
            box[1] = int(box[1] * image_h_ratio)
            box[2] = int(box[2] * image_w_ratio)
            box[3] = int(box[3] * image_h_ratio)
        return image, boxes, labels
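A quick check of the box rescaling (numbers made up): resizing a 400×600 image to 300×300 scales x by 0.5 and y by 0.75.
img = np.zeros((400, 600, 3), dtype=np.uint8)  ## h=400, w=600
boxes = [[100, 200, 300, 400]]                 ## [xmin, ymin, xmax, ymax]
img, boxes, _ = AllResize(size=300)(img, boxes)
print(boxes)  ## [[50, 150, 150, 300]]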
## randomly mask (occlude) one region of the image
class Cutout(object):
    def __init__(self, scale=(0.02, 0.4), ratio=(0.4, 1 / 0.4),
                 value=(0, 255), pixel_level=False, inplace=False):
        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            warnings.warn("range should be of kind (min, max)")
        if scale[0] < 0 or scale[1] > 1:
            raise ValueError("range of scale should be between 0 and 1")
        self.scale = scale
        self.ratio = ratio
        self.value = value
        self.pixel_level = pixel_level
        self.inplace = inplace
    ## draw a random patch position and size from scale (area) and ratio
    def get_params(self, img, scale, ratio):
        img = np.array(img)
        img_h, img_w, img_c = img.shape
        s = random.uniform(*scale)
        r = random.uniform(*ratio)
        s = s * img_h * img_w
        w = int(math.sqrt(s / r))
        h = int(math.sqrt(s * r))
        left = random.randint(0, img_w - w)
        top = random.randint(0, img_h - h)
        return left, top, h, w, img_c
    def cutout(self, img, i, j, h, w, v, inplace=False):
        if not inplace:
            img = img.copy()
        img[i:i + h, j:j + w, :] = v
        return img
    def __call__(self, img):
        left, top, h, w, ch = self.get_params(img, self.scale, self.ratio)
        if self.pixel_level:
            ## per-pixel random fill values
            c = np.random.randint(*self.value, size=(h, w, ch))
        else:
            ## one solid fill value
            c = random.randint(*self.value)
        return self.cutout(img, top, left, h, w, c, self.inplace)
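Hypothetical usage of Cutout, masking a small random patch with one solid value per call:
img = cv2.imread('sample.jpg')
img_cut = Cutout(scale=(0.02, 0.1))(img)  ## returns a copy with one region filled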
## the data-augmentation entry point
def data_augmentation(img, bndbox, label, img_shape, h_flip_p=0.5, v_flip_p=0.5, crop_p=0.6):
    try:
        # change img to numpy
        img = np.array(img)
        # the transform ops
        ResizeImage = AllResize(size=img_shape)
        BGR2HSV = ConvertColor(transform='HSV')
        HSV2BGR = ConvertColor(current='HSV', transform='BGR')
        ChangeHue = RandomHue()
        ChangeContrast = RandomContrast()
        ChangeSaturation = RandomSaturation()
        ChangeBrightness = RandomBrightness()
        # ChangeTensor = ToTensor()
        # CropImage = RandomSampleCrop()  ## defined elsewhere; only needed by the crop block below
        NormalizeImage = NormalizeImg()
        # photometric changes
        img, bndbox, _ = ResizeImage(img, bndbox)
        img, _, _ = ChangeBrightness(img)
        img, _, _ = ChangeContrast(img)
        # flips
        oh, ow, _ = img.shape
        if random.random() > (1 - h_flip_p):
            img = cv2.flip(img, 1)
            for box in bndbox:
                ## correct the box coordinates after the horizontal flip
                tmp1 = box[0]
                tmp2 = box[2]
                box[0] = ow - tmp2
                box[2] = ow - tmp1
        if random.random() > (1 - v_flip_p):
            img = cv2.flip(img, 0)
            for box in bndbox:
                ## correct the box coordinates after the vertical flip
                tmp1 = box[1]
                tmp2 = box[3]
                box[1] = oh - tmp2
                box[3] = oh - tmp1
        # # crop the image
        # bndbox = np.asarray(bndbox)
        # label = np.asarray(label)
        # if random.random() > (1 - crop_p):
        #     img, bndbox, label = CropImage(img, bndbox, label)
        #     img = img.astype(np.uint8)
        #     img, bndbox, _ = ResizeImage(img, bndbox)
        # normalize image
        img, bndbox = NormalizeImage(img, bndbox)
    except Exception:
        warnings.warn("OpenCV can't augment image!")
    return img, bndbox, label
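A hypothetical call to this entry point (sample.jpg and the box values are made up):
from PIL import Image

img = Image.open('sample.jpg').convert('RGB')
bndbox = [[48, 240, 195, 371]]   ## [xmin, ymin, xmax, ymax] in pixels
label = [[1]]                    ## class indices
img, bndbox, label = data_augmentation(img, bndbox, label, 300)
## img is now a normalized 300x300 array and bndbox is in relative coordinates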
mixup and Mosaic were mentioned earlier but were not put into the augmentation entry point above. The reason is that these two operations do not act on a single image: besides the current image, they randomly read in additional images and merge them. So I placed both of them in the pull_item function called from the Dataset's __getitem__ shown earlier.
mixup overlays one image onto another as a weighted sum; the weights are usually set to 0.5. The code and its effect are as follows:
for i in range(self.mix_num):
    if i == 0:
        img = Image.open(self.data[index][0]).convert('RGB')
        bndbox, label = load_xml(self.data[index][1])
    else:
        random_index = np.random.randint(0, len(self.data))
        img = Image.open(self.data[random_index][0]).convert('RGB')
        bndbox, label = load_xml(self.data[random_index][1])
    ## ... augmentation and target assembly as in pull_item above ...
    if self.mix_num == 2:
        if i == 0:
            old_img = img.copy()
            old_truth = target.copy()
        else:
            out_img = cv2.addWeighted(img, 0.5, old_img, 0.5, 0)
            out_target = np.concatenate([old_truth, target], axis=0)
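For reference, the original mixup paper draws the weight from a Beta distribution instead of fixing it at 0.5; a sketch of that variant (the alpha value here is an assumption):
lam = np.random.beta(0.5, 0.5)  ## hypothetical alpha; smaller alpha pushes lam toward 0 or 1
out_img = cv2.addWeighted(img, lam, old_img, 1 - lam, 0)
out_target = np.concatenate([old_truth, target], axis=0)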
Mosaic randomly crops four images and stitches them into a single image of the same size as the originals. My intuition for why it helps: one sample now effectively teaches four targets, so the model sees more content in the same number of iterations, and the random cropping adds input variety with an occlusion effect similar to cutout, which is why it can improve training. The code and its effect are as follows:
## mosaic
class MosaicDeal(object):
    def __init__(self):
        self.is_use = True
    ## shift the boxes into the crop's coordinates, clip them, drop boxes
    ## that fall entirely outside the crop, then shift the survivors to
    ## their destination quadrant
    def filter_truth(self, bboxes, dx, dy, sx, sy, xd, yd):
        bboxes[:, 0] -= dx
        bboxes[:, 2] -= dx
        bboxes[:, 1] -= dy
        bboxes[:, 3] -= dy
        bboxes[:, 0] = np.clip(bboxes[:, 0], 0, sx)
        bboxes[:, 2] = np.clip(bboxes[:, 2], 0, sx)
        bboxes[:, 1] = np.clip(bboxes[:, 1], 0, sy)
        bboxes[:, 3] = np.clip(bboxes[:, 3], 0, sy)
        out_box = list(np.where(((bboxes[:, 1] == sy) & (bboxes[:, 3] == sy)) |
                                ((bboxes[:, 0] == sx) & (bboxes[:, 2] == sx)) |
                                ((bboxes[:, 1] == 0) & (bboxes[:, 3] == 0)) |
                                ((bboxes[:, 0] == 0) & (bboxes[:, 2] == 0)))[0])
        list_box = list(range(bboxes.shape[0]))
        for i in out_box:
            list_box.remove(i)
        bboxes = bboxes[list_box]
        bboxes[:, 0] += xd
        bboxes[:, 2] += xd
        bboxes[:, 1] += yd
        bboxes[:, 3] += yd
        return bboxes
    def __call__(self, out_img, img, bboxes, w, h, cut_x, cut_y, i_mixup,
                 left_shift, right_shift, top_shift, bot_shift):
        left_shift = min(left_shift, w - cut_x)
        top_shift = min(top_shift, h - cut_y)
        right_shift = min(right_shift, cut_x)
        bot_shift = min(bot_shift, cut_y)
        ## one quadrant per call: 0 = top-left, 1 = top-right,
        ## 2 = bottom-left, 3 = bottom-right; bboxes may be empty,
        ## so filter_truth is guarded in every branch
        if i_mixup == 0:
            if len(bboxes) != 0:
                bboxes = self.filter_truth(bboxes, left_shift, top_shift, cut_x, cut_y, 0, 0)
            out_img[:cut_y, :cut_x] = img[top_shift:top_shift + cut_y, left_shift:left_shift + cut_x]
        if i_mixup == 1:
            if len(bboxes) != 0:
                bboxes = self.filter_truth(bboxes, cut_x - right_shift, top_shift, w - cut_x, cut_y, cut_x, 0)
            out_img[:cut_y, cut_x:] = img[top_shift:top_shift + cut_y, cut_x - right_shift:w - right_shift]
        if i_mixup == 2:
            if len(bboxes) != 0:
                bboxes = self.filter_truth(bboxes, left_shift, cut_y - bot_shift, cut_x, h - cut_y, 0, cut_y)
            out_img[cut_y:, :cut_x] = img[cut_y - bot_shift:h - bot_shift, left_shift:left_shift + cut_x]
        if i_mixup == 3:
            if len(bboxes) != 0:
                bboxes = self.filter_truth(bboxes, cut_x - right_shift, cut_y - bot_shift, w - cut_x, h - cut_y,
                                           cut_x, cut_y)
            out_img[cut_y:, cut_x:] = img[cut_y - bot_shift:h - bot_shift, cut_x - right_shift:w - right_shift]
        return out_img, bboxes
## in DetectionDataset.__init__:
self.mosaic = imgdeal.MosaicDeal()
## in DetectionDataset.pull_item:
for i in range(self.mix_num):
    if i == 0:
        img = Image.open(self.data[index][0]).convert('RGB')
        bndbox, label = load_xml(self.data[index][1])
    else:
        random_index = np.random.randint(0, len(self.data))
        img = Image.open(self.data[random_index][0]).convert('RGB')
        bndbox, label = load_xml(self.data[random_index][1])
    if self.mix_num == 2:
        ## the mixup branch shown above, omitted here
        ....
    elif self.mix_num == 4:
        oh, ow, oc = img.shape
        dh, dw, dc = np.array(np.array([oh, ow, oc]) * 0.3, dtype=int)
        pleft = np.random.randint(-dw, dw)
        pright = np.random.randint(-dw, dw)
        ptop = np.random.randint(-dh, dh)
        pbot = np.random.randint(-dh, dh)
        swidth = ow - pleft - pright
        sheight = oh - ptop - pbot
        ## clamp the random shifts so the crops stay inside the source image
        left_shift = int(min(cut_x, max(0, (-int(pleft) * self.img_shape / swidth))))
        top_shift = int(min(cut_y, max(0, (-int(ptop) * self.img_shape / sheight))))
        right_shift = int(min((self.img_shape - cut_x), max(0, (-int(pright) * self.img_shape / swidth))))
        bot_shift = int(min(self.img_shape - cut_y, max(0, (-int(pbot) * self.img_shape / sheight))))
        out_img, target = self.mosaic(out_img, img.copy(), target.copy(), self.img_shape, self.img_shape,
                                      cut_x, cut_y, i, left_shift, right_shift, top_shift, bot_shift)
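A minimal smoke test of MosaicDeal on dummy data (all values hypothetical), pasting one crop into the top-left quadrant:
mosaic = MosaicDeal()
out_img = np.zeros([300, 300, 3])
img = np.random.rand(300, 300, 3)
boxes = np.array([[50., 60., 120., 140., 1.]])  ## [xmin, ymin, xmax, ymax, class]
out_img, boxes = mosaic(out_img, img, boxes, 300, 300, 150, 150, 0,
                        10, 10, 10, 10)
print(boxes)  ## [[ 40.  50. 110. 130.   1.]]: shifted into the crop frame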