1. Object Tracking Framework
Having just started working on object tracking, I tried a relatively simple tracking method. The schematic is shown below:
Figure 1
Training: the inputs are the previous frame's target bounding box (bb for short), the previous frame's image, and the current frame's image; the label is the current frame's bb. Given these inputs, a sample four times the area of the bb (width and height each doubled) is cropped from both the previous and the current frame, centered on the bb, and each crop is resized to 127×127. The current frame's bb is transformed accordingly to obtain its coordinates in the resized image, as shown in Figure 2. The two resized images are then passed through the convolutional layers; the resulting features are flattened into one-dimensional vectors, concatenated, and fed into the fully connected layers, which are trained against the transformed current-frame bb as the label, as shown in Figure 1.
Inference: the inputs are the previous frame's bb, the previous frame's image, and the current frame's image; the output is the current frame's bb. Given these inputs, a sample four times the area of the bb (width and height each doubled) is cropped from both images, centered on the bb, and resized to 127×127. The two resized images are passed through the convolutional layers; the features are flattened, concatenated, and fed into the fully connected layers, which predict the bb in the resized crop. Finally, this prediction is mapped back into the coordinates of the original (pre-crop) image, giving the bb for the current frame, which in turn defines the crop region for the next frame. A worked sketch of this coordinate mapping follows Figure 2.
Figure 2
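For concreteness, here is a minimal sketch of the crop-and-resize coordinate transform described above. It mirrors the arithmetic in util.crop_resize further down; the numbers are made up purely for illustration.
# Sketch of the crop/resize mapping (illustrative numbers, not from the dataset).
# The crop region around the previous box [x, y, w, h] is
# [x - w/2, y - h/2, x + 1.5*w, y + 1.5*h]: twice the width and height, centered on the box.
prev_bb = [100, 80, 40, 20]              # previous frame's box [x, y, w, h]
x1_s = prev_bb[0] - prev_bb[2] / 2       # 80.0
y1_s = prev_bb[1] - prev_bb[3] / 2       # 70.0
x2_s = prev_bb[0] + prev_bb[2] * 1.5     # 160.0
y2_s = prev_bb[1] + prev_bb[3] * 1.5     # 110.0
w_s, h_s = x2_s - x1_s, y2_s - y1_s      # 80.0, 40.0
out_w, out_h = 127, 127                  # size of the resized crop
# Current frame's ground-truth box, converted to corners and mapped into the resized crop:
cur_bb = [104, 82, 40, 20]
corners = [cur_bb[0], cur_bb[1], cur_bb[0] + cur_bb[2], cur_bb[1] + cur_bb[3]]
resize_bb = [(corners[0] - x1_s) * out_w / w_s, (corners[1] - y1_s) * out_h / h_s,
             (corners[2] - x1_s) * out_w / w_s, (corners[3] - y1_s) * out_h / h_s]
print(resize_bb)  # [x1, y1, x2, y2] in the 127x127 crop; this is the regression target during training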
The code was adapted from the paper on the cloud drive below, modified for this particular task.
Cloud drive link: https://pan.baidu.com/s/1Ys9a3eqpLZ_N9zaMzlU7mA   Extraction code: gc6r
2. Dataset
Each video's labels are stored in a JSON file containing a dictionary {'exist': list1, 'gt_rect': list2}, with len(list1) == len(list2) == frame_num. list1[frame_id] is 0 or 1 (0 means the target is absent, 1 means the target is present in the frame); list2[frame_id] is either [x, y, w, h] (the target's position) or [] (the target is absent). The video data has been split into a sequence of PNG frames in frame order, one folder per video.
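As an illustration (the values here are invented), the label file for a three-frame video would have this shape:
# Illustrative contents of one video's IR_label.json (values made up for this example):
label = {
    "exist":   [1, 0, 1],                                     # 1 = target present, 0 = absent
    "gt_rect": [[362, 306, 37, 24], [], [360, 303, 37, 24]]   # [x, y, w, h], or [] when absent
}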
3. Splitting Videos into Frames (get_frame.py)
import cv2
import os
import glob

def save_img():
    video_base_path = r'C:\Users\youchao\Desktop\test'
    video_dirs = glob.glob(os.path.join(video_base_path, '*'))
    for video_dir in video_dirs:
        video_path = os.path.join(video_dir, 'IR.mp4')
        frames_path = os.path.join(video_dir, 'IR')
        if not os.path.exists(frames_path):
            os.makedirs(frames_path)
        print(video_dir)
        cap = cv2.VideoCapture(video_path)  # open the video file
        count = 0
        ret = cap.isOpened()
        while ret:  # read the video frame by frame
            count += 1
            ret, frame = cap.read()
            frame_path = os.path.join(frames_path, '{:0>6}.png'.format(count))
            if ret:
                cv2.imwrite(frame_path, frame)  # save the frame as a PNG named by its frame index, e.g. 000001.png
                cv2.waitKey(1)
            else:
                break
        cap.release()
        print('save_success')
        print(frames_path)

save_img()
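Assuming the layout the rest of the code expects, after running save_img() each video folder should look roughly like this (the video name is just an example taken from the test code below):
C:\Users\youchao\Desktop\test\
    20190925_101846_1_3\
        IR.mp4            # original video
        IR_label.json     # {'exist': ..., 'gt_rect': ...} labels
        IR\
            000001.png    # extracted frames, in frame order
            000002.png
            ...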
4. Training and Testing (mytest.py)
import glob
import os
import numpy as np
import json
import util
from torchvision import models
import torch
from PIL import Image
from torch.optim import lr_scheduler
from torchvision import transforms

def train(train_base_path):
    batch_size = 16
    n_epoch = 20
    all_video_annotation = util.get_patch_path_and_annotation(train_base_path)
    preprocessing_dataset = util.dataset(all_video_annotation=all_video_annotation)
    dataloader = torch.utils.data.DataLoader(dataset=preprocessing_dataset, batch_size=batch_size, shuffle=True)
    """ model"""
    model = util.SiamFC()
    print(model)
    model.train()
    if torch.cuda.is_available():
        model.cuda()
    loss_f = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    if not os.path.exists('./net'):
        os.makedirs('./net')
    """train"""
    for epoch in range(n_epoch):
        print('train.........')
        loss_batch = 0.0
        num = 0
        for i, sample in enumerate(dataloader, 1):
            # input
            x0 = sample['sample0']
            x1 = sample['sample1']
            y_gt = sample['bb']
            if torch.cuda.is_available():
                x0 = x0.cuda()
                x1 = x1.cuda()
                y_gt = y_gt.cuda()
            optimizer.zero_grad()
            # forward
            f_0 = model.feature(x0)
            f_1 = model.feature(x1)
            num0 = f_0.shape[0]
            num1 = f_1.shape[0]
            assert num0 == num1
            f_0 = f_0.view(num0, 9216)
            f_1 = f_1.view(num1, 9216)
            f = torch.cat((f_0, f_1), 1)
            # with torch.set_grad_enabled(True):
            y_pre = model.classifier(f)
            # loss and backward
            num += num0
            loss = loss_f(y_pre, y_gt)
            loss.backward()
            optimizer.step()
            loss_batch += loss.item() * num0
            if i % 20 == 0:
                print('sample_loss avg {}_batchs:{:.4f}'.format(i, loss_batch / num))
            loss_epoch = loss_batch
        # adjust lr in every epoch
        scheduler.step()
        print('sample_loss avg epoch:{:.4f}'.format(loss_epoch / len(preprocessing_dataset)))
        # save model
        if epoch == n_epoch - 1:
            torch.save(model, f'./net/uav_{epoch}.pth')
def test(test_base_path, model_path):
    # test_base_path = r'C:\Users\youchao\Desktop\test'
    all_video_annotation, video_name_list = util.get_test_path_and_annotation(test_base_path)
    results_path = './results/'
    if not os.path.exists(results_path):
        os.makedirs(results_path)
    model = torch.load(model_path)
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
    video_init = {'20190925_101846_1_3': [362, 306, 37, 24]}  # first-frame box for each test video
    for i, video_name in enumerate(video_name_list):
        video_annotation = all_video_annotation[i]  # list.shape=(frame,4)
        labels = [video_init[video_name]]
        file_path = os.path.join(results_path, f'{video_name}_IR.json')
        for frame, frame_annotation in enumerate(video_annotation, 0):
            image0_path = frame_annotation[0]
            image1_path = frame_annotation[2]
            image0 = Image.open(image0_path)
            image1 = Image.open(image1_path)
            image0_id = int(os.path.basename(image0_path).split('.png')[0])
            image0_bounding_box = labels[image0_id - 1]  # list:[x,y,w,h]
            output_size = (127, 127)
            sample0 = util.crop_resize(image0, image0_bounding_box, output_size=output_size)
            sample1 = util.crop_resize(image1, image0_bounding_box, output_size=output_size)
            x0 = sample0['resize_img'].unsqueeze(0)
            x1 = sample1['resize_img'].unsqueeze(0)
            if torch.cuda.is_available():
                x0 = x0.cuda()
                x1 = x1.cuda()
            # forward
            f_0 = model.feature(x0)
            f_1 = model.feature(x1)
            num0 = f_0.shape[0]
            num1 = f_1.shape[0]
            assert num0 == num1
            f_0 = f_0.view(num0, 9216)
            f_1 = f_1.view(num1, 9216)
            f = torch.cat((f_0, f_1), 1)
            resize_bb = model.classifier(f)  # [x1,y1,x2,y2] in the resized crop
            resize_bb = np.squeeze(resize_bb.data.cpu().numpy())
            # map the predicted box back to the coordinates of the original image
            w, h = image0.size
            x1_s = int(image0_bounding_box[0] - image0_bounding_box[2] / 2)
            y1_s = int(image0_bounding_box[1] - image0_bounding_box[3] / 2)
            x2_s = int(image0_bounding_box[0] + image0_bounding_box[2] * 1.5)
            y2_s = int(image0_bounding_box[1] + image0_bounding_box[3] * 1.5)
            x1_s = max(0, x1_s)
            y1_s = max(0, y1_s)
            x2_s = min(w, x2_s)
            y2_s = min(h, y2_s)
            h_s = y2_s - y1_s
            w_s = x2_s - x1_s
            scale = 2 * [w_s / output_size[1], h_s / output_size[0]]
            crop_bb = [float(a) * float(b) for a, b in zip(resize_bb, scale)]
            origin_coordinates = 2 * [x1_s, y1_s]  # top-left coordinate of the cropped sample
            bb = [a + b for a, b in zip(crop_bb, origin_coordinates)]
            image1_bounding_box = [bb[0], bb[1], bb[2] - bb[0], bb[3] - bb[1]]
            labels.append(image1_bounding_box)
        with open(file_path, "w") as f:
            json.dump(labels, f)
if __name__ == '__main__':
    state = "train"
    if state == "train":
        train_base_path = r'C:\Users\youchao\Desktop\test'
        train(train_base_path)
    if state == "test":
        test_base_path = r'C:\Users\youchao\Desktop\test'
        model_path = 'net/uav_19.pth'
        test(test_base_path, model_path)
5. Helper Functions and Classes (util.py)
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
import numpy as np
import torch
import torch.nn as nn
import json
import os
import glob

class dataset(Dataset):
    def __init__(self, all_video_annotation):
        self.all_video_annotation = all_video_annotation  # shape: (video_num * frame_num, 4)
    def __len__(self):
        return len(self.all_video_annotation)
    def __getitem__(self, item):
        image0_path = self.all_video_annotation[item][0]  # str
        image0 = Image.open(image0_path)
        image0_bounding_box = self.all_video_annotation[item][1]  # list:[x,y,w,h]
        image1_path = self.all_video_annotation[item][2]
        image1 = Image.open(image1_path)
        image1_bounding_box = self.all_video_annotation[item][3]
        sample0 = crop_resize(image0, image0_bounding_box)
        sample1 = crop_resize(image1, image0_bounding_box, image1_bounding_box)
        label = torch.Tensor(sample1['resize_bb'])
        sample = {'sample0': sample0['resize_img'], 'sample1': sample1['resize_img'], 'bb': label}
        return sample
def crop_resize(image, image0_bounding_box, image1_bounding_box=False, output_size=(127, 127)):  # output_size type: tuple (h, w)
    # coordinates of the cropped sample: twice the box width/height, centered on the box
    x1_s = int(image0_bounding_box[0] - image0_bounding_box[2] / 2)
    y1_s = int(image0_bounding_box[1] - image0_bounding_box[3] / 2)
    x2_s = int(image0_bounding_box[0] + image0_bounding_box[2] * 1.5)
    y2_s = int(image0_bounding_box[1] + image0_bounding_box[3] * 1.5)
    x1_s = max(0, x1_s)
    y1_s = max(0, y1_s)
    w, h = image.size
    x2_s = min(w, x2_s)
    y2_s = min(h, y2_s)
    h_s = y2_s - y1_s
    w_s = x2_s - x1_s
    # crop
    data = np.array(image).astype(np.uint8)
    data = data[y1_s:y2_s, x1_s:x2_s, :]
    # resize
    img = Image.fromarray(data)
    transform_resize = transforms.Resize(output_size, interpolation=Image.NEAREST)
    transform_to_tensor = transforms.ToTensor()
    img = transform_to_tensor(transform_resize(img))
    # bb of image1 after crop
    if isinstance(image1_bounding_box, list):
        bb = [image1_bounding_box[0], image1_bounding_box[1],
              image1_bounding_box[0] + image1_bounding_box[2], image1_bounding_box[1] + image1_bounding_box[3]]
        origin_coordinates = 2 * [x1_s, y1_s]  # top-left coordinate of the cropped sample
        crop_bb = [a - b for a, b in zip(bb, origin_coordinates)]
        scale = 2 * [output_size[1] / w_s, output_size[0] / h_s]
        resize_bb = [float(a) * float(b) for a, b in zip(crop_bb, scale)]
    else:
        resize_bb = []
    sample = {'resize_img': img, 'resize_bb': resize_bb}  # resize_bb: [x1,y1,x2,y2]
    return sample
class SiamFC(nn.Module):
    def __init__(self):
        super(SiamFC, self).__init__()
        self.feature = nn.Sequential(
            # conv1
            nn.Conv2d(3, 96, 11, 2),
            nn.BatchNorm2d(96, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            # conv2
            nn.Conv2d(96, 256, 5, 1, groups=2),
            nn.BatchNorm2d(256, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(3, 2),
            # conv3
            nn.Conv2d(256, 384, 3, 1),
            nn.BatchNorm2d(384, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            # conv4
            nn.Conv2d(384, 384, 3, 1, groups=2),
            nn.BatchNorm2d(384, eps=1e-6, momentum=0.05),
            nn.ReLU(inplace=True),
            # conv5
            nn.Conv2d(384, 256, 3, 1, groups=2))
        self.classifier = nn.Sequential(
            torch.nn.Linear(in_features=18432, out_features=4096, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=4096, out_features=4096, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=4096, out_features=1000, bias=True),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(p=0.5, inplace=False),
            torch.nn.Linear(in_features=1000, out_features=4, bias=True))
def get_patch_path_and_annotation(train_base_path):
    # base_path = r'C:\Users\youchao\Desktop\test'
    video_paths = sorted(glob.glob(os.path.join(train_base_path, '*')))
    all_video_annotation = []  # shape: (video_num * frame_num, 4)
    for video_path in video_paths:
        video_name = os.path.basename(video_path)
        annotation_path = os.path.join(train_base_path, video_name, 'IR_label.json')
        patch_paths = sorted(glob.glob(os.path.join(train_base_path, video_name, 'IR', '*.png')))
        with open(annotation_path, 'r') as file:
            annotation = json.load(file)  # {'exist', 'gt_rect'}
        video_annotation = []  # shape: (frame_num, 2)
        assert len(annotation['exist']) == len(patch_paths)
        for frame_num in range(len(annotation['exist'])):
            object_exit = annotation['exist'][frame_num]
            if object_exit == 0:
                continue
            patch_path = patch_paths[frame_num]
            coordinate = annotation['gt_rect'][frame_num]
            patch_annotation = [patch_path, coordinate]
            video_annotation.append(patch_annotation)
        for index in range(len(video_annotation) - 1):
            current_frame = video_annotation[index]
            next_frame = video_annotation[index + 1]
            current_next = current_frame + next_frame  # list:[c_path,[x,y,w,h],n_path,[x,y,w,h]]
            all_video_annotation.append(current_next)
    return all_video_annotation
def get_test_path_and_annotation(test_base_path):
    # base_path = r'C:\Users\youchao\Desktop\test'
    video_paths = sorted(glob.glob(os.path.join(test_base_path, '*')))
    all_video_annotation = []  # shape: (video_num, frame_num, 4)
    video_name_list = []
    for video_path in video_paths:
        video_name = os.path.basename(video_path)
        video_name_list.append(video_name)
        annotation_path = os.path.join(test_base_path, video_name, 'IR_label.json')
        patch_paths = sorted(glob.glob(os.path.join(test_base_path, video_name, 'IR', '*.png')))
        with open(annotation_path, 'r') as file:
            annotation = json.load(file)  # {'exist', 'gt_rect'}
        video_annotation = []  # shape: (frame_num, 2)
        test_video_annotation = []
        assert len(annotation['exist']) == len(patch_paths)
        for frame_num in range(len(annotation['exist'])):
            patch_exit = annotation['exist'][frame_num]
            patch_path = patch_paths[frame_num]
            patch_annotation = [patch_path, patch_exit]
            video_annotation.append(patch_annotation)
        for index in range(len(video_annotation) - 1):
            current_frame = video_annotation[index]
            index_current = index
            while current_frame[1] == 0:  # step back to the most recent frame in which the target exists
                index_current -= 1
                current_frame = video_annotation[index_current]
            next_frame = video_annotation[index + 1]
            current_next = current_frame + next_frame  # list:[c_path,1,n_path,exist]
            test_video_annotation.append(current_next)
        all_video_annotation.append(test_video_annotation)
    return all_video_annotation, video_name_list  # list: [video_num, frame_num, 4]
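As a quick sanity check on the dimensions hard-coded in the scripts above (the convolutional stack maps a 3×127×127 crop to a 256×6×6 feature map, so each image contributes 256*6*6 = 9216 features and the concatenated pair gives the 18432 inputs expected by the classifier), something like the following snippet can be run; it only assumes util.py is importable:
# Minimal dimension check for the SiamFC feature extractor defined in util.py.
import torch
from util import SiamFC

model = SiamFC()
model.eval()
x = torch.zeros(1, 3, 127, 127)   # one dummy 127x127 crop
f = model.feature(x)
print(f.shape)                    # expected: torch.Size([1, 256, 6, 6]) -> 256*6*6 = 9216 per image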
6. Remaining Problems
In principle this framework is appealing: once the model is trained, feeding in two consecutive frames should regress the target box directly. In practice, errors accumulate. Starting from the first frame, the predicted bounding box (bb) for the second frame is already off by a few pixels, so when the image is cropped around that second-frame bb, the content of the crop shifts away from the true target; the shift compounds frame after frame until it is no longer clear what the target is. During the experiments, this accumulated error eventually drove the width or height of the predicted bb = [x, y, w, h] down to a value between 0 and 1, i.e. less than one pixel. Once the value falls below 0.5, the int(w) used when cropping truncates it to 0, the crop becomes empty, and the code crashes with an indexing error.
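One possible stopgap, not implemented in the code above, is to clamp the predicted width and height to a minimum size before the box is used to crop the next frame; this does not remove the drift, it only prevents the crash. A minimal sketch, with an arbitrarily chosen minimum:
# Hypothetical guard (not part of the original code): keep the predicted box at least
# min_size pixels wide and tall so that the next crop is never empty.
def clamp_bb(bb, min_size=2):
    x, y, w, h = bb
    return [x, y, max(w, min_size), max(h, min_size)]

# e.g. in test(), before labels.append(image1_bounding_box):
# labels.append(clamp_bb(image1_bounding_box))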