2016年提出的Faster RCNN目标检测模型是深度学习现代目标检测算法的开山之作,也是第一个真正全流程都是神经网络的目标检测模型。
其主要步骤如下:
1,使用CNN对输入图片提取feature map.
2,对feature map上的每个点设计一套不同大小和长宽比的anchor作为先验框。
3,设计RPN网络从大量的anchor中筛选出一些作为目标框的proposals并用回归分支纠正它们的位置。
4,使用ROI Pooling技术对不同大小的proposals获取相同大小的对应特征图,以便后续分类模型一并处理。
5,在proposals的feature map上使用分类分支和回归分支进一步预测目标类别和更精确的定位。
anchor技巧和ROI Pooling技术是非常值得学习的技巧,在许多目标检测模型中都能看到它们的身影。
尽管FasterRCNN历史悠久,但依然是一个非常重要的目标检测任务的baseline.
一般会把它叫做two-stage的目标检测模型,主要是因为如果train from scratch, RPN网络提取proposals和后续对proposals的定位分类这两个步骤是要分开训练的,但在微调的时候,通常可以一起训练。
本文我们主要演示调用torchvision中的faster-rcnn模型在自己的数据集上微调来检测螺丝螺母。
#!pip install torchvision torchkeras
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from PIL import Image,ImageColor,ImageDraw,ImageFont
import torch
from torch import nn
import torchvision
from torchvision import datasets, models, transforms
import datetime
import os
import copy
import json
# Report the installed PyTorch version (helps when reproducing the notebook).
print(torch.__version__)
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
2.0.0+cu117
〇,预训练模型
from torchkeras.data import get_example_image
from torchkeras.plots import vis_detection

# Fetch the bundled example picture and keep a copy on disk.
img = get_example_image('park.jpg')
img.save('park.jpg')

# Prepare the input: RGB image -> float tensor in [0, 1], channels first.
img = Image.open('park.jpg').convert("RGB")
img_tensor = torch.from_numpy(np.array(img) / 255.).permute(2, 0, 1).float()
if torch.cuda.is_available():
    img_tensor = img_tensor.cuda()
inputs = [img_tensor]

# Load the COCO-pretrained detector (91 categories including background).
num_classes = 91
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.COCO_V1,
    num_classes=num_classes,
)
if torch.cuda.is_available():
    model.to("cuda:0")
model.eval()

# Run inference without tracking gradients.
with torch.no_grad():
    predictions = model(inputs)

# Visualize the detections whose score is at least 0.8.
class_names = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.COCO_V1.meta['categories']
vis_detection(img, predictions[0], class_names, min_score=0.8)
下面代码我们演示使用我开发的优雅的torchkeras工具在自己的数据集上对Faster-RCNN模型进行finetune。
我们使用一个非常简单的螺丝(bolt)螺母(nut)数据集作为示范。
公众号 算法美食屋 后台回复关键词:torchkeras,获取本文notebook代码和 bolt nut 数据集 下载地址。
一,准备数据
# Root folder of the bolt/nut dataset; every other path is derived from it.
data_path = "./data/bolt_nut"
train_images_path = f"{data_path}/train"
train_targets_path = f"{data_path}/train.txt"
val_images_path = f"{data_path}/val"
val_targets_path = f"{data_path}/val.txt"

# Index 0 is reserved for the background class.
class_names = ['__background__', 'bolt', 'nut']
class BoltNut(torch.utils.data.Dataset):
    """Bolt/nut detection dataset read from an image folder and an annotation file.

    Every non-blank line of the annotation file is tab-separated: the first
    field is the image file name and each following field is a JSON object
    holding a ``"value"`` (class name) and a ``"coordinate"`` entry of the
    form ``[[xmin, ymin], [xmax, ymax]]``.

    Args:
        images_path: Directory containing the image files.
        targets_path: Path of the annotation text file.
        class_names: Class-name list; a label is the index of its name here.
        transforms: Optional callable ``(image, target) -> (image, target)``.
    """

    def __init__(self, images_path, targets_path,
                 class_names=class_names,
                 transforms=None):
        self.images_path = images_path
        self.targets_path = targets_path
        self.transforms = transforms
        # Close the file deterministically and drop blank lines, which would
        # otherwise become samples with an empty image name.
        with open(targets_path, "r") as f:
            self.infos_list = [line for line in f if line.strip()]
        self.class_names = class_names

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``.

        ``target`` is a dict with the keys ``image_id``, ``labels``,
        ``boxes`` (float32, shape ``[N, 4]``), ``area`` and ``iscrowd``.
        """
        info_str = self.infos_list[idx]
        info_arr = info_str.replace("\n", "").replace("\t ", "").split("\t")
        img_path = info_arr.pop(0)
        infos = [json.loads(x) for x in info_arr if x.strip()]

        img = Image.open(os.path.join(self.images_path, img_path)).convert("RGB")

        target = {}
        # The image id is parsed from the file name, so the part before the
        # first "." must be an integer.
        target["image_id"] = torch.tensor(
            [int(img_path.split(".")[0])], dtype=torch.int64)
        target["labels"] = torch.tensor(
            [self.class_names.index(x["value"]) for x in infos],
            dtype=torch.int64)

        coords = [x["coordinate"] for x in infos]
        # Force float32 and an [N, 4] shape so integer coordinates and images
        # without any annotation still yield a well-formed box tensor.
        boxes = torch.tensor(
            [[xmin, ymin, xmax, ymax] for (xmin, ymin), (xmax, ymax) in coords],
            dtype=torch.float32,
        ).reshape(-1, 4)
        target["boxes"] = boxes
        target["area"] = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        target["iscrowd"] = torch.zeros((len(infos),), dtype=torch.int64)

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target

    def __len__(self):
        """Return the number of annotated samples."""
        return len(self.infos_list)
# Preview one training sample together with its ground-truth boxes.
ds_train = BoltNut(train_images_path, train_targets_path)
img, target = ds_train[12]

# Ground-truth boxes carry no confidence; give each a score of 1 so that the
# min_score threshold below keeps them (presumably read by vis_detection).
target["scores"] = torch.ones_like(target["labels"])

img_result = vis_detection(img, target, class_names, min_score=0.8)
img_result
下面我们设计数据增强模块
import random
from torchvision import transforms as T
class Compose(object):
    """Chain several ``(image, target) -> (image, target)`` transforms."""

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image, target):
        """Apply every transform in order and return the final pair."""
        sample = (image, target)
        for step in self.transforms:
            new_image, new_target = step(*sample)
            sample = (new_image, new_target)
        return sample
class RandomHorizontalFlip(object):
    """Flip the image and its annotations left-right with probability ``prob``.

    The image is indexed as a tensor whose last dimension is the width.
    ``target`` must contain ``"boxes"`` as ``[N, 4]`` rows of
    ``(xmin, ymin, xmax, ymax)``; ``"masks"`` is flipped too when present.
    """

    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, target):
        """Return the (possibly flipped) ``(image, target)`` pair.

        The ``target`` dict is updated in place, but the original box tensor
        is left untouched: the flipped boxes are written to a copy.
        """
        if random.random() >= self.prob:
            return image, target

        width = image.shape[-1]
        image = image.flip(-1)

        boxes = target["boxes"]
        # Skip empty box tensors, which cannot be indexed as [:, [0, 2]].
        if boxes.numel() > 0:
            flipped = boxes.clone()
            # New xmin comes from the old xmax and vice versa.
            flipped[:, [0, 2]] = width - boxes[:, [2, 0]]
            target["boxes"] = flipped

        if "masks" in target:
            target["masks"] = target["masks"].flip(-1)
        return image, target
class ToTensor(object):
    """Convert the image with ``T.ToTensor``; the target passes through unchanged."""

    def __call__(self, image, target):
        to_tensor = T.ToTensor()
        return to_tensor(image), target
# Training applies a random horizontal flip after tensor conversion;
# validation only converts the image to a tensor.
transforms_train = Compose([
    ToTensor(),
    RandomHorizontalFlip(0.5),
])
transforms_val = ToTensor()

ds_train = BoltNut(train_images_path, train_targets_path,
                   transforms=transforms_train)
ds_val = BoltNut(val_images_path, val_targets_path,
                 transforms=transforms_val)
def collate_fn(batch):
    """Transpose a list of ``(image, target)`` samples into ``(images, targets)``.

    Samples are kept as tuples rather than stacked into one tensor.
    """
    columns = zip(*batch)
    return tuple(columns)
# Training batches are reshuffled every epoch.
dl_train = torch.utils.data.DataLoader(
    ds_train, batch_size=2, shuffle=True,
    num_workers=4, collate_fn=collate_fn)
# Validation keeps the dataset order so evaluation runs are reproducible.
dl_val = torch.utils.data.DataLoader(
    ds_val, batch_size=2, shuffle=False,
    num_workers=4, collate_fn=collate_fn)

# Pull a single batch as a smoke test of the input pipeline.
for batch in dl_train:
    features, labels = batch
    break
二,定义模型
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
# Background plus the two foreground classes (bolt, nut); derived from
# class_names so the head size cannot drift from the label list.
num_classes = len(class_names)

# Start from the COCO-pretrained detector ...
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.COCO_V1)

# ... and replace its box-prediction head with one sized for this dataset.
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
三,训练模型
from torchkeras import KerasModel
class StepRunner:
    """Run one train/validation step for a network that returns a dict of losses.

    Args:
        net: Callable as ``net(features, labels)``, returning a dict of losses.
        loss_fn: Stored but not used by this runner.
        accelerator: Object providing ``backward(loss)`` and ``gather(tensor)``.
        stage: Prefix of the reported loss key; ``"train"`` enables optimization.
        metrics_dict: Stored but not used by this runner.
        optimizer: Optimizer stepped during the ``"train"`` stage, if given.
        lr_scheduler: Scheduler stepped after every optimizer step, if given.
    """

    def __init__(self, net, loss_fn, accelerator, stage="train", metrics_dict=None,
                 optimizer=None, lr_scheduler=None):
        self.net = net
        self.loss_fn = loss_fn
        self.metrics_dict = metrics_dict
        self.stage = stage
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        self.accelerator = accelerator
        # The network is put in train mode for every stage, validation
        # included -- presumably because the detector only returns its loss
        # dict in train mode.
        self.net.train()

    def __call__(self, batch):
        """Process one batch and return ``(step_losses, step_metrics)``."""
        features, labels = batch
        is_train = self.stage == "train"

        # The total loss is the sum of all partial losses the network reports.
        loss_dict = self.net(features, labels)
        loss = sum(loss_dict.values())

        if is_train and self.optimizer is not None:
            self.accelerator.backward(loss)
            self.optimizer.step()
            if self.lr_scheduler is not None:
                self.lr_scheduler.step()
            self.optimizer.zero_grad()

        # Combine the loss across whatever the accelerator gathers.
        total_loss = self.accelerator.gather(loss).sum()
        step_losses = {self.stage + "_loss": total_loss.item()}

        step_metrics = {}
        if is_train:
            if self.optimizer is None:
                step_metrics['lr'] = 0.0
            else:
                step_metrics['lr'] = self.optimizer.state_dict()['param_groups'][0]['lr']
        return step_losses, step_metrics
# Make KerasModel use the detection-specific StepRunner.
KerasModel.StepRunner = StepRunner

# Optimize only the parameters that require gradients.
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(
    params,
    lr=0.005,
    momentum=0.9,
    weight_decay=0.0005,
)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=4)

# The network computes its own losses, so no loss_fn or metrics are supplied.
keras_model = KerasModel(
    model,
    loss_fn=None,
    metrics_dict=None,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
)

# Train with val_loss monitored (mode 'min', patience 5); checkpoints go to
# 'faster-rcnn.pt'.
keras_model.fit(
    train_data=dl_train,
    val_data=dl_val,
    epochs=20,
    patience=5,
    monitor='val_loss',
    mode='min',
    ckpt_path='faster-rcnn.pt',
    plot=True,
)
四,评估模型
import torch
from PIL import Image
from tqdm import tqdm
from ultralytics.yolo.utils import set_logging
set_logging(verbose=False)
from ultralytics.yolo.utils.metrics import DetMetrics, box_iou
# NOTE(review): the body of this function is elided (`...`) in this listing,
# so its behavior cannot be documented from here. As written, `return metrics`
# refers to a name that is never assigned inside the function -- confirm
# against the full notebook before relying on it.
# `iouv` defaults to 10 evenly spaced IoU thresholds from 0.5 to 0.95.
def process_batch(predictions, targets,
iouv = torch.linspace(0.5, 0.95, 10) # iou vector for mAP@0.5:0.95
):
...
return metrics
model.eval()

# Run on whatever device the model currently lives on instead of assuming CUDA.
eval_device = next(model.parameters()).device

# Walk the validation set once (each access decodes the image) and disable
# autograd so no graph is kept alive for the stored predictions.
list_predictions = []
list_targets = []
with torch.no_grad():
    for image, target in ds_val:
        list_predictions.append(model([image.to(eval_device)])[0])
        list_targets.append(target)

names = {0:'bolt',1:'nut'}
metrics = eval_metrics(list_predictions,
                       list_targets,
                       names=names)
display(metrics.results_dict)
{'metrics/precision(B)': 0.9976781395819151,
'metrics/recall(B)': 1.0,
'metrics/mAP50(B)': 0.995,
'metrics/mAP50-95(B)': 0.8542317510036526,
'fitness': 0.8683085759032874}
import pandas as pd
# One row per metric name, one column per class.
df = pd.DataFrame({'metric': metrics.keys})
for i, c in names.items():
    df[c] = metrics.class_result(i)
df
五,使用模型
# Prepare the input. os.listdir returns entries in arbitrary order, so sort
# them to make the selected image deterministic.
val_files = sorted(os.listdir(val_images_path))
img_path = os.path.join(val_images_path, val_files[5])
img = Image.open(img_path).convert("RGB")
img_tensor = torch.from_numpy(np.array(img) / 255.).permute(2, 0, 1).float()

model.eval()
# Move the input to the device the model is on, wherever training left it.
img_tensor = img_tensor.to(next(model.parameters()).device)
inputs = [img_tensor]

# Predict without tracking gradients.
with torch.no_grad():
    predictions = model(inputs)

# Visualize the result. Predicted labels index into class_names
# (background at index 0), as in the earlier vis_detection calls.
vis_detection(img, predictions[0], class_names, min_score=0.8)