1. LiteSeg is a real-time semantic segmentation algorithm; see the DICTA 2019 paper "LiteSeg: A Novel Lightweight ConvNet for Semantic Segmentation".
2. Training and development environment: Windows 10, an RTX 3080 GPU, CUDA 10.2, cuDNN 7.1, OpenCV 4.5, the v3.0 release of August 13, 2020, Visual Studio 2019 as the C++ IDE, and Anaconda 3.5.
1. The paper's main novelty is a deeper variant of the Atrous Spatial Pyramid Pooling (ASPP) module; the authors combine it with long and short residual connections and depthwise separable convolutions (a minimal sketch of this building block follows this list) to build a faster and more efficient segmentation model. The paper compares LiteSeg across several backbones, namely Darknet19, MobileNet, and ShuffleNet, offering a trade-off between accuracy and computational cost. For example, with MobileNetV2 as the backbone, LiteSeg achieves 67.81% mIoU on the Cityscapes dataset at 161 FPS on 640×360 images.
2. Network architecture:
3. Parameter and FPS comparison:
4. Experimental results on the Cityscapes dataset, including a comparison with ENet. If time permits, a later post will also reproduce ENet, since ENet sees somewhat wider use.
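As noted in item 1, much of LiteSeg's speed comes from depthwise separable convolutions, which factor a standard convolution into a per-channel (depthwise) 3×3 convolution followed by a 1×1 (pointwise) convolution. A minimal PyTorch sketch of this standard building block (an illustration only, not the paper's exact module):
import torch.nn as nn

class DepthwiseSeparableConv(nn.Module):
    """Standard depthwise-separable block: depthwise 3x3 + pointwise 1x1."""
    def __init__(self, in_ch, out_ch, dilation=1):
        super().__init__()
        # groups=in_ch makes the 3x3 convolution operate on each channel alone.
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size=3,
                                   padding=dilation, dilation=dilation,
                                   groups=in_ch, bias=False)
        # The 1x1 convolution then mixes information across channels.
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        return self.relu(self.bn(self.pointwise(self.depthwise(x))))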
1. The training framework is PyTorch; the code is based on this GitHub repository: https://github.com/tahaemara/LiteSeg.
2. Environment setup:
git clone https://github.com/tahaemara/LiteSeg.git
conda create --name liteseg python=3.7
conda activate liteseg
conda install pytorch torchvision cudatoolkit=10.2 -c pytorch
pip install cython matplotlib tqdm opencv-python tensorboard pyyaml scipy pillow addict
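After installing, a quick sanity check with standard PyTorch calls confirms that the GPU build is usable:
import torch

print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if the CUDA build sees the GPU
print(torch.version.cuda)         # CUDA version PyTorch was built against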
3. To reach real-time speed on mobile devices, I optimized the source code for my own use case and have uploaded it here:
1. Here LiteSeg is used to segment photographed documents, books, and ID cards. The annotation tool is labelme. A labeling example:
2. After labeling, each image has a corresponding JSON file that stores the objects' outline points, which is far more position information than an object-detection bounding box carries.
Contents of the JSON file:
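For reference, a labelme JSON file looks roughly like the abridged example below; the paths, points, and version number are placeholders, and the base64 imageData field is omitted. The processing script below only needs imageHeight, imageWidth, and shapes:
{
  "version": "4.5.6",
  "shapes": [
    {
      "label": "a",
      "points": [[120.0, 85.0], [930.0, 92.0], [925.0, 1270.0], [115.0, 1260.0]],
      "group_id": null,
      "shape_type": "polygon"
    }
  ],
  "imagePath": "doc_001.jpg",
  "imageHeight": 1280,
  "imageWidth": 960
}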
3. Once the JSON files are ready, process the data and split it into training and validation sets.
Dataset processing code:
import os
import sys
import glob
import json
import math
import uuid
import random
import numpy as np
import PIL.Image
import PIL.ImageDraw
from tqdm import tqdm
def shape_to_mask(img_shape, points, shape_type=None,
                  line_width=10, point_size=5):
    # Rasterize one labelme shape into a boolean mask.
    mask = np.zeros(img_shape[:2], dtype=np.uint8)
    mask = PIL.Image.fromarray(mask)
    draw = PIL.ImageDraw.Draw(mask)
    xy = [tuple(point) for point in points]
    if shape_type == 'circle':
        assert len(xy) == 2, 'Shape of shape_type=circle must have 2 points'
        (cx, cy), (px, py) = xy
        d = math.sqrt((cx - px) ** 2 + (cy - py) ** 2)
        draw.ellipse([cx - d, cy - d, cx + d, cy + d], outline=1, fill=1)
    elif shape_type == 'rectangle':
        assert len(xy) == 2, 'Shape of shape_type=rectangle must have 2 points'
        draw.rectangle(xy, outline=1, fill=1)
    elif shape_type == 'line':
        assert len(xy) == 2, 'Shape of shape_type=line must have 2 points'
        draw.line(xy=xy, fill=1, width=line_width)
    elif shape_type == 'linestrip':
        draw.line(xy=xy, fill=1, width=line_width)
    elif shape_type == 'point':
        assert len(xy) == 1, 'Shape of shape_type=point must have 1 point'
        cx, cy = xy[0]
        r = point_size
        draw.ellipse([cx - r, cy - r, cx + r, cy + r], outline=1, fill=1)
    else:
        assert len(xy) > 2, 'Polygon must have more than 2 points'
        draw.polygon(xy=xy, outline=1, fill=1)
    mask = np.array(mask, dtype=bool)
    return mask
def shapes_to_label(img_shape, shapes, label_name_to_value):
    # Convert all labelme shapes into a class-label map and an instance map.
    cls = np.zeros(img_shape[:2], dtype=np.int32)
    ins = np.zeros_like(cls)
    instances = []
    for shape in shapes:
        points = shape['points']
        label = shape['label']
        group_id = shape.get('group_id')
        if group_id is None:
            group_id = uuid.uuid1()
        shape_type = shape.get('shape_type', None)
        cls_name = label
        instance = (cls_name, group_id)
        if instance not in instances:
            instances.append(instance)
        ins_id = instances.index(instance) + 1
        # All labels are treated as the single foreground class here;
        # use label_name_to_value[cls_name] instead for multi-class labels.
        cls_id = 1
        # cls_id = label_name_to_value[cls_name]
        mask = shape_to_mask(img_shape[:2], points, shape_type)
        cls[mask] = cls_id
        ins[mask] = ins_id
    return cls, ins
def lblsave(filename, lbl):
    if os.path.splitext(filename)[1] != '.png':
        filename += '.png'
    # Assume label ranges [-1, 254] for int32,
    # and [0, 255] for uint8 as VOC.
    if lbl.min() >= 0 and lbl.max() <= 255:
        lbl_pil = PIL.Image.fromarray(lbl.astype(np.uint8), mode='L')
        lbl_pil.save(filename)
    else:
        raise ValueError(
            '[%s] Cannot save the pixel-wise class label as PNG. '
            'Please consider using the .npy format.' % filename
        )
if __name__ == '__main__':
    data_path = sys.argv[1]
    out_path = sys.argv[2]
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    label_name_to_value = {
        '_background_': 0,
        'a': 1,
    }
    json_fns = glob.glob(os.path.join(data_path, '**/*.json'), recursive=True)
    out_lst = []
    for json_fn in tqdm(json_fns):
        with open(json_fn, 'r') as f:
            data = json.load(f)
        img_shape = (data['imageHeight'], data['imageWidth'])
        lbl, _ = shapes_to_label(img_shape, data['shapes'], label_name_to_value)
        image_fn = json_fn.replace('.json', '.jpg')
        label_fn = json_fn.replace('.json', '_label.png')
        if not os.path.exists(image_fn):
            print(image_fn + ' does not exist')
            continue
        img = PIL.Image.open(image_fn)
        # Sanity check: the image size must match the size recorded in the
        # JSON, i.e. the size of the generated label map (PIL size is (W, H)).
        if img.size != (img_shape[1], img_shape[0]):
            print(image_fn, img.size, img_shape)
            continue
        lblsave(label_fn, lbl)
        out_lst.append(image_fn + ',' + label_fn)
    random.shuffle(out_lst)
    trn_num = int(len(out_lst) * 0.9)
    with open(os.path.join(out_path, 'train.txt'), 'w') as f:
        f.write('\n'.join(out_lst[:trn_num]))
    with open(os.path.join(out_path, 'val.txt'), 'w') as f:
        f.write('\n'.join(out_lst[trn_num:]))
Run the script as follows; the first argument is the directory of training images, the second is the output folder:
python generate_label.py /path datasets
This generates train.txt and val.txt in the datasets directory, listing the training set and the validation set respectively.
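Each line of these files pairs an image path with its label path, separated by a comma, exactly as the script writes them (the file names below are illustrative):
/path/doc_001.jpg,/path/doc_001_label.png
/path/doc_002.jpg,/path/doc_002_label.png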
1. The data paths, batch size, total number of epochs, and initial learning rate are set in the training configuration file config/training.yaml:
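The requirements above include pyyaml and addict, which suggests the configuration is loaded into an attribute-accessible dict. A minimal sketch of that pattern; the key names here are illustrative, the real ones are whatever training.yaml defines:
import yaml
from addict import Dict

with open('config/training.yaml') as f:
    cfg = Dict(yaml.safe_load(f))
# Illustrative access; the actual key names live in training.yaml.
print(cfg.batch_size, cfg.epochs, cfg.lr)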
2. Run the following command:
python train.py --backbone_network mobilenet --config config/training.yaml
Arguments:
⦁ --backbone_network: the backbone of the network
⦁ --config: the training configuration file
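Training consumes the train.txt/val.txt pair lists produced earlier. As a rough sketch of how such a list is typically turned into a dataset (illustrative only; the repository's own data loader differs in its details):
import cv2
import numpy as np
from torch.utils.data import Dataset

class PairListDataset(Dataset):
    """Reads 'image_path,label_path' lines like those in train.txt."""
    def __init__(self, list_file, size=(512, 512)):
        with open(list_file) as f:
            self.pairs = [line.strip().split(',') for line in f if line.strip()]
        self.size = size

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_fn, lbl_fn = self.pairs[idx]
        img = cv2.resize(cv2.imread(img_fn), self.size)
        lbl = cv2.resize(cv2.imread(lbl_fn, cv2.IMREAD_GRAYSCALE), self.size,
                         interpolation=cv2.INTER_NEAREST)
        # HWC uint8 -> CHW float in [0, 1]; the label stays a class-index map.
        img = img.transpose(2, 0, 1).astype(np.float32) / 255.0
        return img, lbl.astype(np.int64)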
3. Test the model:
python demo.py --backbone_network mobilenet --model_path checkpoints/LiteSeg-mobilenet-card_epoch-99.pth --images_path samples/images --output_path samples/predictions --gpu
Arguments:
⦁ --backbone_network: the backbone of the network
⦁ --model_path: the trained model
⦁ --images_path: path to the input image folder
⦁ --output_path: path to the output folder
⦁ --gpu: use the GPU for prediction
Output:
1. To deploy the model from C++, first convert it to ONNX; the ONNX model can then be run with ONNX Runtime or OpenCV DNN.
python convert_to_onnx.py --input checkpoints/LiteSeg-mobilenet-card_epoch-99.pth --output LiteSeg-512.onnx
Arguments:
⦁ --input: the trained model
⦁ --output: the output ONNX model, which OpenCV's dnn module can load
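The core of a conversion script like convert_to_onnx.py is a single torch.onnx.export call. A minimal sketch, assuming the LiteSeg model has already been built and loaded from the .pth checkpoint with the repository's own code:
import torch

def export_to_onnx(model, output_path='LiteSeg-512.onnx'):
    # 'model' is the loaded LiteSeg network (a torch.nn.Module).
    model.eval()
    dummy = torch.randn(1, 3, 512, 512)  # fixed input size used at deployment
    torch.onnx.export(model, dummy, output_path,
                      opset_version=11,
                      input_names=['input'], output_names=['output'])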
2. This demo runs inference with OpenCV DNN. Inference code:
void LiteNet(cv::Mat& cv_src, cv::Mat& cv_dst, cv::dnn::Net &net)
{
    // The exported network expects a fixed 512x512 input.
    cv::Size reso(512, 512);
    // blobFromImage computes (pixel - mean) * scalefactor and swaps R and B,
    // so these mean values are subtracted before the 1/255 scaling.
    cv::Mat blob = cv::dnn::blobFromImage(cv_src, 1.0 / 255, reso,
        cv::Scalar(0.485, 0.456, 0.406), true, false);
    net.setInput(blob);
    cv::Mat out = net.forward();    // shape: 1 x num_classes x H x W
    // Per-pixel two-class argmax: mark foreground (channel 1) pixels as 255.
    cv::Mat segm = cv::Mat::zeros(out.size[2], out.size[3], CV_8UC1);
    for (int i = 0; i < out.size[2] * out.size[3]; ++i)
    {
        if (out.ptr<float>(0, 0)[i] < out.ptr<float>(0, 1)[i])
        {
            segm.data[i] = 255;
        }
    }
    // Resize the mask back to the source image size.
    cv::resize(segm, cv_dst, cv_src.size(), 0.0, 0.0, cv::INTER_NEAREST);
}
void imshow(std::string name, const cv::Mat& img)
{
    cv::namedWindow(name, 0);
    int max_rows = 500;
    int max_cols = 600;
    if (img.rows >= img.cols && img.rows > max_rows) {
        cv::resizeWindow(name, cv::Size(img.cols * max_rows / img.rows, max_rows));
    }
    else if (img.cols >= img.rows && img.cols > max_cols) {
        cv::resizeWindow(name, cv::Size(max_cols, img.rows * max_cols / img.cols));
    }
    cv::imshow(name, img);
}
int main()
{
    std::string net_path = "models/LiteSeg-512.onnx";
    cv::dnn::Net net = cv::dnn::readNet(net_path);
    std::string path = "images";
    std::vector<std::string> filenames;
    cv::glob(path, filenames, false);
    for (auto& name : filenames)
    {
        cv::Mat cv_src = cv::imread(name);
        if (cv_src.empty())    // skip files that are not readable images
        {
            continue;
        }
        auto t0 = cv::getTickCount();
        cv::Mat cv_dst;
        LiteNet(cv_src, cv_dst, net);
        auto t1 = cv::getTickCount();
        std::cout << "elapsed time: " << (t1 - t0) * 1000.0 / cv::getTickFrequency() << "ms" << std::endl;
        imshow("src", cv_src);
        imshow("dst", cv_dst);
        cv::waitKey();
    }
    return 0;
}
3. Inference results:
4. Time taken per image:
5. VS2019 test project code: