1. Converting the PyTorch model to ONNX format
PyTorch's built-in torch.onnx.export function is all that is needed to convert a PyTorch model to ONNX format.
import os
import torch
import yaml

from HRNet import HighResolutionNet  # adjust the import to wherever HighResolutionNet is defined (HRNet.py in this post)

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

config = yaml.load(open('brick_hrnet.yaml'))
net = HighResolutionNet(config)
net.float().cuda()
net.eval()
net.load_state_dict(torch.load(config['checkout']))

# Dummy input with the network's input shape (1 x 3 x 512 x 624 for this model)
images = torch.randn(1, 3, 512, 624).float().cuda()

# Export the model to an ONNX file
with torch.no_grad():
    torch.onnx.export(net,
                      images,
                      'hrnet_49.onnx',
                      verbose=False)
print("Export of torch_model.onnx complete!")
2. Installing TensorRT 6.0
Install TensorRT 6.0 by following the official installation guide:
https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html (registration is required to download the packages)
I downloaded the .deb package and installed it following the official instructions above.
3. Generating TensorRT engines from ONNX (building onnx-tensorrt)
Run the following commands:
git clone --recurse-submodules -b 6.0 https://github.com/onnx/onnx-tensorrt.git
cd onnx-tensorrt
# Update submodules
git submodule update --init --recursive
# Build
mkdir build && cd build
cmake .. -DCUDA_INCLUDE_DIRS=/usr/local/cuda/include/ -DTENSORRT_ROOT=/usr/lib/x86_64-linux-gnu
make -j8
sudo make install
# Update system config
sudo ldconfig
4. Converting the ONNX model to TRT format
Because the model we are converting is HRNet, there is one thing to watch out for in step 1 (the ONNX export): the forward pass must not use .size or .shape!!! (TensorRT uses a static graph!)
In the original HRNet.py, the fuse_layer step of HighResolutionModule's forward simply upsamples (bilinear) the layers to be fused to the size of the current layer, read from .shape. There are two ways to fix this:
1. In make_fuse_layer, explicitly specify the upsampling scale for the layers that need upsampling, instead of not upsampling at all (just a plain stride-1 convolution); however, this change requires retraining the network.
2. Specify each branch's feature-map size when building each stage, and use those sizes directly in HighResolutionModule's forward instead of reading them from .shape.
The code changes for the second approach are shown below.
You only need to look at the blocks between the #---------- change start ---------- and #---------- change end ---------- markers!
Change _make_stage in HighResolutionNet as follows:
# This builds a single stage
def _make_stage(self, layer_config, num_inchannels,
                multi_scale_output=True):
    num_modules = layer_config['NUM_MODULES']
    num_branches = layer_config['NUM_BRANCHES']
    num_blocks = layer_config['NUM_BLOCKS']
    num_channels = layer_config['NUM_CHANNELS']
    block = blocks_dict[layer_config['BLOCK']]
    fuse_method = layer_config['FUSE_METHOD']
    modules = []
    for i in range(num_modules):
        # multi_scale_output is only used for the last module
        if not multi_scale_output and i == num_modules - 1:
            reset_multi_scale_output = False
        else:
            reset_multi_scale_output = True
        # Build the branches of this high-resolution module;
        # the resolution of each x is tied to the number of branches
        # ---------- change start ----------
        x_shape = []
        if num_branches == 2:
            x_shape = [[512, 624], [256, 312]]
        elif num_branches == 3:
            x_shape = [[512, 624], [256, 312], [128, 156]]
        elif num_branches == 4:
            x_shape = [[512, 624], [256, 312], [128, 156], [64, 78]]
        else:
            print("error mzy!!!")
        modules.append(
            HighResolutionModule(num_branches,
                                 block,
                                 num_blocks,
                                 num_inchannels,
                                 num_channels,
                                 fuse_method,
                                 x_shape,
                                 reset_multi_scale_output)
        )
        # ---------- change end ----------
        num_inchannels = modules[-1].get_num_inchannels()
    return nn.Sequential(*modules), num_inchannels
Change forward in HighResolutionModule as follows:
def forward(self, x):
    # If this stage has only one branch, no fusion is needed
    if self.num_branches == 1:
        return [self.branches[0](x[0])]
    # With multiple branches, run each branch on its own input
    for i in range(self.num_branches):
        x[i] = self.branches[i](x[i])
    # Fuse the branches
    x_fuse = []
    for i in range(len(self.fuse_layers)):
        y = x[0] if i == 0 else self.fuse_layers[i][0](x[0])
        for j in range(1, self.num_branches):
            if i == j:
                y = y + x[j]
            elif j > i:
                # ---------- change start ----------
                # modified by mzy
                y = y + F.interpolate(
                    self.fuse_layers[i][j](x[j]),
                    # size=[x[i].shape[2], x[i].shape[3]],
                    size=self.x_shape[i],
                    mode='bilinear')
                # ---------- change end ----------
            else:
                y = y + self.fuse_layers[i][j](x[j])
        x_fuse.append(self.relu(y))
    return x_fuse
Then, at the end of HighResolutionModule's __init__, add the following (and remember to add x_shape to the __init__ parameter list as well, since _make_stage now passes it in):
self.x_shape = x_shape
After modifying HRNet.py, export the model to ONNX again using the method from step 1.
Then, in the onnx2trt environment we built earlier (i.e. in the build directory), run:
onnx2trt hrnet_add_320_and_322_49.onnx -o hrnet_add_320_and_322_49.trt -b 1
The value after -b is the max batch size; adjust it to your needs.
Running the command above produces the following output:
mzy@mzy-Precision-3630-Tower:~/TensorRT/onnx-tensorrt/build$ onnx2trt hrnet_add_320_and_322_49.onnx -o hrnet_add_320_and_322_49.trt -b 1
----------------------------------------------------------------
Input filename: hrnet_add_320_and_322_49.onnx
ONNX IR version: 0.0.4
Opset version: 9
Producer name: pytorch
Producer version: 1.3
Domain:
Model version: 0
Doc string:
----------------------------------------------------------------
WARNING: ONNX model has a newer ir_version (0.0.4) than this parser was built against (0.0.3).
Parsing model
Building TensorRT engine, FP16 available:0
Max batch size: 1
Max workspace size: 1024 MiB
[2021-03-31 01:41:13 WARNING] TensorRT was linked against cuBLAS 10.2.0 but loaded cuBLAS 10.1.0
[2021-03-31 01:44:01 WARNING] TensorRT was linked against cuBLAS 10.2.0 but loaded cuBLAS 10.1.0
Writing TensorRT engine to hrnet_add_320_and_322_49.trt
All done
The corresponding trt file is then written to the build directory.
One thing to note here:
onnx2trt accepts the options listed below; the ones worth setting explicitly are max_batch_size (-b), model_data_type_bit_depth (-d) and max_workspace_size_bytes (-w).
If the log shows FP16 available:0, FP16 is not supported (or not being used) and the engine runs in float32. This precision matters later when we copy data into the host and device buffers, so keep it in mind.
Usage: onnx2trt onnx_model.pb
[-o engine_file.trt] (output TensorRT engine)
[-t onnx_model.pbtxt] (output ONNX text file without weights)
[-T onnx_model.pbtxt] (output ONNX text file with weights)
[-b max_batch_size (default 32)]
[-w max_workspace_size_bytes (default 1 GiB)]
[-d model_data_type_bit_depth] (32 => float32, 16 => float16)
[-l] (list layers and their shapes)
[-g] (debug mode)
[-v] (increase verbosity)
[-q] (decrease verbosity)
[-V] (show version information)
[-h] (show help)
5. Loading the model and running inference with TensorRT + CLion
Loading the TensorRT libraries
Unlike the usual pattern of adding include and library paths in CMakeLists.txt, here we use find_library to locate the TensorRT libraries directly:
cmake_minimum_required(VERSION 3.16)
project(TensorRT_HRNet)
set(CMAKE_CXX_STANDARD 11)
set(OpenCV_DIR /home/mzy/workspace/opencv-3.4/build)
find_package( OpenCV 3 REQUIRED )
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
find_package(CUDA REQUIRED)
INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
find_library(NVINFER NAMES libnvinfer.so)
find_library(NVPARSERS NAMES nvparsers)
find_library(NVONNXPARSERS NAMES nvonnxparser)
if(NVINFER)
message("TensorRT is available!")
message("NVINFER: ${NVINFER}")
message("NVPARSERS: ${NVPARSERS}")
message("NVONNXPARSERS: ${NVONNXPARSERS}")
set(TRT_AVAIL ON)
else()
message("TensorRT is NOT Available")
set(TRT_AVAIL OFF)
endif()
add_executable(TensorRT_HRNet BrickDetect.h BrickDetect.cpp main3.cpp)
target_link_libraries( TensorRT_HRNet
${OpenCV_LIBS}
${CUDA_LIBRARIES}
${NVINFER}
${NVPARSERS}
${NVONNXPARSERS}
${TensorRT_LIBRARIES}
)
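As a side note, the sources referenced above (BrickDetect.h, BrickDetect.cpp, main3.cpp) are assumed to include the TensorRT, CUDA runtime and OpenCV headers; a minimal sketch of the includes they would typically need (adjust to your own code):
// Typical includes for the inference code in the following sections
#include <NvInfer.h>            // nvinfer1::IRuntime, ICudaEngine, IExecutionContext, Dims
#include <cuda_runtime_api.h>   // cudaMalloc, cudaMemcpyAsync, cudaStream_t
#include <opencv2/opencv.hpp>   // cv::Mat, cv::cvtColor, cv::split

#include <fstream>
#include <iostream>
#include <memory>
#include <vector>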
Using TensorRT to load the converted trt engine and run inference breaks down into the following steps:
1. Load the trt engine and deserialize it;
2. Load the data, preprocess it as needed, then copy it into a host buffer and transfer it to the GPU;
3. Run inference;
4. Copy the results from the GPU back to CPU memory for post-processing;
5. Run post-processing.
Below we walk through these steps with HRNet.
Load the trt engine and deserialize it
// const std::string engine_name --- path of the stored trt model
// TRTUniquePtr<nvinfer1::ICudaEngine>& engine ------ pass in an empty engine (TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr}); the created engine is returned through it
// TRTUniquePtr<nvinfer1::IExecutionContext>& context ---- pass in an empty context (TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr}); the created context is returned through it
void BrickDetect::deserializeEngineModel(const std::string engine_name,
                                         TRTUniquePtr<nvinfer1::ICudaEngine>& engine,
                                         TRTUniquePtr<nvinfer1::IExecutionContext>& context) {
    std::ifstream in_file(engine_name.c_str(), std::ios::in | std::ios::binary);
    if (!in_file.is_open()) {
        std::cerr << "ERROR: fail to open file: " << engine_name.c_str() << std::endl;
        exit(1);
    }
    std::streampos begin, end;
    begin = in_file.tellg();
    in_file.seekg(0, std::ios::end);
    end = in_file.tellg();
    size_t size = end - begin;
    std::cout << "engine file size: " << size << " bytes" << std::endl;
    in_file.seekg(0, std::ios::beg);
    std::unique_ptr<unsigned char[]> engine_data(new unsigned char[size]);
    in_file.read((char*)engine_data.get(), size);
    in_file.close();
    // deserialize the engine
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    engine.reset(runtime->deserializeCudaEngine((const void*)engine_data.get(), size, nullptr));
    context.reset(engine->createExecutionContext());
}
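The code above uses TRTUniquePtr and gLogger, which are not shown in this post; here is a minimal sketch of how they might be defined, given the includes shown earlier (the deleter calls destroy(), which is how TensorRT 6 releases its objects; the exact definitions in the original project may differ):
// Assumed definitions of TRTUniquePtr and gLogger (not shown in the original code).
// TensorRT 6 objects are released by calling destroy(), so wrap that in a deleter.
struct TRTDestroy {
    template <class T>
    void operator()(T* obj) const {
        if (obj) obj->destroy();
    }
};

template <class T>
using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;

// Minimal logger required by createInferRuntime(); only prints warnings and errors.
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) override {
        if (severity <= Severity::kWARNING)
            std::cout << msg << std::endl;
    }
} gLogger;
With such a deleter, the engine and context returned by deserializeEngineModel are released automatically when they go out of scope.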
Load the data, preprocess it as needed, then copy it into the buffers and transfer it to the GPU
// get sizes of input and output and allocate memory
// required for input data and for output data
std::vector<nvinfer1::Dims> input_dims;                   // we expect only one input
std::vector<nvinfer1::Dims> output_dims;                  // and one output
std::vector<void*> buffers(engine->getNbBindings());      // cpu buffers for input and output data
std::vector<void*> gpu_buffers(engine->getNbBindings());  // gpu buffers for input and output data

// Create a CUDA stream to manage the overlap of copies, accesses and computation
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

for (int i = 0; i < engine->getNbBindings(); ++i) {
    auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
    buffers[i] = (void*)malloc(binding_size);
    CHECK(cudaMalloc((void**)(&gpu_buffers[i]), binding_size));
    if (engine->bindingIsInput(i)) {
        input_dims.emplace_back(engine->getBindingDimensions(i));
    }
    else {
        output_dims.emplace_back(engine->getBindingDimensions(i));
    }
}
if (input_dims.empty() || output_dims.empty()) {
    std::cerr << "Expect at least one input and one output for network\n";
    return result;
}

// modified by mzy: hard-code the input dimensions (1 x 3 x 512 x 624)
input_dims[0].d[0] = 1;
input_dims[0].d[1] = 3;
input_dims[0].d[2] = 512;
input_dims[0].d[3] = 624;

// Preprocess the image and copy it into the host buffer buffers[0]
preprocessImage(image, (float*)buffers[0], input_dims[0]);

// Host to device: copy the input data into GPU memory
// buffers[0] holds the input in host memory; gpu_buffers[0] is the device buffer for the input
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(gpu_buffers[0], buffers[0],
                      getSizeByDim(engine->getBindingDimensions(0)) * batch_size * sizeof(float),
                      cudaMemcpyHostToDevice, stream));
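The snippet above also relies on a getSizeByDim helper and a CHECK macro that are not shown; a plausible sketch (these are assumptions, not the author's original code):
// Assumed helpers used above (not shown in the original code).
// Total number of elements described by a Dims struct: the product of all dimensions.
size_t getSizeByDim(const nvinfer1::Dims& dims) {
    size_t size = 1;
    for (int i = 0; i < dims.nbDims; ++i)
        size *= dims.d[i];
    return size;
}

// Abort with a readable message if a CUDA runtime call fails.
#define CHECK(call)                                                       \
    do {                                                                  \
        cudaError_t status = (call);                                      \
        if (status != cudaSuccess) {                                      \
            std::cerr << "CUDA error: " << cudaGetErrorString(status)     \
                      << std::endl;                                       \
            exit(1);                                                      \
        }                                                                 \
    } while (0)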
The preprocessImage function is:
void preprocessImage(cv::Mat frame, float* cpu_input, const nvinfer1::Dims& dims) {
    // read input image
    //cv::Mat frame = cv::imread(image_path);
    if (frame.rows != 512 || frame.cols != 612 || frame.channels() != 3) {
        std::cerr << "Input image size or channels error!!!!!" << " load failed\n";
    }
    // Training in PyTorch used RGB images, so convert BGR -> RGB
    cv::cvtColor(frame, frame, cv::COLOR_BGR2RGB);
    // Pad the image width from 612 to 624
    cv::copyMakeBorder(frame, frame, 0, 0, 0, 12, cv::BORDER_CONSTANT, 0);
    auto input_width = dims.d[3];
    auto input_height = dims.d[2];
    auto channels = dims.d[1];
    std::cout << "---attention----start" << std::endl;
    std::cout << "(" << channels << ", "
              << input_height << ", "
              << input_width << ")" << std::endl;
    std::cout << "---attention----end" << std::endl;
    auto input_size = cv::Size(input_width, input_height);
    cv::Mat resized = frame;
    cv::Mat flt_image;
    // Normalize to [0, 1] by dividing by 255
    resized.convertTo(flt_image, CV_32FC3, 1.f / 255.f);
    // Wrap cpu_input with one cv::Mat header per channel, so cv::split writes
    // the planar (CHW) data straight into the host buffer without an extra copy
    std::vector<cv::Mat> chw;
    for (int i = 0; i < channels; ++i) {
        chw.emplace_back(cv::Mat(input_size, CV_32FC1, cpu_input + i * input_width * input_height));
    }
    cv::split(flt_image, chw);
}
Run inference
// Launch the CUDA kernels and run the inference asynchronously
context->enqueue(batch_size, gpu_buffers.data(), stream, nullptr);
Copy the inference results from the GPU back to CPU memory for post-processing
// Device to host: copy the results back into host memory
// buffers[1] is the host buffer; gpu_buffers[1] is the device buffer holding the model output
CHECK(cudaMemcpyAsync(buffers[1], gpu_buffers[1],
                      getSizeByDim(engine->getBindingDimensions(1)) * batch_size * sizeof(float),
                      cudaMemcpyDeviceToHost, stream));
// Wait for the inference and the copies on this stream to finish
cudaStreamSynchronize(stream);
// Hard-code the output dimensions (1 x 4 x 512 x 624: four heatmaps)
output_dims[0].d[0] = 1;
output_dims[0].d[1] = 4;
output_dims[0].d[2] = 512;
output_dims[0].d[3] = 624;
Run post-processing
// postprocess results
result = postprocessResults((float*)buffers[1], output_dims[0], batch_size);
The postprocessResults function is:
// postprocessing stage ----------------------------------------------------------------------
std::vector<cv::Point> BrickDetect::postprocessResults(float* cpu_output, const nvinfer1::Dims &dims, int batch_size) {
    int output_width = 624;
    int output_height = 512;
    cv::Size output_size(output_width, output_height);
    // Wrap the raw output buffer as four single-channel heatmaps (no copy)
    std::vector<cv::Mat> output_heatmap;
    for (int i = 0; i < 4; i++)
    {
        cv::Mat temp(output_size, CV_32FC1, cpu_output + i * output_width * output_height);
        output_heatmap.push_back(temp);
    }
    // Extract the keypoints from the heatmaps
    std::vector<cv::Point> result = get_peak_points(output_heatmap);
    return result;
}
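get_peak_points is not shown in this post; a simple sketch that takes each heatmap's arg-max as the keypoint, using cv::minMaxLoc (this is an assumption about how it works, not the author's original implementation):
// Assumed implementation of get_peak_points: take the location of the maximum
// response in each heatmap as the corresponding keypoint.
std::vector<cv::Point> get_peak_points(const std::vector<cv::Mat>& heatmaps) {
    std::vector<cv::Point> points;
    for (const cv::Mat& hm : heatmaps) {
        double min_val, max_val;
        cv::Point min_loc, max_loc;
        cv::minMaxLoc(hm, &min_val, &max_val, &min_loc, &max_loc);
        points.push_back(max_loc);  // (x, y) of the strongest response
    }
    return points;
}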
Destroy the stream and free host and device memory
// Destroy the stream
cudaStreamDestroy(stream);
// Free device memory
for (void* gpu_buf : gpu_buffers) {
    CHECK(cudaFree(gpu_buf));
}
// Free host memory
for (void* buf : buffers) {
    free(buf);
}