一、Pytorch模型转换成onnx格式

使用Pytorch自带的torch.onnx.export函数即可将Pytorch模型转换成onnx格式。

# Select the GPU before the first CUDA call below: CUDA_VISIBLE_DEVICES must be
# set before CUDA is first used in this process to take effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
images = Variable(images).float().cuda()

# The config is read with safe_load (no arbitrary Python object construction)
# and the file handle is closed as soon as parsing is done.
with open('brick_hrnet.yaml') as config_file:
    config = yaml.safe_load(config_file)
net = HighResolutionNet(config)


net.float().cuda()
net.eval()
net.load_state_dict(torch.load(config['checkout']))


# Export the model to an ONNX file
onnx_path = 'hrnet_49.onnx'
with torch.no_grad():
    torch.onnx.export(net,
                      images,
                      onnx_path,
                      verbose=False)
# Report the file that was actually written.
print("Export of " + onnx_path + " complete!")

二、安装TensorRT6.0

根据官网的指导教程安装TensorRT6.0即可:
https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html 需要注册认证才能下载
这里我下载的是deb安装包,并按照上述官网教程中的步骤进行安装。

三、onnx生成TensorRT engine

按照以下命令行进行操作:

# Clone the 6.0 branch of onnx-tensorrt together with its submodules
git clone --recurse-submodules -b 6.0 https://github.com/onnx/onnx-tensorrt.git
cd onnx-tensorrt

# Update submodules (also initialises any nested submodules)
git submodule update --init --recursive

# Build out-of-tree in a dedicated build directory
mkdir build && cd build

# Pass the CUDA include directory and the TensorRT root used on this machine
cmake .. -DCUDA_INCLUDE_DIRS=/usr/local/cuda/include/ -DTENSORRT_ROOT=/usr/lib/x86_64-linux-gnu
make -j8

# Install the build results (requires root)
sudo make install

# Update system config: refresh the dynamic linker cache
sudo ldconfig

四、将onnx模型转换成trt格式

由于我们待转换的模型是HRNet,所以在第一步转换onnx格式的时候有以下需要注意的地方:不能出现.size,.shape的字眼!!!(因为TensorRT是静态图!)
在HRNet.py原版的代码中,HighResolutionModule的forward在fuse_layer环节,作者粗暴地通过.shape获取当前层的尺寸,并将待fuse的层用bilinear上采样到与当前层相同的大小。对此有两种改法:1.在make_fuse_layer的时候,对于需要上采样的层直接指定好上采样的尺度,而不是不进行上采样(只使用步长为1的卷积),但是这种改法需要重新训练网络;2.在每次构建stage的时候指定好每个branch的特征图大小,然后在HighResolutionModule的forward过程中直接使用,而不是通过.shape获取。下面给出第二种改法的代码示例:
只需关注#----------更改 开始!!!!-------------- #----------更改 结束!!!!--------------即可!
将HighResolutionNet中的_make_stage进行如下更改:

#这里实际上是制作单个stage
    def _make_stage(self, layer_config, num_inchannels,
                    multi_scale_output=True):
        num_modules = layer_config['NUM_MODULES']
        num_branches = layer_config['NUM_BRANCHES']
        num_blocks = layer_config['NUM_BLOCKS']
        num_channels = layer_config['NUM_CHANNELS']
        block = blocks_dict[layer_config['BLOCK']]
        fuse_method = layer_config['FUSE_METHOD']

        modules = []
        for i in range(num_modules):
            # multi_scale_output is only used last module
            if not multi_scale_output and i == num_modules - 1:
                reset_multi_scale_output = False
            else:
                reset_multi_scale_output = True
            #这里是在构建高分辨率的单个分支
            #这里姑且认为x的分辨率是branch的个数挂钩的
            #----------更改 开始!!!!--------------
            x_shape=[]
            if(num_branches==2):
                x_shape=[[512,624],[256,312]]
            elif(num_branches==3):
                x_shape = [[512, 624], [256, 312],[128,156]]
            elif(num_branches==4):
                x_shape = [[512, 624], [256, 312], [128, 156],[64,78]]
            else:
                print("error mzy!!!")
            
            modules.append(
                HighResolutionModule(num_branches,
                                     block,
                                     num_blocks,
                                     num_inchannels,
                                     num_channels,
                                     fuse_method,
                                     x_shape,
                                     reset_multi_scale_output)
            )
            #----------更改 结束!!!!--------------
            num_inchannels = modules[-1].get_num_inchannels()

        return nn.Sequential(*modules), num_inchannels

将HighResolutionModule中的forward更改如下:

def forward(self, x):
        """Run every branch, then fuse the branch outputs across resolutions.

        x is a list with one entry per branch; entries are overwritten in place
        with the branch outputs. Returns a list with one fused, ReLU-activated
        output per row of self.fuse_layers (or a one-element list when the
        stage has a single branch).
        """
        # A single-branch stage has nothing to fuse.
        if self.num_branches == 1:
            return [self.branches[0](x[0])]

        # Push each input through its own branch.
        for idx in range(self.num_branches):
            x[idx] = self.branches[idx](x[idx])

        # Fusion: output `row` accumulates a contribution from every branch.
        fused = []
        for row, layers in enumerate(self.fuse_layers):
            if row == 0:
                acc = x[0]
            else:
                acc = layers[0](x[0])
            for col in range(1, self.num_branches):
                if col == row:
                    # Same branch: added unchanged.
                    term = x[col]
                elif col > row:
                    #----------modification begin--------------
                    # The target size comes from the precomputed self.x_shape
                    # instead of x[row].shape[2:], so no shape is read here.
                    term = F.interpolate(layers[col](x[col]),
                                         size=self.x_shape[row],
                                         mode='bilinear')
                    #----------modification end--------------
                else:
                    term = layers[col](x[col])
                acc = acc + term
            fused.append(self.relu(acc))

        return fused

然后在HighResolutionModule init函数的末尾加上:

# Keep the per-branch target sizes given to __init__; forward() passes self.x_shape[i] as the interpolate size.
self.x_shape=x_shape

更改完HRNet.py之后,我们使用步骤一的方法将其导出为onnx格式(导出时的文件名需要与下面命令中使用的onnx文件名保持一致,这里为hrnet_add_320_and_322_49.onnx);
然后,在安装好的onnx2trt环境下运行(也就是build目录下):

onnx2trt hrnet_add_320_and_322_49.onnx -o hrnet_add_320_and_322_49.trt -b 1

其中-b 后面的数值代表max batch size,可根据自身需求调整
运行上述命令之后,我们可以得到如下结果:

mzy@mzy-Precision-3630-Tower:~/TensorRT/onnx-tensorrt/build$ onnx2trt hrnet_add_320_and_322_49.onnx -o hrnet_add_320_and_322_49.trt -b 1
----------------------------------------------------------------
Input filename:   hrnet_add_320_and_322_49.onnx
ONNX IR version:  0.0.4
Opset version:    9
Producer name:    pytorch
Producer version: 1.3
Domain:           
Model version:    0
Doc string:       
----------------------------------------------------------------
WARNING: ONNX model has a newer ir_version (0.0.4) than this parser was built against (0.0.3).
Parsing model
Building TensorRT engine, FP16 available:0
    Max batch size:     1
    Max workspace size: 1024 MiB
[2021-03-31 01:41:13 WARNING] TensorRT was linked against cuBLAS 10.2.0 but loaded cuBLAS 10.1.0
[2021-03-31 01:44:01 WARNING] TensorRT was linked against cuBLAS 10.2.0 but loaded cuBLAS 10.1.0
Writing TensorRT engine to hrnet_add_320_and_322_49.trt
All done

此时build目录下会生成对应的trt文件;
这里需要注意的是!!!:
onnx2trt有以下参数,我们可以重点指定max_batch_size\model_data_type_bit_depth\max_workspace_size_bytes,
如果日志中显示FP16 available:0,说明当前没有使用float16(或者平台不支持float16),engine使用的是float32。数据精度决定了后续把数据拷贝到缓冲区指针时每个元素所占的字节数,所以这里需要注意一下。

Usage: onnx2trt onnx_model.pb
                [-o engine_file.trt]  (output TensorRT engine)
                [-t onnx_model.pbtxt] (output ONNX text file without weights)
                [-T onnx_model.pbtxt] (output ONNX text file with weights)
                [-b max_batch_size (default 32)]
                [-w max_workspace_size_bytes (default 1 GiB)]
                [-d model_data_type_bit_depth] (32 => float32, 16 => float16)
                [-l] (list layers and their shapes)
                [-g] (debug mode)
                [-v] (increase verbosity)
                [-q] (decrease verbosity)
                [-V] (show version information)
                [-h] (show help)

五、TensorRT+CLion加载推理模型

加载TensorRT库文件

与以往在CMakeLists里分别指定头文件路径和库文件路径的做法不同,这里直接使用find_library搜索TensorRT的库文件。

# Build script for the TensorRT_HRNet inference executable.
cmake_minimum_required(VERSION 3.16)
project(TensorRT_HRNet)

set(CMAKE_CXX_STANDARD 11)

# OpenCV 3, taken from a local build tree.
set(OpenCV_DIR /home/mzy/workspace/opencv-3.4/build)
find_package(OpenCV 3 REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS})

# CUDA headers and libraries.
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})

# The TensorRT libraries are located directly with find_library.
find_library(NVINFER NAMES libnvinfer.so)
find_library(NVPARSERS NAMES nvparsers)
find_library(NVONNXPARSERS NAMES nvonnxparser)

# Report what was found; TRT_AVAIL records whether libnvinfer was located.
if(NOT NVINFER)
    message("TensorRT is NOT Available")
    set(TRT_AVAIL OFF)
else()
    message("TensorRT is available!")
    message("NVINFER: ${NVINFER}")
    message("NVPARSERS: ${NVPARSERS}")
    message("NVONNXPARSERS: ${NVONNXPARSERS}")
    set(TRT_AVAIL ON)
endif()

# Executable and the libraries it links against.
add_executable(TensorRT_HRNet
        BrickDetect.h
        BrickDetect.cpp
        main3.cpp)
target_link_libraries(TensorRT_HRNet
        ${OpenCV_LIBS}
        ${CUDA_LIBRARIES}
        ${NVINFER}
        ${NVPARSERS}
        ${NVONNXPARSERS}
        ${TensorRT_LIBRARIES})

使用TensorRT load 已经转好的trt engine执行推理获取结果主要可分为以下几个步骤:
1.加载trt engine,并反序列化
2.加载数据,根据需要进行前置处理,然后将数据拷贝到指针数组中传递给gpu;
3.执行推理过程;
4.获取推理结果,并将其从gpu指针拷贝到cpu内存上,方便下一步的后处理;
5.执行后处理;
下面我们以HRNet为例,记录上述过程
加载trt engine,并反序列化

// Loads a serialized TensorRT engine from disk and creates an execution context for it.
//
// engine_name: path of the serialized trt engine file.
// engine:      receives the deserialized engine; pass in an empty
//              TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr}.
// context:     receives the execution context created from that engine; pass in an empty
//              TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr}.
//
// Any failure (file cannot be opened or read, runtime/engine/context cannot be created)
// is reported on std::cerr and terminates the process with exit(1).
void BrickDetect::deserializeEngineModel(const std::string engine_name,
                            TRTUniquePtr<nvinfer1::ICudaEngine>& engine,
                            TRTUniquePtr<nvinfer1::IExecutionContext>& context) {
    std::ifstream in_file(engine_name.c_str(), std::ios::in | std::ios::binary);

    if (!in_file.is_open()) {
        std::cerr << "ERROR: fail to open file: " << engine_name.c_str() << std::endl;
        exit(1);
    }

    // Determine the file size from the distance between the start and end positions.
    const std::streampos begin = in_file.tellg();
    in_file.seekg(0, std::ios::end);
    const std::streampos end = in_file.tellg();
    const std::streamoff length = end - begin;
    if (!in_file || length <= 0) {
        // A failed tellg() yields -1; without this check that would become a huge size_t.
        std::cerr << "ERROR: fail to get size of file: " << engine_name.c_str() << std::endl;
        exit(1);
    }
    const size_t size = static_cast<size_t>(length);
    std::cout << "engine file size: " << size << " bytes" << std::endl;

    // Read the whole file into memory.
    in_file.seekg(0, std::ios::beg);
    std::unique_ptr<unsigned char[]> engine_data(new unsigned char[size]);
    in_file.read(reinterpret_cast<char*>(engine_data.get()), static_cast<std::streamsize>(size));
    if (!in_file) {
        std::cerr << "ERROR: fail to read file: " << engine_name.c_str() << std::endl;
        exit(1);
    }
    in_file.close();

    // deserialize the engine
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(gLogger);
    if (runtime == nullptr) {
        std::cerr << "ERROR: fail to create TensorRT runtime" << std::endl;
        exit(1);
    }
    engine.reset(runtime->deserializeCudaEngine(static_cast<const void*>(engine_data.get()), size, nullptr));
    if (!engine) {
        // deserializeCudaEngine returns nullptr for a corrupt or incompatible engine file.
        std::cerr << "ERROR: fail to deserialize engine: " << engine_name.c_str() << std::endl;
        exit(1);
    }
    context.reset(engine->createExecutionContext());
    if (!context) {
        std::cerr << "ERROR: fail to create execution context" << std::endl;
        exit(1);
    }
    // NOTE(review): `runtime` is never destroyed here, as in the original code. Whether it may be
    // released while the engine is still alive depends on the TensorRT version - confirm before
    // adding a destroy call.
}

加载数据,根据需要进行前置处理,然后将数据拷贝到指针数组中传递给gpu

// get sizes of input and output and allocate memory
    // required for input data and for output data
    std::vector<nvinfer1::Dims> input_dims;  // we expect only one input
    std::vector<nvinfer1::Dims> output_dims; // and one output
    const int nb_bindings = engine->getNbBindings();
    std::vector<void*> buffers(nb_bindings); // cpu buffers for input and output data
    std::vector<void*> gpu_buffers(nb_bindings); // gpu buffers for input and output data

    // Create the CUDA stream on which the copies and the inference are queued.
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // One host buffer and one device buffer per binding, sized as float elements.
    // The index is an int to match the binding count and the getBindingDimensions argument.
    for (int i = 0; i < nb_bindings; ++i) {
        auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
        buffers[i] = (void*)malloc(binding_size);
        CHECK(cudaMalloc((void**)(&gpu_buffers[i]), binding_size));
        if (engine->bindingIsInput(i)) {
            input_dims.emplace_back(engine->getBindingDimensions(i));
        }
        else {
            output_dims.emplace_back(engine->getBindingDimensions(i));
        }
    }

    if (input_dims.empty() || output_dims.empty()) {
        std::cerr << "Expect at least one input and one output for network\n";
        // Release everything allocated above before bailing out; previously this
        // path leaked the host buffers, the device buffers and the stream.
        for (void* gpu_buf : gpu_buffers) {
            CHECK(cudaFree(gpu_buf));
        }
        for (void* buf : buffers) {
            free(buf);
        }
        CHECK(cudaStreamDestroy(stream));
        return result;
    }
    //modified by mzy
    // Overwrite the reported input dims with the fixed values 1, 3, 512, 624;
    // preprocessImage reads d[1], d[2], d[3] as channels, height and width.
    input_dims[0].d[0]=1;
    input_dims[0].d[1]=3;
    input_dims[0].d[2]=512;
    input_dims[0].d[3]=624;
    // Pre-process the image and write the result into the host buffer buffers[0].
    preprocessImage(image, (float*)buffers[0], input_dims[0]);

    // Host to device: copy the input data into GPU memory.
    // buffers[0] holds the input in host memory; gpu_buffers[0] is the device buffer for the input.
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(gpu_buffers[0], buffers[0],
                          getSizeByDim(engine->getBindingDimensions(0)) * batch_size * sizeof(float),
                          cudaMemcpyHostToDevice, stream));

其中preprocessImage函数的代码为:

// Converts a BGR image into the normalised, planar float layout written to cpu_input.
//
// frame:     input image; must be 512 rows x 612 columns with 3 channels.
// cpu_input: destination host buffer; receives dims.d[1] planes of
//            dims.d[2] x dims.d[3] floats, one plane per channel.
// dims:      network input dims, read as d[1]=channels, d[2]=height, d[3]=width.
//
// If the image does not have the expected size/channels, or the padded image does not
// match dims, an error is printed and the function returns without writing cpu_input.
void preprocessImage(cv::Mat frame, float* cpu_input, const nvinfer1::Dims& dims) {
    // read input image
    //cv::Mat frame = cv::imread(image_path);
    if (frame.rows!=512 || frame.cols!=612 || frame.channels()!=3) {
        std::cerr << "Input image size or channels error!!!!!"  << " load failed\n";
        // Stop here: continuing would feed a wrongly sized image into the steps below.
        return;
    }
    // The network was trained on RGB images in PyTorch. Convert into a separate Mat:
    // a cv::Mat passed by value still shares its pixel data with the caller's image,
    // so converting in place would silently swap the channels of the caller's image.
    cv::Mat rgb;
    cv::cvtColor(frame, rgb, cv::COLOR_BGR2RGB);
    // Pad the width from 612 to 624 with zeros on the right.
    cv::Mat padded;
    cv::copyMakeBorder(rgb,padded,0,0,0,12,cv::BORDER_CONSTANT,0);

    const auto input_width = dims.d[3];
    const auto input_height = dims.d[2];
    const auto channels = dims.d[1];
    std::cout<<"---attention----start"<<std::endl;
    std::cout << "(" << channels << ", "
              << input_height << ", "
              << input_width << ")" << std::endl;
    std::cout<<"---attention----end"<<std::endl;

    // cv::split only writes into the pre-allocated planes below when their size matches
    // the image; on a mismatch it would reallocate them and cpu_input would never be filled.
    if (padded.cols != input_width || padded.rows != input_height || padded.channels() != channels) {
        std::cerr << "Padded image (" << padded.channels() << ", " << padded.rows << ", "
                  << padded.cols << ") does not match the network input dims\n";
        return;
    }

    const auto input_size = cv::Size(input_width, input_height);
    cv::Mat flt_image;
    // Scale pixel values by 1/255.
    padded.convertTo(flt_image, CV_32FC3, 1.f / 255.f);
    // Wrap cpu_input as one single-channel Mat per plane so that split writes
    // the channels straight into the buffer in channel-major order.
    std::vector<cv::Mat> chw;
    for (int c = 0; c < channels; ++c) {
        chw.emplace_back(cv::Mat(input_size, CV_32FC1, cpu_input + c * input_width * input_height));
    }
    cv::split(flt_image, chw);
}

执行推理过程

// 启动cuda核, 异步执行推理计算.
    context->enqueue(batch_size, gpu_buffers.data(), stream, nullptr);

获取推理结果,并将其从gpu指针拷贝到cpu内存上,方便下一步的后处理

// Device to host: copy the inference result back into host memory.
    // buffers[1] is the host buffer; gpu_buffers[1] is the device buffer holding the model output.
    CHECK(cudaMemcpyAsync(buffers[1], gpu_buffers[1],
                          getSizeByDim(engine->getBindingDimensions(1)) * batch_size * sizeof(float),
                          cudaMemcpyDeviceToHost, stream));

    // Block until all work queued on this stream (upload, inference, download) has
    // finished; the result is checked like every other CUDA call here so that an
    // asynchronous failure is not silently ignored.
    CHECK(cudaStreamSynchronize(stream));
    // Overwrite the reported output dims with the fixed values 1, 4, 512, 624
    // (postprocessResults treats the output as 4 planes of 512 x 624).
    output_dims[0].d[0]=1;
    output_dims[0].d[1]=4;
    output_dims[0].d[2]=512;
    output_dims[0].d[3]=624;

执行后处理

// postprocess results: turn the host-side output buffer (buffers[1]) into the returned points
    result=postprocessResults((float*)buffers[1], output_dims[0], batch_size);

其中postprocessResults的流程为:

// Post-processing stage ---------------------------------------------------------------------------
// Wraps the raw output buffer as four 512 x 624 single-channel float heat maps (no copy)
// and returns the points reported by get_peak_points for them.
// The dims and batch_size arguments are not consulted; the layout is fixed below.
std::vector<cv::Point> BrickDetect::postprocessResults(float* cpu_output, const nvinfer1::Dims &dims, int batch_size) {
    const int map_cols = 624;
    const int map_rows = 512;
    const int plane_elems = map_cols * map_rows;
    const cv::Size map_size(map_cols, map_rows);

    // One Mat header per output plane, each pointing into cpu_output.
    std::vector<cv::Mat> heatmaps;
    for (int plane = 0; plane < 4; ++plane) {
        heatmaps.emplace_back(map_size, CV_32FC1, cpu_output + plane * plane_elems);
    }

    // Extract the peak points from the heat maps.
    std::vector<cv::Point> peaks = get_peak_points(heatmaps);
    return peaks;
}

销毁流对象,释放内存、显存

// Destroy the stream; the result is checked like the other CUDA calls here.
    CHECK(cudaStreamDestroy(stream));
    // Free the device buffers.
    for (void* gpu_buf : gpu_buffers) {
        CHECK(cudaFree(gpu_buf));
    }
    // Free the host buffers.
    for (void* buf : buffers) {
        free(buf);
    }