目录
0. 写在前面
1. 图像物体识别测试demo
2. 视频文件物体识别测试demo
3. 问题与解决办法
0. 写在前面
Keras YOLO V4代码地址:https://github.com/miemie2013/Keras-YOLOv4
训练数据集COCO2017
1. 图像物体识别测试demo
直接使用GPU加速会提示错误:
训练、测试Tensorflow、Keras代码时,出现could not create cudnn handle: CUDNN_STATUS_NOT_INITIALIZED、error retrieving driver version: Unimplemented: kernel reported driver version not implemented on Windows、could not destroy cudnn handle: CUDNN_STATUS_BAD_PARAM等错误。
错误主要指向cudnn,但是CUDA版本和cudnn版本是符合当前tensorflow要求的,因此只能是GPU占用问题导致的。解决方法如下:
1. tensorflow 框架下设置GPU按需分配:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(config=config) as sess:
...
2. keras框架(Tensorflow backend) 设置GPU按需分配:
import tensorflow as tf
from keras import backend as K
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
3. Tensorflow 2.0 设置GPU按需分配方式(没有session):
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
按照解决办法将源代码进行修改、优化如下:(文件名: demo.py)
#! /usr/bin/env python
# coding=utf-8
# ================================================================
#
#
# Created date: 2020-05-20 15:35:27
# Description : keras_yolov4
#
# ================================================================
from collections import deque
import datetime
import cv2
import os
import time
import numpy as np
import tensorflow as tf
import keras.layers as layers
from tools.cocotools import get_classes
from model.yolov4 import YOLOv4
from model.decode_np import Decode
import tensorflow as tf
import logging
# Timestamped INFO logging used for the progress/throughput reports below.
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

# On a 6 GB card, set use_gpu = False when predicting during training,
# otherwise GPU memory is exhausted.
# use_gpu = False
use_gpu = True

# GPU memory allocation. CUDA_VISIBLE_DEVICES = "-1" hides all GPUs so TF
# falls back to CPU.
if use_gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Option 1: grab a fixed fraction of GPU memory up front — prone to OOM.
from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 1.0
# set_session(tf.Session(config=config))

# Option 2: allow_growth = True lets TF grow the allocation on demand,
# avoiding the cuDNN CUDNN_STATUS_NOT_INITIALIZED / OOM failures.
# config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)

# Option 3 (the one actually used): same on-demand growth, with the session
# registered as the Keras TensorFlow-backend session.
from keras import backend as K
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
if __name__ == '__main__':
    # Class-name file, one class per line (COCO here; VOC variant commented out).
    # classes_path = 'data/voc_classes.txt'
    classes_path = 'data/coco_classes.txt'

    # model_path may be 'yolov4.h5', './weights/step00001000.h5', etc.
    model_path = 'yolov4.h5'
    # model_path = './weights/step00070000.h5'

    # Larger input_shape -> higher accuracy, lower speed.
    input_shape = (320, 320)
    # input_shape = (416, 416)
    # input_shape = (608, 608)

    # Score threshold and NMS IoU threshold used at evaluation time.
    conf_thresh = 0.05
    nms_thresh = 0.45

    # Whether to draw boxes on the images; skipping drawing is faster.
    draw_image = True
    # draw_image = False

    num_anchors = 3
    all_classes = get_classes(classes_path)
    num_classes = len(all_classes)

    # Build the network with a dynamic spatial input and load the weights.
    inputs = layers.Input(shape=(None, None, 3))
    yolo = YOLOv4(inputs, num_classes, num_anchors)
    yolo.load_weights(model_path, by_name=True)
    _decode = Decode(conf_thresh, nms_thresh, input_shape, yolo, all_classes)

    # Fix: makedirs(exist_ok=True) creates missing parent dirs and avoids the
    # race between the original exists() check and mkdir().
    os.makedirs('images/res/', exist_ok=True)
    path_dir = os.listdir('images/test')

    # Warm up: the first GPU inferences are dominated by kernel/cuDNN
    # initialization, so run up to 11 images before timing starts.
    if use_gpu:
        for k, filename in enumerate(path_dir):
            image = cv2.imread('images/test/' + filename)
            image, boxes, scores, classes = _decode.detect_image(image, draw_image=False)
            if k == 10:
                break

    time_stat = deque(maxlen=20)  # sliding window of recent per-image times
    start_time = time.time()
    end_time = time.time()
    num_imgs = len(path_dir)
    start = time.time()
    for k, filename in enumerate(path_dir):
        image = cv2.imread('images/test/' + filename)
        image, boxes, scores, classes = _decode.detect_image(image, draw_image)

        # Estimate remaining time from the mean of the recent window.
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        # Fix: image k is already done, so (num_imgs - k - 1) images remain
        # (the original over-counted by one).
        eta_sec = (num_imgs - k - 1) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        logger.info('Infer iter {}, num_imgs={}, eta={}.'.format(k, num_imgs, eta))

        if draw_image:
            cv2.imwrite('images/res/' + filename, image)
            logger.info("Detection bbox results save in images/res/{}".format(filename))

    cost = time.time() - start
    logger.info('total time: {0:.6f}s'.format(cost))
    # Fix: guard against an empty test directory (ZeroDivisionError).
    if num_imgs > 0 and cost > 0:
        logger.info('Speed: %.6fs per image, %.1f FPS.'%((cost / num_imgs), (num_imgs / cost)))
测试参数:
# input_shape越大,精度会上升,但速度会下降。 # input_shape = (320, 320) input_shape = (416, 416) # input_shape = (608, 608)
测试结果:17FPS
2. 视频文件物体识别测试demo
准备好视频文件,修改相应文件路径代码,设置GPU参数和测试参数,运行demo_vedio.py文件即可,测试帧率10-20FPS。
#! /usr/bin/env python
# coding=utf-8
# ================================================================
#
# Created date: 2020-06-03 15:35:27
# Description : keras_yolov4
#
# ================================================================
from collections import deque
import datetime
import cv2
import os
import colorsys
import random
import time
import numpy as np
import tensorflow as tf
import keras.layers as layers
from tools.cocotools import get_classes
from model.yolov4 import YOLOv4
from model.decode_np import Decode
import logging
# Timestamped INFO logging used for the per-frame reports below.
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)

# On a 6 GB card, set use_gpu = False when predicting during training,
# otherwise GPU memory is exhausted.
# use_gpu = False
use_gpu = True

# GPU memory allocation. CUDA_VISIBLE_DEVICES = "-1" hides all GPUs so TF
# falls back to CPU.
if use_gpu:
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Option 1: grab a fixed fraction of GPU memory up front — prone to OOM.
from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 1.0
# set_session(tf.Session(config=config))

# Option 2: allow_growth = True lets TF grow the allocation on demand,
# avoiding the cuDNN CUDNN_STATUS_NOT_INITIALIZED / OOM failures.
# config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)
# config.gpu_options.allow_growth = True
# session = tf.Session(config=config)

# Option 3 (the one actually used): same on-demand growth, with the session
# registered as the Keras TensorFlow-backend session.
from keras import backend as K
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)
def process_image(img, input_shape):
    """Convert a BGR frame into a normalized RGB batch of shape (1, H', W', 3).

    The frame is resized (bicubic) so its height/width match
    input_shape[0]/input_shape[1] and pixel values are scaled to [0, 1].
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    src_h, src_w = rgb.shape[:2]
    # Independent per-axis scale factors: width -> input_shape[1], height -> input_shape[0].
    factor_x = float(input_shape[1]) / src_w
    factor_y = float(input_shape[0]) / src_h
    resized = cv2.resize(rgb, None, None, fx=factor_x, fy=factor_y,
                         interpolation=cv2.INTER_CUBIC)
    batch = resized.astype(np.float32) / 255.
    return np.expand_dims(batch, axis=0)
def draw(image, boxes, scores, classes, all_classes, colors):
    """Annotate `image` in place: one rectangle plus a class/score label per detection.

    boxes are (x0, y0, x1, y1) in image coordinates; `classes` indexes into
    both `all_classes` (names) and `colors` (per-class BGR tuples).
    """
    image_h, image_w, _ = image.shape
    for det_box, det_score, det_cls in zip(boxes, scores, classes):
        x0, y0, x1, y1 = det_box
        # Round to the nearest pixel, then clamp to the image bounds.
        left = max(0, np.floor(x0 + 0.5).astype(int))
        top = max(0, np.floor(y0 + 0.5).astype(int))
        right = min(image.shape[1], np.floor(x1 + 0.5).astype(int))
        bottom = min(image.shape[0], np.floor(y1 + 0.5).astype(int))

        box_color = colors[det_cls]
        thickness = 1
        cv2.rectangle(image, (left, top), (right, bottom), box_color, thickness)

        label = '%s: %.2f' % (all_classes[det_cls], det_score)
        label_w, label_h = cv2.getTextSize(label, 0, 0.5, thickness=1)[0]
        # Filled background behind the text keeps the label readable.
        cv2.rectangle(image, (left, top), (left + label_w, top - label_h - 3), box_color, -1)
        cv2.putText(image, label, (left, top - 2), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (0, 0, 0), 1, lineType=cv2.LINE_AA)
if __name__ == '__main__':
    video_path = 'OpenPose.mp4'
    output_dir = './video_out'

    # Class-name file, one class per line (COCO here; VOC variant commented out).
    # classes_path = 'data/voc_classes.txt'
    classes_path = 'data/coco_classes.txt'

    # model_path may be 'yolov4.h5', './weights/step00001000.h5', etc.
    model_path = 'yolov4.h5'
    # model_path = './weights/step00070000.h5'

    # Larger input_shape -> higher accuracy, lower speed.
    input_shape = (320, 320)
    # input_shape = (416, 416)
    # input_shape = (608, 608)

    # Score threshold and NMS IoU threshold used at evaluation time.
    conf_thresh = 0.05
    nms_thresh = 0.45
    keep_top_k = 100
    nms_top_k = 100

    # Whether to draw boxes on the frames; skipping drawing is faster.
    draw_image = True
    # draw_image = False

    # Number of filters in the first convolution.
    initial_filters = 32

    # Anchor (w, h) pairs, three per output level (COCO defaults).
    anchors = np.array([
        [[12, 16], [19, 36], [40, 28]],
        [[36, 75], [76, 55], [72, 146]],
        [[142, 110], [192, 243], [459, 401]]
    ])

    # Static-config preprocessing.
    anchors = anchors.astype(np.float32)
    num_anchors = len(anchors[0])  # anchors per output level
    all_classes = get_classes(classes_path)
    num_classes = len(all_classes)

    # Build the network with a dynamic spatial input and load the weights.
    inputs = layers.Input(shape=(None, None, 3))
    yolo = YOLOv4(inputs, num_classes, num_anchors, initial_filters, True, anchors, conf_thresh, nms_thresh, keep_top_k, nms_top_k)
    yolo.load_weights(model_path, by_name=True)

    # Fix: makedirs(exist_ok=True) creates missing parents and avoids the
    # race between the original exists() check and mkdir().
    os.makedirs('images/res/', exist_ok=True)

    # One distinct color per class: evenly spaced hues, then a deterministic
    # shuffle so adjacent class ids get dissimilar colors.
    hsv_tuples = [(1.0 * x / num_classes, 1., 1.) for x in range(num_classes)]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), colors))
    random.seed(0)
    random.shuffle(colors)
    random.seed(None)

    capture = cv2.VideoCapture(video_path)
    # Fix: take the FPS from the source video instead of hard-coding 30,
    # falling back to 30 when the container does not report it.
    fps = capture.get(cv2.CAP_PROP_FPS) or 30
    width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_name = os.path.split(video_path)[-1]
    os.makedirs(output_dir, exist_ok=True)
    out_path = os.path.join(output_dir, video_name)
    writer = cv2.VideoWriter(out_path, fourcc, fps, (width, height))

    index = 1
    start = time.time()
    while True:
        ret, frame = capture.read()
        if not ret:
            break
        print('detect frame:%d' % (index))
        index += 1

        # Preprocess: BGR frame -> normalized RGB batch.
        pimage = process_image(np.copy(frame), input_shape)

        outs = yolo.predict(pimage)
        boxes, scores, classes = outs[0][0], outs[1][0], outs[2][0]

        # Map boxes from network-input coordinates back to frame coordinates.
        # Fix: scale x by input width and y by input height separately, so
        # non-square input_shape values also rescale correctly (the original
        # used input_shape[0] for both axes).
        img_h, img_w, _ = frame.shape
        sx = img_w / input_shape[1]
        sy = img_h / input_shape[0]
        boxes = boxes * [sx, sy, sx, sy]

        if boxes is not None and draw_image:
            draw(frame, boxes, scores, classes, all_classes, colors)
        cv2.imshow("detection", frame)
        writer.write(frame)
        # Fix: waitKey(110) stalled ~110 ms per frame, capping the pipeline
        # at ~9 FPS regardless of inference speed; 1 ms is enough to service
        # the GUI event loop. ESC (27) still exits.
        if cv2.waitKey(1) & 0xff == 27:
            break

    # Fix: release the capture and close the preview window — the original
    # leaked both and only released the writer.
    capture.release()
    writer.release()
    cv2.destroyAllWindows()
视频文件识别过程截图:(如果你问我她是谁,我会说:我不知道。)
显卡显存占用截图:
3. 问题与解决办法
问题: 检测速度尚可,但是如果要移植到Arm嵌入式系统中有可能面临算力不足,实时性不够的问题。
解决办法:由于在预测的同时使用OpenCV不停地写入处理后的视频帧、显示图形化预览界面并进行文件IO操作,导致预览的实时性较差;在只做文件IO、且帧图像尺寸更小的情况下,检测速度应该会有所提升。