The table recognition component mainly uses RARE, an attention-based image-description model. The overall pipeline is shown in the figure below: table recognition is applied to the table regions found in the image.
The difficulty of table recognition lies mainly in extracting the table structure and in fusing the table information with the OCR results. The overall pipeline is divided into an upper and a lower part. The upper part (the black branch) is an ordinary OCR process: (1) the text detection module detects single lines of text in the table image and obtains their coordinates, and then (2) the text recognition module recognizes the text content.
In the lower part (the blue branch), the table image first passes through (3) the table structure prediction module, which outputs the four-point coordinates of every cell together with the table structure information. These, combined with the four-point coordinates of the single-line text boxes produced by text detection in the black branch, are fed into (4) the cell coordinate aggregation module; (5) the cell text aggregation module then concatenates the text lines that belong to the same cell (a simple version of this aggregation is sketched below). Finally, using the table structure information, (6) the Excel export module produces the table data as an Excel file.
Each module is described in turn below.
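Before walking through the modules, here is a minimal sketch of steps (4) and (5): assigning the single-line OCR results to table cells and merging the text per cell. The function names (box_center, point_in_box, assign_texts_to_cells) and the center-containment matching rule are illustrative assumptions, not the pipeline's actual implementation:

import numpy as np

def box_center(box):
    # box: four points [[x1, y1], ..., [x4, y4]]; return the center point
    pts = np.array(box, dtype=float)
    return pts[:, 0].mean(), pts[:, 1].mean()

def point_in_box(pt, box):
    # axis-aligned containment test against the bounding rectangle of the four points
    xs = [p[0] for p in box]
    ys = [p[1] for p in box]
    return min(xs) <= pt[0] <= max(xs) and min(ys) <= pt[1] <= max(ys)

def assign_texts_to_cells(cell_boxes, text_boxes, texts):
    # cell_boxes: four-point boxes from the table structure prediction module
    # text_boxes / texts: four-point boxes and strings from the OCR branch
    # returns one string per cell: the concatenation of the lines whose center falls inside it
    cell_texts = ["" for _ in cell_boxes]
    for tbox, txt in zip(text_boxes, texts):
        center = box_center(tbox)
        for k, cbox in enumerate(cell_boxes):
            if point_in_box(center, cbox):
                cell_texts[k] = (cell_texts[k] + " " + txt).strip()
                break
    return cell_texts

A production pipeline typically matches text boxes to cells with IoU or distance-based rules rather than a plain containment test, but the idea is the same: group the OCR lines by cell, then fill the predicted table structure with the grouped text.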
Table image to be recognized:
Recognition code:
from mailmerge import MailMerge
from sgluematch import supergluematch
import tools.infer.utility as util
import tools.infer.predict_system as pd
from PIL import Image
import numpy as np
import math
import cv2
import os
def comppic(raw_dir, comp_dir):
    # Align each raw image to a template image via SuperGlue matching and paste it
    # onto a blank canvas of the template's size; save the composites to ./temp/.
    for n in os.listdir(comp_dir):
        cimg = cv2.imread(comp_dir + n)
        if cimg is not None:
            ch, cw = cimg.shape[:2]
            new = np.zeros((ch, cw, 3), dtype=np.uint8)
            img1_pil = Image.fromarray(cv2.cvtColor(new, cv2.COLOR_BGR2RGB))  # convert to PIL format
            for m in os.listdir(raw_dir):
                rimg = cv2.imread(raw_dir + m)
                ry, rx = rimg.shape[:2]
                csite = supergluematch(rimg, cimg)
                if type(csite) == int:
                    # no match for this raw image
                    continue
                else:
                    xscale = rx / (csite[1][0] - csite[0][0])
                    yscale = ry / (csite[1][1] - csite[0][1])
                    matchimg = cv2.resize(rimg, (0, 0), fx=xscale, fy=yscale, interpolation=cv2.INTER_NEAREST)
                    img2_pil = Image.fromarray(cv2.cvtColor(matchimg, cv2.COLOR_BGR2RGB))
                    img1_pil.paste(img2_pil, (csite[0][0], csite[0][1]))  # paste img2 onto img1 at (left, top)
                    new = cv2.cvtColor(np.asarray(img1_pil), cv2.COLOR_RGB2BGR)  # convert PIL back to OpenCV format
            png = n.split(".")[0] + ".png"
            cv2.imwrite("./temp/" + png, new)
    return "./temp/"
def create_hue_mask(img, lower_color, upper_color):
    lower = np.array(lower_color, np.uint8)
    upper = np.array(upper_color, np.uint8)
    mask = cv2.inRange(img, lower, upper)
    return mask
def findlight(img):
    # mask of saturated blue pixels (hue roughly 100-120), dilated once
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    light_hue = create_hue_mask(hsv, [100, 200, 180], [120, 255, 255])
    element = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    light = cv2.dilate(light_hue, element, iterations=1)
    return light
def finddark(img):
    # mask of dark pixels (low value in HSV), dilated twice
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    dark_hue = create_hue_mask(hsv, [0, 0, 0], [180, 255, 90])
    element = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    dark = cv2.dilate(dark_hue, element, iterations=2)
    return dark
def distance(x1, y1, x2, y2):
    return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
def specialarea(img, text_sys):
    # Find blue (highlighted) markers, then for each marker locate the nearest
    # dark region and run OCR on it; return [marker point, recognized texts] pairs.
    blue = findlight(img)
    black = finddark(img)
    becontours, hierarchy = cv2.findContours(blue, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    bkcontours, hierarchy = cv2.findContours(black, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
    checkp = []
    checkcnt = []
    result = []
    for i, cnt in enumerate(becontours):
        sx, sy, w, h = cv2.boundingRect(cnt)
        if w * h > 50:  # ignore tiny noise contours
            checkp.append([sx, sy])
    for j in checkp:
        minlen = math.inf
        for i, cnt in enumerate(bkcontours):
            # distance from the dark contour's leftmost point to the marker
            leftmost = tuple(cnt[cnt[:, :, 0].argmin()][0])
            cal_len = distance(leftmost[0], leftmost[1], j[0], j[1])
            if cal_len < minlen:
                checkcnt = cnt
                minlen = cal_len
        sx, sy, w, h = cv2.boundingRect(checkcnt)
        image = img[sy - 2: sy + h + 2, sx - 5: sx + w + 5]  # crop with a small margin
        dboxes, re_res = text_sys(image)
        image = [re_res[i][0] for i in range(len(re_res))]  # keep only the recognized strings
        result.append([j, image])
    return result
if __name__ == "__main__":
template = '一般情况.docx'
comp_template = 'abcdefg'
deal_dir = comppic("./data/", "./template/")
args = util.parse_args(2)
text_sys = pd.TextSystem(args)
with open("./template/classes.txt", "r") as cfile:
classes = cfile.read().split("\n")
data = {}
for i in comp_template:
img = cv2.imread(deal_dir + i + ".png")
if img is not None:
height, width = img.shape[:2]
with open("./template/" + i + ".txt", "r") as file:
data_site = file.read()
checkr = specialarea(img, text_sys)
for j in data_site.split("\n"):
site = j.split(" ")
if len(site) == 5:
label, cx, cy, w, h = site
cx, cy, w, h = int(float(cx) * width), int(float(cy) * height), int(float(w) * width), int(
float(h) * height)
label = classes[int(label)]
sy, ey, sx, ex = int(cy - h / 2), int(cy + h / 2), int(cx - w / 2), int(cx + w / 2)
exist = False
for a in checkr:
if (sy < a[0][1]) & (ey > a[0][1]) & (sx < a[0][0]) & (ex > a[0][0]):
data[label] = a[1]
exist = True
break
if not exist:
cut_img = img[sy: ey, sx: ex]
dt_boxes, rec_res = text_sys(cut_img)
txts = [rec_res[i][0] for i in range(len(rec_res))]
data[label] = txts
for i, j in data.items():
print(i, j)
document = MailMerge(template)
document.merge(sg='165', )
document.write('生成的1份证明.docx')
Recognition result: