表格识别技术主要使用基于注意力机制的图片描述模型 RARE,整体流程如下图所示,对于其中的表格区域进行表格识别处理。

表格识别的难点主要在于表格结构的提取,以及将表格信息与 OCR 信息融合。整体流程可以分为上下两部分,其中上半部分(黑色支路)是普通的 OCR 过程,通过(1)文本检测模块对表格图片进行单行文字检测,获得坐标,然后通过(2)文本识别模块得到文字结果。
而在下半部分的蓝色支路中,表格图片首先经过(3)表格结构预测模块,获得每个 Excel 单元格的四点坐标与表格结构信息。结合黑色支路文本检测获得的单行文字文本框 4 点坐标,共同输入(4)Cell 坐标聚合模块,再通过(5)Cell 文本聚合模块,将属于同一单元格的文本拼接在一起。最后结合表格结构信息,通过(6)Excel 导出模块获得 Excel 形式的表格数据。
下面针对每个模块分别展开介绍。

待识别表格图片:

ocr 数字 python python ocr 表格_python

识别代码:

from mailmerge import MailMerge
from sgluematch import supergluematch
import tools.infer.utility as util
import tools.infer.predict_system as pd
from PIL import Image
import numpy as np
import math
import cv2
import os


def comppic(raw_dir, comp_dir):
    """Composite matched raw images onto blank canvases, one per template image.

    For every readable image in ``comp_dir`` a black canvas of the same size
    is created. Every readable image in ``raw_dir`` that ``supergluematch``
    locates inside the template image is resized and pasted onto the canvas
    at the matched position. Each canvas is written to ``./temp/<stem>.png``,
    where ``<stem>`` is the template file name up to its first dot.

    Args:
        raw_dir: Directory of raw images. File names are appended by plain
            string concatenation, so it must end with a path separator.
        comp_dir: Directory of template images, same convention as
            ``raw_dir``. Entries that ``cv2.imread`` cannot decode are
            skipped.

    Returns:
        The output directory prefix ``"./temp/"``.
    """
    out_dir = "./temp/"
    for n in os.listdir(comp_dir):
        cimg = cv2.imread(comp_dir + n)
        if cimg is None:
            # Not an image (or unreadable): nothing to composite for it.
            continue
        ch, cw = cimg.shape[:2]
        # An all-zero canvas is identical in BGR and RGB, so no colour
        # conversion is needed before handing it to PIL.
        canvas = Image.fromarray(np.zeros((ch, cw, 3), dtype=np.uint8))
        for m in os.listdir(raw_dir):
            rimg = cv2.imread(raw_dir + m)
            if rimg is None:
                # Skip non-image entries instead of failing on ``.shape``.
                continue
            ry, rx = rimg.shape[:2]
            csite = supergluematch(rimg, cimg)
            if isinstance(csite, int):
                # An integer return is treated as "no match" for this pair.
                continue
            # NOTE(review): the factors are raw size divided by matched-box
            # size, kept exactly as in the original. If the intent is to fit
            # ``rimg`` into the matched box the ratio looks inverted; confirm
            # against what ``supergluematch`` actually returns.
            xscale = rx / (csite[1][0] - csite[0][0])
            yscale = ry / (csite[1][1] - csite[0][1])
            matchimg = cv2.resize(rimg, (0, 0), fx=xscale, fy=yscale,
                                  interpolation=cv2.INTER_NEAREST)
            patch = Image.fromarray(cv2.cvtColor(matchimg, cv2.COLOR_BGR2RGB))
            # paste() takes the (left, top) corner of the destination.
            canvas.paste(patch, (csite[0][0], csite[0][1]))
        merged = cv2.cvtColor(np.asarray(canvas), cv2.COLOR_RGB2BGR)
        png = n.split(".")[0] + ".png"
        # cv2.imwrite fails silently when the directory is missing.
        os.makedirs(out_dir, exist_ok=True)
        cv2.imwrite(out_dir + png, merged)
    # Return only after every template has been processed; returning inside
    # the loop stopped after the first directory entry.
    return out_dir


def create_hue_mask(img, lower_color, upper_color):
    """Return the ``cv2.inRange`` mask of ``img`` between two colour bounds.

    Args:
        img: Image array to threshold.
        lower_color: Per-channel inclusive lower bound (sequence of ints).
        upper_color: Per-channel inclusive upper bound (sequence of ints).

    Returns:
        The mask produced by ``cv2.inRange`` for the given bounds.
    """
    # Both bounds are converted to uint8 arrays before thresholding.
    bounds = [np.array(color, np.uint8) for color in (lower_color, upper_color)]
    return cv2.inRange(img, bounds[0], bounds[1])


def findlight(img):
    """Return a dilated mask of pixels inside the "light" HSV range.

    The image is converted from BGR to HSV, thresholded to the range
    [100, 200, 180] .. [120, 255, 255] and dilated once with a 5x5
    rectangular kernel.

    Args:
        img: BGR image array.

    Returns:
        The dilated binary mask.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    in_range = create_hue_mask(
        cv2.cvtColor(img, cv2.COLOR_BGR2HSV),
        [100, 200, 180],
        [120, 255, 255],
    )
    return cv2.dilate(in_range, kernel, iterations=1)


def finddark(img):
    """Return a dilated mask of pixels inside the "dark" HSV range.

    The image is converted from BGR to HSV, thresholded to the range
    [0, 0, 0] .. [180, 255, 90] (any hue and saturation, third channel at
    most 90) and dilated twice with a 5x5 rectangular kernel.

    Args:
        img: BGR image array.

    Returns:
        The dilated binary mask.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    in_range = create_hue_mask(
        cv2.cvtColor(img, cv2.COLOR_BGR2HSV),
        [0, 0, 0],
        [180, 255, 90],
    )
    return cv2.dilate(in_range, kernel, iterations=2)


def distance(x1, y1, x2, y2):
    """Return the Euclidean distance between points (x1, y1) and (x2, y2)."""
    dx = x1 - x2
    dy = y1 - y2
    squared = dx ** 2 + dy ** 2
    return math.sqrt(squared)


def specialarea(img, text_sys):
    """Run OCR on the dark region nearest to each highlighted marker.

    Markers are the contours of the ``findlight`` mask whose bounding-box
    area exceeds 50 pixels. For each marker, the ``finddark`` contour whose
    leftmost point is closest to the marker's bounding-box top-left corner
    is selected. Its bounding box, padded by 2 px vertically and 5 px
    horizontally, is cropped from ``img`` and passed to ``text_sys``.

    Args:
        img: BGR image array.
        text_sys: Callable that takes an image crop and returns a sequence
            whose second element is an iterable of records with the
            recognised text at index 0.

    Returns:
        A list of ``[[x, y], texts]`` entries, one per marker, where
        ``[x, y]`` is the marker's bounding-box top-left corner and
        ``texts`` is the list of strings recognised in the selected region.
        The list is empty when there are no markers or no dark contours.
    """
    blue = findlight(img)
    black = finddark(img)
    # findContours returns (contours, hierarchy) in OpenCV 4 and
    # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list
    # in both layouts.
    becontours = cv2.findContours(blue, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
    bkcontours = cv2.findContours(black, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]

    checkp = []
    for cnt in becontours:
        sx, sy, w, h = cv2.boundingRect(cnt)
        if w * h > 50:  # ignore specks
            checkp.append([sx, sy])

    result = []
    if not checkp or len(bkcontours) == 0:
        # Nothing to pair up; boundingRect would fail on an empty contour.
        return result

    # The leftmost point of each dark contour does not depend on the marker,
    # so compute it once rather than once per marker.
    leftmosts = [tuple(cnt[cnt[:, :, 0].argmin()][0]) for cnt in bkcontours]

    for marker in checkp:
        checkcnt = bkcontours[0]
        minlen = math.inf
        for cnt, leftmost in zip(bkcontours, leftmosts):
            cal_len = distance(leftmost[0], leftmost[1], marker[0], marker[1])
            if cal_len < minlen:
                checkcnt = cnt
                minlen = cal_len
        sx, sy, w, h = cv2.boundingRect(checkcnt)
        # Clamp the padded start at 0: a negative slice start would wrap
        # around to the far edge and yield an empty or wrong crop. Ends past
        # the image border are clipped by slicing already.
        top = max(sy - 2, 0)
        left = max(sx - 5, 0)
        crop = img[top: sy + h + 2, left: sx + w + 5]
        # Index instead of unpacking so extra trailing return values are
        # tolerated. A None result is treated as "no text" (presumably what
        # the recogniser returns when nothing is detected; verify).
        re_res = text_sys(crop)[1]
        texts = [rec[0] for rec in (re_res or [])]
        result.append([marker, texts])
    return result


if __name__ == "__main__":
    # Word template that is filled in at the end.
    template = '一般情况.docx'

    # Each character names one page: "<c>.png" (composited image) and
    # "./template/<c>.txt" (field boxes for that page).
    comp_template = 'abcdefg'

    deal_dir = comppic("./data/", "./template/")

    args = util.parse_args(2)
    text_sys = pd.TextSystem(args)

    # Class names, one per line; field files refer to them by index.
    with open("./template/classes.txt", "r") as cfile:
        classes = cfile.read().split("\n")

    # Maps field label -> list of recognised strings.
    data = {}

    for page in comp_template:
        img = cv2.imread(deal_dir + page + ".png")
        if img is None:
            continue
        height, width = img.shape[:2]
        with open("./template/" + page + ".txt", "r") as file:
            data_site = file.read()

        # Regions found via highlighted markers take precedence over a plain
        # crop of the field box.
        checkr = specialarea(img, text_sys)

        for line in data_site.split("\n"):
            site = line.split(" ")
            if len(site) != 5:
                continue
            # Fields are "label cx cy w h"; the four numbers are fractions
            # that are scaled by the image width/height here.
            label, cx, cy, w, h = site
            cx, cy = int(float(cx) * width), int(float(cy) * height)
            w, h = int(float(w) * width), int(float(h) * height)
            label = classes[int(label)]
            sy, ey = int(cy - h / 2), int(cy + h / 2)
            sx, ex = int(cx - w / 2), int(cx + w / 2)

            exist = False
            for point, texts in checkr:
                # Marker corner strictly inside the field box.
                if sy < point[1] < ey and sx < point[0] < ex:
                    data[label] = texts
                    exist = True
                    break
            if not exist:
                # Clamp only for cropping: a negative slice start would wrap
                # around to the opposite edge of the image.
                cut_img = img[max(sy, 0): ey, max(sx, 0): ex]
                # Index instead of unpacking so extra trailing return values
                # are tolerated; a None result is treated as "no text"
                # (presumably returned when nothing is detected; verify).
                rec_res = text_sys(cut_img)[1]
                data[label] = [rec[0] for rec in (rec_res or [])]

    for label, texts in data.items():
        print(label, texts)

    # NOTE(review): the recognised ``data`` is only printed; the merge below
    # uses a fixed value for the ``sg`` field. Confirm whether ``data`` was
    # meant to feed the merge.
    document = MailMerge(template)

    document.merge(sg='165', )

    document.write('生成的1份证明.docx')

识别效果:

ocr 数字 python python ocr 表格_paddle_02