最近做了一个 PDF 解析工具,可以把部分 PDF 文档中的表格提取出来并保存为 Excel 文件;对扫描版的 PDF 基本无效。下面把我实现的代码分享出来:

  • 安装
pip install pdfplumber pandas openpyxl tqdm

 

  • 代码
import pdfplumber
import pandas as pd
import os
from tqdm import tqdm
# pip install pdfplumber

def PDF_parser(xlsx_name, pdf_name):
    """Extract every table from a PDF and save each one as a sheet in an Excel file.

    Tables are numbered in reading order across all pages and written to
    sheets named ``Table 1``, ``Table 2``, ...  The first row of each
    extracted table is used as the column header.

    Args:
        xlsx_name: Path of the Excel workbook to create.
        pdf_name: Path of the PDF file to read.

    Returns:
        None. If the PDF contains no tables, no workbook is written.
    """
    frames = []
    with pdfplumber.open(pdf_name) as pdf:
        for page in pdf.pages:
            for rows in page.extract_tables():
                # Skip an empty table: there is no header row to index.
                if not rows:
                    continue
                # extract_tables() returns nested lists; a DataFrame is easier
                # to inspect and to export.
                frames.append(pd.DataFrame(rows[1:], columns=rows[0]))

    # Writing a workbook with zero sheets fails inside openpyxl, so only open
    # the writer when there is something to write.
    if not frames:
        print(f"No tables found in {pdf_name}; skipping {xlsx_name}")
        return

    # The context manager saves and closes the workbook, even on error.
    # (ExcelWriter.save() was removed in pandas 2.0.)
    with pd.ExcelWriter(xlsx_name, engine="openpyxl") as writer:
        for num, df in enumerate(frames, start=1):
            df.to_excel(writer, sheet_name='Table ' + str(num), index=False)
            print(df.shape)

def create(root_dir):
    """Make sure the directory ``root_dir`` exists.

    Missing intermediate directories are created as well; an already
    existing directory is left untouched.
    """
    os.makedirs(name=root_dir, exist_ok=True)

if __name__ == "__main__":
    # Batch-convert every PDF in root_dir into an Excel workbook in output_dir.
    root_dir = 'problem'  # pdf directory
    output_dir = 'excel1'  # excel output directory
    create(output_dir)  # create the output directory if it does not exist

    # Select files by their real extension, case-insensitively, so that
    # "REPORT.PDF" is included and a file literally named "pdf" is not.
    pdf_names = [
        name for name in os.listdir(root_dir)
        if os.path.splitext(name)[1].lower() == '.pdf'
    ]
    print(len(pdf_names))

    for pdf_name in tqdm(pdf_names):  # process each pdf
        print(pdf_name)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".pdf" that appears in the middle of the file name.
        stem = os.path.splitext(pdf_name)[0]
        xlsx_path = os.path.join(output_dir, stem + '.xlsx')  # output excel path
        pdf_path = os.path.join(root_dir, pdf_name)  # input pdf path
        try:
            PDF_parser(xlsx_path, pdf_path)  # parse the pdf
        except Exception as e:
            # Best-effort batch run: report the failure and continue with
            # the remaining files instead of aborting.
            print(f"Failed to parse {pdf_name}: {e}")

效果还行,但要注意:只能解析部分文档,扫描版的 PDF 无法解析。