paddlenlp关键词提取 python提取pdf关键词

转载

mob64ca13fd163c 2023-10-09 14:59:06

文章标签 paddlenlp关键词提取 python pandas 数据分析 excel 文章分类 NLP 人工智能

import os
import pandas as pd
import PyPDF2
import re
from concurrent.futures import ThreadPoolExecutor
import multiprocessing

# Helpers and entry point for extracting the conclusion text from a PDF file.
def _first_int(text):
    """Return the first run of digits in *text* as an int, or None if absent."""
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None


def _conclusion_via_directory(content, page_texts, i):
    """Locate the conclusion pages through the 'Directory' listing.

    Args:
        content: Text of the whole document (all pages concatenated).
        page_texts: Per-page text, in page order.
        i: Row index of the file; ``str(i + 2)`` is the entry searched for
            inside the directory text.

    Returns:
        The text from the first 'conclusion' on the selected pages onwards,
        or '' when any step of the lookup fails.
    """
    directory_index = content.find('Directory')
    if directory_index == -1:
        return ''
    directory_content = content[directory_index:]

    page_number_index = directory_content.find(str(i + 2))
    if page_number_index == -1:
        return ''

    # Parse the start page from the 'pagenum' line that follows the entry.
    # str.find() returns -1 on a miss, so never fold that sentinel into an
    # offset: without a 'pagenum' marker, parse from the matched entry itself.
    line_start = directory_content.find('pagenum', page_number_index)
    if line_start == -1:
        line_start = page_number_index
    line_end = directory_content.find('\n', line_start)
    if line_end == -1:
        line_end = len(directory_content)

    start_page = _first_int(directory_content[line_start:line_end])
    if start_page is None:
        return ''

    # The first number after that line is taken as the end page. If there is
    # no later number to bound the range, read through the last page.
    end_page = _first_int(directory_content[line_end + 1:])
    if end_page is None:
        end_page = len(page_texts)

    # Page numbers are 1-based; clamp so a bogus number parsed from the text
    # can neither wrap around (negative index) nor run past the document.
    first = max(start_page - 1, 0)
    last = min(end_page, len(page_texts))
    target_content = ''.join(page_texts[first:last])

    conclusion_index = target_content.find('conclusion')
    return target_content[conclusion_index:] if conclusion_index != -1 else ''


def extract_content(pdf_folder, pdf_filename, i):
    """Return the conclusion-like text of one PDF, or '' if none is found.

    If the document text contains 'conclusion', the section is located via
    the 'Directory' listing (see ``_conclusion_via_directory``). Otherwise the
    text from the first 'summary' onwards is returned. All searches are
    case-sensitive.

    Args:
        pdf_folder: Folder that contains the PDF files.
        pdf_filename: File name of the PDF inside *pdf_folder*.
        i: Row index of the file; ``i + 2`` is the number searched for in the
            directory listing.

    Returns:
        The extracted text, or '' when the file is missing, the name is not a
        usable path (e.g. a blank spreadsheet cell read as NaN), or nothing
        matched.
    """
    if not isinstance(pdf_filename, (str, os.PathLike)):
        return ''
    file_path = os.path.join(pdf_folder, pdf_filename)
    # isfile() rather than exists(): an empty file name resolves to the folder
    # itself, which exists but cannot be opened as a PDF.
    if not os.path.isfile(file_path):
        return ''

    with open(file_path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # PdfReader.read() does not return the document text; the text has to
        # be gathered page by page while the file is still open.
        # `or ''` guards against a page that yields no text.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    content = ''.join(page_texts)

    if 'conclusion' in content:
        return _conclusion_via_directory(content, page_texts, i)

    # NOTE: the original also searched for 'in conclusion' here, but that can
    # never match once 'conclusion' itself is absent, so only the 'summary'
    # fallback remains.
    summary_index = content.find('summary')
    return content[summary_index:] if summary_index != -1 else ''

# Load the source workbook; its pdf_filename column lists the PDFs to process.
df = pd.read_excel(r'C:\Users\win10\Desktop\工作簿1.xlsx')
pdf_filenames = df['pdf_filename']

# Add a conclusion column, initially empty, to receive the extracted text.
df['conclusion'] = ''

# Size the pool to the CPU count reported by multiprocessing.
max_workers = multiprocessing.cpu_count()

# Folder that holds the PDF files named in the workbook.
pdf_folder = 'C:/Users/win10/Desktop/pdf_files'

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Queue one extraction task per workbook row, in row order.
    futures = [
        executor.submit(extract_content, pdf_folder, pdf_filename, i)
        for i, pdf_filename in enumerate(pdf_filenames)
    ]

    # Collect results in submission order so each lands on its own row.
    for row, task in enumerate(futures):
        df.loc[row, 'conclusion'] = task.result()

# Write the augmented table to a new workbook.
df.to_excel(r'C:\Users\win10\Desktop\工作簿1_mod.xlsx', index=False)

在每次处理 PDF 文件时，循环读取每一页并逐页提取内容会导致循环次数较多。可以考虑一次性读取整个 PDF 文件的内容，然后在内存中查找所需内容。这样只需要进行一次循环就可以完成所有内容的提取，从而减少循环次数。
具体做法是，在打开 PDF 文件后，用 PyPDF2 的 PdfReader 对象一次性把所有页面的文本提取出来，并拼接成一个字符串变量。需要注意：PdfReader 的 read() 方法并不会返回全文文本，正确的做法是遍历 pdf_reader.pages，对每一页调用 extract_text()，再把结果拼接起来。然后在这个字符串中查找所需内容，而不是在每次查找时逐页读取。

# Open the PDF with PyPDF2 so its content can be read in one go.
# NOTE(review): this line only creates the reader object; `f` is expected to be
# the binary file handle opened by the surrounding code (as in extract_content).

pdf_reader = PyPDF2.PdfReader(f)

利用多进程或者多线程并发处理：可以考虑使用Python自带的multiprocessing或者第三方库concurrent.futures等支持多进程或者多线程并发处理的工具库，将不同的任务分配给不同的线程或进程执行，可以显著提高程序的运行效率。
具体来说，如果想要利用多进程或者多线程并发处理，可以将遍历PDF文件部分的代码封装成一个函数，然后通过Pool（进程池）或者ThreadPoolExecutor（线程池）的方式来启动多个子进程或子线程。例如：

from concurrent.futures import ThreadPoolExecutor
import multiprocessing

# Size the pool to the CPU count reported by multiprocessing.
max_workers = multiprocessing.cpu_count()

# Folder that holds the PDF files named in the workbook.
pdf_folder = 'C:/Users/win10/Desktop/pdf_files'

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Queue one extraction task per file name, in order.
    futures = [
        executor.submit(extract_content, pdf_folder, pdf_filename, i)
        for i, pdf_filename in enumerate(pdf_filenames)
    ]

    # Collect results in submission order so each lands on its own row.
    for row, task in enumerate(futures):
        df.loc[row, 'conclusion'] = task.result()

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。