#!/usr/bin/python3
#
# Python OCR PDF Extraction
# https://github.com/tesseract-ocr/tesseract
#
# sudo apt install tesseract-ocr
# sudo apt install libtesseract-dev
# pip install pytesseract PyPDF2 pdfplumber opencv-python pillow
# pip install pdf2image
# sudo apt-get install poppler-utils
# sudo apt-get install tesseract-ocr-chi-sim # Simplified Chinese
# sudo apt-get install tesseract-ocr-chi-tra # Traditional Chinese
# tesseract --list-langs  # confirm the language packs above are installed
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
import cv2
import numpy as np
from PIL import Image
# Location of the Tesseract binary that pytesseract should invoke.
# Adjust this if Tesseract is installed somewhere else on your machine.
TESSERACT_CMD = '/usr/bin/tesseract'
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD
def preprocess_image(pil_image):
    """
    Prepare a page image for OCR by binarizing it with OpenCV.

    The image is normalized to 3-channel RGB, reduced to grayscale and then
    thresholded with Otsu's method.

    Args:
        pil_image: A PIL image. Any mode that ``Image.convert("RGB")``
            accepts is supported (e.g. RGB, RGBA, L, P, 1).

    Returns:
        A 2-D ``uint8`` NumPy array whose pixels are either 0 or 255.
    """
    # Normalize the mode first: cv2.cvtColor needs a multi-channel array for
    # an RGB->gray conversion, so single-channel inputs (modes L, P, 1) would
    # otherwise raise.
    rgb_array = np.array(pil_image.convert("RGB"))
    # Convert straight from RGB to grayscale; an intermediate BGR copy adds
    # nothing because the same channel weights are applied either way.
    gray_image = cv2.cvtColor(rgb_array, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold is derived from the image histogram and
    # the `thresh` argument is ignored, so 0 is passed as a placeholder.
    _, thresh_image = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return thresh_image
def extract_text_from_pdf(pdf_path, lang='chi_sim', dpi=200):
    """
    Extract the text of a PDF, falling back to OCR for scanned documents.

    The embedded text layer is read first with PyPDF2. Only when that yields
    nothing but whitespace is every page rasterized, binarized with
    ``preprocess_image`` and passed through Tesseract.

    Args:
        pdf_path: Path to the PDF file.
        lang: Tesseract language code(s) for the OCR fallback, e.g.
            ``'chi_sim'``, ``'eng'`` or ``'chi_sim+eng'``. The matching
            language data must be installed.
        dpi: Resolution used when rasterizing pages for OCR. 200 matches
            pdf2image's default; higher values are slower and use more
            memory.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.
    """
    # First try the embedded text layer; extract_text() may return None.
    reader = PdfReader(pdf_path)
    text = "".join(page.extract_text() or "" for page in reader.pages)
    if text.strip():
        return text

    # Nothing usable was embedded: treat the file as a scan and OCR each page.
    ocr_parts = []
    for page_image in convert_from_path(pdf_path, dpi=dpi):
        binarized = preprocess_image(page_image)
        # The preprocessed page is a NumPy array; wrap it back into a PIL image
        # before handing it to Tesseract.
        ocr_parts.append(
            pytesseract.image_to_string(Image.fromarray(binarized), lang=lang)
        )
    # Keep any whitespace from the direct extraction in front of the OCR text.
    return text + "".join(ocr_parts)
# Example usage: run only when executed as a script, so that importing this
# module does not trigger PDF parsing/OCR as a side effect.
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the sample file name.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "scan_2025-01-02_09.31.pdf"
    extracted_text = extract_text_from_pdf(pdf_path)
    print(extracted_text)