import os
import threading
import http.server
import socketserver
from pathlib import Path
from openai import OpenAI
from PIL import Image
import time
import logging
import fitz # PyMuPDF
import signal
import sys
import httpx
# ========== Logging setup ==========
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# ========== Configuration ==========
PDF_DIR = "./pdf"    # directory holding input PDF files
IMG_DIR = "./img"    # directory served over HTTP for rendered page images
OUT_DIR = "./out"    # directory for recognized-text output
OCR_IMAGE_PATH = "./img/ocr.png"  # fixed image path reused for every page

# API configuration
API_BASE = "http://localhost:8080/v1"  # llama-server endpoint
API_KEY = "sk-no-key-required"
MODEL_NAME = "Qwen3.6-27B"

# HTTP server configuration
# NOTE(review): this host is also embedded in the image URLs sent to the
# model; connecting to 0.0.0.0 happens to work on Linux but is fragile —
# confirm, or use an explicitly reachable address.
HTTP_SERVER_HOST = "0.0.0.0"
HTTP_SERVER_PORT = 8088

# Image and request tuning
ZOOM_FACTOR = 1.0        # render scale (1.0 = 72 DPI)
IMAGE_QUALITY = 95       # JPEG compression quality
REQUEST_TIMEOUT = 120.0  # per-request read timeout (seconds)
MAX_RETRIES = 3          # maximum retries per page
PAGE_DELAY = 0.5         # short pause after each processed page

# Global handle to the running HTTP server (None when stopped)
httpd = None

# Ensure working directories exist
Path(IMG_DIR).mkdir(parents=True, exist_ok=True)
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

# OpenAI-compatible client (minimal setup). The client's built-in retry
# machinery is disabled because retries are driven manually per page.
http_client = httpx.Client(
    timeout=httpx.Timeout(connect=10.0, read=REQUEST_TIMEOUT, write=10.0, pool=10.0)
)
client = OpenAI(
    base_url=API_BASE,
    api_key=API_KEY,
    http_client=http_client,
    max_retries=0  # we control retry logic ourselves
)
# ========== HTTP 服务器部分 (保留原有结构) ==========
class ImageHTTPHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file handler rooted at IMG_DIR, with CORS and quiet logging."""

    def __init__(self, *args, **kwargs):
        # Serve files from the image directory instead of the CWD.
        super().__init__(*args, directory=IMG_DIR, **kwargs)

    def end_headers(self):
        # Allow any origin to fetch the page image.
        self.send_header('Access-Control-Allow-Origin', '*')
        # no-cache is sufficient here: stale images are also avoided by the
        # ?t= timestamp query parameter appended to every image URL.
        self.send_header('Cache-Control', 'no-cache')
        super().end_headers()

    def log_message(self, format, *args):
        """Suppress per-request access logs to keep the console clean."""
class ReuseAddressTCPServer(socketserver.TCPServer):
    """TCPServer with SO_REUSEADDR set, so a restart can rebind the port
    immediately instead of waiting out the TIME_WAIT state."""

    allow_reuse_address = True
def start_http_server():
    """Start the background HTTP server that serves IMG_DIR.

    Returns True once the daemon server thread is running, or False when
    the address could not be bound.
    """
    global httpd
    try:
        httpd = ReuseAddressTCPServer((HTTP_SERVER_HOST, HTTP_SERVER_PORT), ImageHTTPHandler)
    except OSError as e:
        logger.error(f"HTTP 服务器启动失败: {e}")
        return False
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    logger.info(f"HTTP 服务器已启动: http://{HTTP_SERVER_HOST}:{HTTP_SERVER_PORT}")
    return True
def stop_http_server():
    """Shut down and release the image HTTP server, if one is running."""
    global httpd
    if httpd is None:
        return
    httpd.shutdown()
    httpd.server_close()
    httpd = None
def signal_handler(_signum, _frame):
    """SIGINT handler: stop the HTTP server, then exit with status 0."""
    logger.info("\n正在优雅关闭...")
    stop_http_server()
    sys.exit(0)
# ========== 核心处理逻辑 ==========
def pdf_to_single_image(pdf_path: str, page_num: int, output_path: str):
    """Render one PDF page (1-based ``page_num``) to an image file.

    The page is saved to ``output_path`` as JPEG bytes (note: the configured
    path ends in ``.png`` but the content is JPEG — kept as-is so the
    serving URL stays stable).

    Returns the PIL Image on success, or None when ``page_num`` is out of
    range or rendering fails.
    """
    doc = fitz.open(pdf_path)
    # Fix: close the document in a finally block — previously an exception
    # raised by page indexing or Image.save could leak the open handle.
    try:
        if page_num < 1 or page_num > len(doc):
            return None
        page = doc[page_num - 1]
        zoom_matrix = fitz.Matrix(ZOOM_FACTOR, ZOOM_FACTOR)
        pix = page.get_pixmap(matrix=zoom_matrix, colorspace=fitz.csRGB, alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        img.save(output_path, "JPEG", quality=IMAGE_QUALITY, optimize=True)
        return img
    except Exception as e:
        logger.error(f" 第 {page_num} 页转换图片失败: {e}")
        return None
    finally:
        doc.close()
def recognize_image(image_url: str, page_num: int) -> tuple:
    """Ask the vision model to transcribe one page image.

    Returns ``(text, elapsed_seconds)``. On repeated failure an error
    placeholder string is returned instead of raising.

    The system prompt is deliberately a fixed constant (no random salt), so
    llama.cpp can hit its prompt cache for the shared prefix on every page.
    """
    system_prompt = "请识别图片中的文本,保持原文排版顺序。只输出识别到的文字,不需解释。"
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "请识别文本,只要正文,不作解释。"},
            ],
        },
    ]
    for attempt in range(MAX_RETRIES + 1):
        started = time.time()
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                max_tokens=4096,
                temperature=0.1,  # very low temperature keeps output stable
            )
            elapsed = time.time() - started
            text = response.choices[0].message.content
        except Exception as e:
            elapsed = time.time() - started
            logger.warning(f" ⚠ 第 {page_num} 页请求失败 (尝试 {attempt+1}/{MAX_RETRIES+1}): {e}")
            if attempt >= MAX_RETRIES:
                # Out of retries — hand back a placeholder the caller can detect.
                return f"[第 {page_num} 页识别出错: {str(e)}]", elapsed
            # Linearly growing back-off before the next attempt.
            time.sleep(3.0 * (attempt + 1))
            continue
        if not text:
            return f"[第 {page_num} 页识别结果为空]", elapsed
        logger.info(f" ✓ 第 {page_num} 页识别成功,耗时: {elapsed:.2f}s")
        return text, elapsed
def process_pdf(pdf_path: str):
    """OCR every page of one PDF and write the combined text to OUT_DIR.

    Each page is rendered to the shared OCR image path, served to the model
    via the local HTTP server, and transcribed. The output ``.txt`` ends
    with a statistics footer (total/average time and any failed pages).
    """
    pdf_name = Path(pdf_path).stem
    logger.info(f"\n{'='*50}\n开始处理: {pdf_name}.pdf")
    # Open only to read the page count; fix: close deterministically even
    # if len() raises.
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)
    finally:
        doc.close()
    all_text_pages = []
    page_times = []
    error_pages = []
    pdf_start_time = time.time()
    for page_num in range(1, total_pages + 1):
        # Millisecond timestamp in the URL defeats any stale client-side cache.
        image_url = f"http://{HTTP_SERVER_HOST}:{HTTP_SERVER_PORT}/ocr.png?t={int(time.time()*1000)}"
        # 1. Render the page to the shared image file.
        img = pdf_to_single_image(pdf_path, page_num, OCR_IMAGE_PATH)
        if img is None:
            all_text_pages.append(f"【第 {page_num} 页】\n[转换失败]\n")
            error_pages.append(page_num)
            continue
        time.sleep(0.1)  # give the disk write a moment to settle
        # 2. Submit for recognition.
        page_text, recog_time = recognize_image(image_url, page_num)
        # Console preview: first 100 chars with newlines collapsed.
        compact_text = page_text.strip().replace('\n', ' ')
        preview_text = compact_text[:100]
        # Fix: decide on the ellipsis from the compacted text actually being
        # truncated, not from the raw text's length (which counts stripped
        # whitespace/newlines and could mislabel the preview).
        print(f"{preview_text}{'...' if len(compact_text) > 100 else ''}")
        if "识别出错" in page_text:
            error_pages.append(page_num)
        all_text_pages.append(f"\n{page_text}\n")
        page_times.append(recog_time)
        # Brief pause between pages.
        time.sleep(PAGE_DELAY)
    # Persist results with a statistics footer.
    total_elapsed = time.time() - pdf_start_time
    avg_time = sum(page_times) / len(page_times) if page_times else 0
    final_text = "\n".join(all_text_pages)
    output_path = os.path.join(OUT_DIR, f"{pdf_name}.txt")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
        f.write(f"\n\n{'='*50}\n【识别统计】\n总耗时: {total_elapsed:.2f} 秒\n平均每页: {avg_time:.2f} 秒\n")
        if error_pages:
            f.write(f"\n【错误页面】\n{', '.join(map(str, error_pages))}\n")
    logger.info(f"\n--- {pdf_name}.pdf 处理完成 ---")
    logger.info(f"平均速度: {avg_time:.2f}s/页。结果已保存至: {output_path}")
def main():
    """Entry point: start the image server, then OCR every PDF in PDF_DIR."""
    signal.signal(signal.SIGINT, signal_handler)
    if not start_http_server():
        return
    pdf_dir = Path(PDF_DIR)
    # Pick up both lowercase and uppercase extensions.
    pdf_files = [*pdf_dir.glob("*.pdf"), *pdf_dir.glob("*.PDF")]
    if not pdf_files:
        logger.warning(f"目录 {PDF_DIR} 中未找到 PDF 文件")
        stop_http_server()
        return
    logger.info(f"找到 {len(pdf_files)} 个 PDF 文件准备处理...")
    total = len(pdf_files)
    for position, pdf_file in enumerate(pdf_files, start=1):
        try:
            process_pdf(str(pdf_file))
        except Exception as e:
            # One broken PDF must not abort the whole batch.
            logger.error(f"处理 {pdf_file.name} 时发生严重错误: {e}")
        else:
            if position < total:
                time.sleep(2)
    logger.info("\n全部任务执行完毕!")
    stop_http_server()
# Script entry point: only run the pipeline when executed directly.
if __name__ == "__main__":
    main()