Replies: 6 comments 5 replies
-
你好。通过命令行调用 Umi-OCR.exe 指令时,参数与UI界面的设置一致,比如截图指令沿用截图标签页的设置。但是,通过HTTP接口来调用时,设置是独立的,只能通过HTTP传入的参数字典(即options项)控制。UI界面能控制的参数,HTTP接口也全部能够控制,并且效果是相同的(底层沿用同一套任务流程),理论上不会出现界面执行正常而接口不正常的情况。如果有这种情况,大概率是参数设置不正确。你可以放一下具体的问题和你的调用代码,我们一起分析下。 |
Beta Was this translation helpful? Give feedback.
-
import os
from pdf2image import convert_from_path
import requests
import base64
import os
import pandas as pd
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from datetime import datetime
import shutil
import random
import openai
from openai import OpenAI
import requests
import json
import numpy as np
import re
import os
import re
import os
from tkinter import Tk
from tkinter.filedialog import askdirectory
# Folder that holds the PDF files to convert.
pdf_folderPath = r'C:\Users\bmqys\PycharmProjects\pythonProject5\全部文件'
# Folder that receives the conversion results.
output_conversionPath = r'C:\Users\bmqys\PycharmProjects\pythonProject5\全部文件\转换结果'
def pdf_to_images(pdf_path, output_folder, poppler_path=None, dpi=300):
    """Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_folder: Directory that receives the page images. It is
            created when it does not exist yet.
        poppler_path: Optional value forwarded unchanged to
            ``convert_from_path`` as ``poppler_path``.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        A list with the paths of the saved PNG files, in page order. Each
        file is named ``<pdf file name>_page_<n>.png`` with ``n`` starting
        at 1.
    """
    # An empty string means "current directory"; os.makedirs('') would raise.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    pages = convert_from_path(pdf_path, dpi=dpi, poppler_path=poppler_path)
    pdf_name = os.path.basename(pdf_path)

    image_paths = []
    for page_number, page in enumerate(pages, start=1):
        image_path = os.path.join(
            output_folder, f"{pdf_name}_page_{page_number}.png"
        )
        page.save(image_path, "PNG")
        image_paths.append(image_path)
    return image_paths
def ocr_image(image_path):
    """Run OCR on a single image through the Umi-OCR HTTP service.

    The image file is read, Base64-encoded and POSTed as JSON to the local
    ``/api/ocr`` endpoint.

    Returns:
        The ``data`` field of the JSON reply when the HTTP status is 200 and
        the reply's ``code`` is 100; an empty string in every other case.
    """
    endpoint = 'http://127.0.0.1:1224/api/ocr'

    # Load the image and encode it for transport inside the JSON body.
    with open(image_path, 'rb') as image_file:
        raw_bytes = image_file.read()
    image_b64 = base64.b64encode(raw_bytes).decode('utf-8')

    ocr_options = {
        "ocr.language": "models/config_chinese.txt",
        "ocr.cls": "true",
        "tbpu.parser": "multi_para",
        # Presumably selects plain-text output -- confirm against the API docs.
        "data.format": "text",
        # NOTE(review): "doc.extractionMode" looks like a document-pipeline
        # option; confirm that the image endpoint /api/ocr honours it.
        "doc.extractionMode": "mixed",
    }
    payload = {"base64": image_b64, "options": ocr_options}

    reply = requests.post(endpoint, json=payload)
    if reply.status_code != 200:
        return ''

    body = reply.json()
    if body.get('code') != 100:
        return ''
    return body.get('data', '')
def process_folder(pdf_folder, output_folder, poppler_path=None):
    """Convert every PDF file in a folder to a TXT file via OCR.

    For each PDF directly inside ``pdf_folder`` the pages are rendered to
    images with ``pdf_to_images``, each image is passed to ``ocr_image``,
    and the page texts (one trailing newline per page) are written to
    ``<output_folder>/<pdf base name>.txt``.

    Page images are stored in ``<output_folder>/temp_images``; that folder
    is removed again after each PDF, whether or not processing succeeded.

    Args:
        pdf_folder: Directory that is scanned (non-recursively) for PDFs.
        output_folder: Directory for the TXT results; created if missing.
        poppler_path: Optional value forwarded to ``pdf_to_images``.

    A failure while handling one PDF is printed and does not stop the
    remaining files from being processed.
    """
    os.makedirs(output_folder, exist_ok=True)
    temp_image_folder = os.path.join(output_folder, "temp_images")

    for file_name in os.listdir(pdf_folder):
        # Compare case-insensitively so "SCAN.PDF" is not silently skipped.
        if not file_name.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_folder, file_name)
        txt_path = os.path.join(
            output_folder, f"{os.path.splitext(file_name)[0]}.txt"
        )
        os.makedirs(temp_image_folder, exist_ok=True)
        print(f"正在处理文件: {pdf_path}")
        try:
            image_paths = pdf_to_images(
                pdf_path, temp_image_folder, poppler_path=poppler_path
            )
            # One OCR call per page; join once instead of repeated "+=".
            full_text = "".join(
                ocr_image(image_path) + "\n" for image_path in image_paths
            )
            with open(txt_path, 'w', encoding='utf-8') as txt_file:
                txt_file.write(full_text)
            print(f"文件处理完成: {txt_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next PDF.
            print(f"处理文件时发生错误: {pdf_path}, 错误信息: {e}")
        finally:
            # Always drop the page images, even when processing was interrupted.
            shutil.rmtree(temp_image_folder)
# Inputs for the conversion run: source PDF folder and destination folder.
pdf_folder = f"{pdf_folderPath}"
output_folder = output_conversionPath
process_folder(pdf_folder, output_folder)
这个是我的代码 |
Beta Was this translation helpful? Give feedback.
-
账单.txt |
Beta Was this translation helpful? Give feedback.
-
不好意思久等了。 你的exe界面可否截图给我看看?PDF识别标签页的设置栏。 检查下与你的设置项是否一致。
|
Beta Was this translation helpful? Give feedback.
-
新年好。 观察 我认为很可能原PDF是文字混合型的(即PDF中写入了文本数据,而不是纯图片)。根据你的exe截图,当使用 你可以尝试用WPS等PDF阅读器打开此PDF(注意鼠标切换成箭头而不是手型),看看能否框选、复制文字;如果可以,说明确实是混合型文件。 或者,你可以在Umi exe中将内容提取模式设为 在这种情况下,你依然可以使用Umi HTTP 文档识别流程 来实现直接从PDF中提取文本。 第1步上传文件时,可以使用以下参数:
第3步获取下载链接时,可以使用以下参数:
|
Beta Was this translation helpful? Give feedback.
-
久等了,这是调用代码,已帮你封装为单个函数,可直接调用,效果与UI面板操作相同。 点击展开函数调用示例:
from 下面的代码 import extract_pdf_to_txt
extract_pdf_to_txt(
pdf_path=r"C:\Users\My\Desktop\test\23442.pdf", # 原文件
txt_path=r"C:\Users\My\Desktop\test\提取文本.txt", # 保存文件路径
)
封装函数:
# https://github.com/hiroi-sora/Umi-OCR/blob/main/docs/http/api_doc.md#/api/doc
import os
import json
import time
import requests
# 封装函数:提取pdf文件中的文本,保存为txt文件
def extract_pdf_to_txt(
    pdf_path,  # path of the source PDF
    txt_path,  # path of the TXT file to write
    # Base URL of the Umi-OCR HTTP service
    base_url="http://127.0.0.1:1224",
    # Proxies passed to every request; empty by default.
    # This function only reads the dict, so the shared default is never modified here.
    proxies={"http": None, "https": None},
):
    """Extract the text of a PDF through the Umi-OCR document API.

    The function walks through five HTTP steps against ``base_url``:

    1. ``/api/doc/upload``   - upload the PDF with
       ``doc.extractionMode = "textOnly"`` and receive a task ID.
    2. ``/api/doc/result``   - poll once per second until ``is_done``.
    3. ``/api/doc/download`` - request a ``txtPlain`` result file.
    4. Download that file to ``txt_path`` (an existing file is replaced and
       missing parent directories are created).
    5. ``/api/doc/clear/<id>`` - remove the task on the server.

    Progress information is printed to stdout.

    Args:
        pdf_path: Path of the PDF to upload.
        txt_path: Destination path for the extracted text.
        base_url: Base URL of the Umi-OCR HTTP service.
        proxies: ``proxies`` argument handed to every ``requests`` call.

    Raises:
        AssertionError: A reply's ``code`` is not 100, or the finished task's
            ``state`` is not ``"success"``.
        requests.HTTPError: A request returned an HTTP error status
            (via ``raise_for_status``).
        OSError: The PDF cannot be read or the TXT file cannot be written.

    NOTE: the reply checks are ``assert`` statements, kept so that callers
    catching ``AssertionError`` keep working; they are skipped when Python
    runs with ``-O``.
    """
    url = "{}/api/doc/upload".format(base_url)
    print("=======================================")
    print("===== 1. Upload file, get task ID =====")
    print("== URL:", url)
    # Task parameters: only copy the text already embedded in the PDF,
    # without running OCR.
    options_json = json.dumps(
        {
            "doc.extractionMode": "textOnly",
        }
    )
    with open(pdf_path, "rb") as file:
        response = requests.post(
            url, files={"file": file}, data={"json": options_json}, proxies=proxies
        )
    response.raise_for_status()
    res_data = json.loads(response.text)
    if res_data["code"] == 101:
        # If code == 101, it indicates that the server did not receive the uploaded file.
        # On some Linux systems, if file_name contains non-ASCII characters, this error might occur.
        # In this case, we can specify a temp_name containing only ASCII characters to construct the upload request.
        file_name = os.path.basename(pdf_path)
        _, file_suffix = os.path.splitext(file_name)
        temp_name = "temp" + file_suffix
        print("[Warning] Detected file upload failure: code == 101")
        print(
            "Attempting to use temp_name",
            temp_name,
            "instead of the original file_name",
            file_name,
        )
        with open(pdf_path, "rb") as file:
            response = requests.post(
                url,
                # use temp_name to construct the upload request
                files={"file": (temp_name, file)},
                data={"json": options_json},
                proxies=proxies,
            )
        response.raise_for_status()
        res_data = json.loads(response.text)
    assert res_data["code"] == 100, "Task submission failed: {}".format(res_data)
    task_id = res_data["data"]
    print("Task ID:", task_id)

    url = "{}/api/doc/result".format(base_url)
    print("===================================================")
    print("===== 2. Poll task status until OCR task ends =====")
    print("== URL:", url)
    headers = {"Content-Type": "application/json"}
    data_str = json.dumps(
        {
            "id": task_id,
            "is_data": True,
            "format": "text",
            "is_unread": True,
        }
    )
    while True:
        time.sleep(1)
        response = requests.post(url, data=data_str, headers=headers, proxies=proxies)
        response.raise_for_status()
        res_data = json.loads(response.text)
        assert res_data["code"] == 100, "Failed to get task status: {}".format(res_data)
        print(
            " Progress: {}/{}".format(
                res_data["processed_count"], res_data["pages_count"]
            )
        )
        if res_data["data"]:
            print("{}\n========================".format(res_data["data"]))
        if res_data["is_done"]:
            state = res_data["state"]
            assert state == "success", "Task execution failed: {}".format(
                res_data["message"]
            )
            print("OCR task completed.")
            break

    url = "{}/api/doc/download".format(base_url)
    print("======================================================")
    print("===== 3. Generate target file, get download link =====")
    print("== URL:", url)
    # Download file parameters
    download_options = {"file_types": ["txtPlain"]}
    download_options["id"] = task_id
    data_str = json.dumps(download_options)
    response = requests.post(url, data=data_str, headers=headers, proxies=proxies)
    response.raise_for_status()
    res_data = json.loads(response.text)
    assert res_data["code"] == 100, "Failed to get download URL: {}".format(res_data)

    url = res_data["data"]
    print("===================================")
    print("===== 4. Download target file =====")
    print("== URL:", url)
    # Save location for downloaded files
    download_path = txt_path
    # Remove a previous result so the new file replaces it.
    if os.path.isfile(download_path):
        os.remove(download_path)
    # Create the parent directory when txt_path has one. A bare file name has
    # an empty dirname, and os.makedirs('') would raise.
    parent_dir = os.path.dirname(download_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager releases the streamed connection when done.
    with requests.get(url, stream=True, proxies=proxies) as response:
        response.raise_for_status()
        # Download file size (0 when the server sends no Content-Length)
        total_size = int(response.headers.get("content-length", 0))
        downloaded_size = 0
        log_size = 10485760  # Print progress every 10MB
        with open(download_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                file.write(chunk)
                downloaded_size += len(chunk)
                if downloaded_size >= log_size:
                    log_size = downloaded_size + 10485760
                    downloaded_mb = int(downloaded_size / 1048576)
                    if total_size > 0:
                        progress = (downloaded_size / total_size) * 100
                        print(
                            " Downloading file: {}MB | Progress: {:.2f}%".format(
                                downloaded_mb, progress
                            )
                        )
                    else:
                        # Total size unknown: report bytes only instead of
                        # dividing by zero.
                        print(" Downloading file: {}MB".format(downloaded_mb))
    print("Target file downloaded successfully: ", download_path)

    url = "{}/api/doc/clear/{}".format(base_url, task_id)
    print("============================")
    print("===== 5. Clean up task =====")
    print("== URL:", url)
    response = requests.get(url, proxies=proxies)
    response.raise_for_status()
    res_data = json.loads(response.text)
    assert res_data["code"] == 100, "Task cleanup failed: {}".format(res_data)
    print("Task cleaned up successfully.")
    print("======================\nProcess completed.")
Beta Was this translation helpful? Give feedback.
-
我也是今天发现的一点问题,所以想要请教一下如何去调整命令。我是按照文档里面去做:
options然后里面就是选择模式之类的。我认为这个请求应该是对应了使用 .exe 程序时对应的设置,可是实际应用时仍然会出现识别内容错位的问题。希望可以求得一些帮助。
Beta Was this translation helpful? Give feedback.
All reactions