import gradio as gr
import re
import os
from transformers import pipeline
from pypdf import PdfReader  # PDF text extraction

# --- 1. Load the model ---
print("Loading NER model...")
ner_pipeline = pipeline(
    "ner",
    model="uer/roberta-base-finetuned-cluener2020-chinese",
    aggregation_strategy="simple",
)


# --- 2. Core functions ---
def extract_money(text):
    """Extract monetary amounts with a regular expression."""
    money_entities = []
    pattern = r'([¥$€USDCNY人民币]*\s*\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*[万亿]?(?:元|美元|欧元|CNY|HKD)?)'
    matches = re.finditer(pattern, text)
    for match in matches:
        val = match.group(0).strip()
        # Keep a match only if it contains more than bare digits and separators
        # (a currency marker, unit, or decimal point); this filters out years,
        # clause numbers, and similar noise.
        if len(val) > 1 and (re.search(r'[^\d.,]', val) or '.' in val):
            money_entities.append(val)
    return money_entities


def run_ner_on_long_text(text):
    """Run NER on long text in fixed-size chunks so the BERT input-length limit is not exceeded."""
    chunk_size = 400
    all_results = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i : i + chunk_size]
        if not chunk.strip():
            continue
        try:
            # Note: entities spanning a chunk boundary may be missed.
            chunk_results = ner_pipeline(chunk)
            all_results.extend(chunk_results)
        except Exception as e:
            print(f"Error while processing a chunk: {e}")
            continue
    return all_results


def read_file_content(file_obj):
    """
    Detect the file type and extract its text.
    Supported: .txt, .pdf
    """
    content = ""
    try:
        # Depending on the Gradio version / component configuration, the handler may
        # receive either a plain file path (str) or an object with a .name attribute.
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == ".pdf":
            # PDF: extract the text of every page and concatenate it
            reader = PdfReader(file_path)
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    content += text + "\n"
        else:
            # Anything else is treated as plain text
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
    except Exception as e:
        return f"ERROR: failed to read the file ({str(e)})"
    return content


def extract_audit_info(contract_text, file_obj):
    # 1. Get the text (prefer the uploaded file, fall back to the textbox)
    content = ""
    if file_obj is not None:
        content = read_file_content(file_obj)
        if content.startswith("ERROR"):
            return content  # propagate the error message
    else:
        content = contract_text

    if not content or not content.strip():
        return (
            "⚠️ No text could be extracted. Please check that:\n"
            "1. The correct file was uploaded.\n"
            "2. If it is a PDF, it is a text-based PDF rather than a scanned image "
            "(scanned PDFs require OCR)."
        )

    # Limit analysis to the first 5,000 characters to keep memory usage bounded
    if len(content) > 5000:
        process_text = content[:5000]
        warning = f"(Note: the text has {len(content)} characters; only the first 5,000 were analysed)\n\n"
    else:
        process_text = content
        warning = ""

    # 2. NER model analysis (chunked)
    ner_results = run_ner_on_long_text(process_text)

    # 3. Regex-based amount extraction
    money_results = extract_money(process_text)

    # 4. Build the report
    output_str = f"=== 📊 Extraction Report ===\n{warning}"

    # Amounts
    output_str += "💰 Amounts involved:\n"
    money_results = sorted(set(money_results))  # de-duplicate and sort
    if money_results:
        for m in money_results:
            output_str += f"- {m}\n"
    else:
        output_str += "(none)\n"
    output_str += "\n"

    # Named entities
    label_map = {
        "organization": "🏢 Organization/Company",
        "company": "🏢 Company",
        "name": "👤 Person name",
        "time": "📅 Time/Date",
        "address": "📍 Address",
        "position": "💼 Position",
    }
    found_entities = {}
    for item in ner_results:
        group = item["entity_group"]
        word = item["word"]
        # Filter noise: keep entities with confidence > 0.4 and more than one character
        if item["score"] > 0.4 and len(word) > 1:
            display_label = label_map.get(group, group)
            if display_label not in found_entities:
                found_entities[display_label] = set()
            found_entities[display_label].add(word)

    for label, words in found_entities.items():
        output_str += f"{label}:\n"
        for w in words:
            output_str += f"- {w}\n"
        output_str += "\n"

    return output_str


# --- 3. Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 Smart Audit / Contract Information Extraction")
    gr.Markdown("Upload a **.txt** or **.pdf** file to automatically extract amounts, dates, company names, and more.")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Paste text directly", lines=8, placeholder="Paste the contract text here...")
            # file_types extended so that PDFs are accepted as well
            input_file = gr.File(label="Upload a file (PDF / TXT)", file_types=[".txt", ".pdf"])
            btn = gr.Button("🚀 Analyse", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Results", lines=15)

    btn.click(extract_audit_info, inputs=[input_text, input_file], outputs=output)

demo.launch()
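
# --- Optional: standalone check of the amount regex (not part of the app above) ---
# A minimal sketch for eyeballing the pattern used in extract_money() without
# downloading the model or launching Gradio. It duplicates the regex so it can be
# run on its own (e.g. pasted into a REPL or a separate file); it is kept in
# comments here because demo.launch() above blocks until the server is closed.
# The sample sentence is invented for illustration only.
#
#   import re
#   pattern = r'([¥$€USDCNY人民币]*\s*\d{1,3}(?:,\d{3})*(?:\.\d+)?\s*[万亿]?(?:元|美元|欧元|CNY|HKD)?)'
#   sample = "合同总价为人民币 1,200,000 元,另含服务费 35,000.50 美元。"
#   print([m.group(0).strip() for m in re.finditer(pattern, sample)])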