From c3aa4e85e776d484c67af6e1ea6add4ed5e2f2a0 Mon Sep 17 00:00:00 2001
From: yuliang_guo
Date: Sat, 31 Jan 2026 11:49:10 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0PPT/PPTX=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E7=B1=BB=E5=9E=8B=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. upload.py: 添加ppt/pptx到允许上传的文件类型
2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content
3. requirements.txt: 添加python-pptx依赖
---
 backend/app/api/v1/upload.py                  |  4 +-
 .../app/services/ai/knowledge_analysis_v2.py  | 47 ++++++++++++++++++-
 backend/requirements.txt                      |  3 +-
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/backend/app/api/v1/upload.py b/backend/app/api/v1/upload.py
index 2255de2..3f91c81 100644
--- a/backend/app/api/v1/upload.py
+++ b/backend/app/api/v1/upload.py
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
 router = APIRouter(prefix="/upload")
 
 # 支持的文件类型和大小限制
-# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
+# Supported formats: TXT, Markdown, MDX, PDF, HTML, Excel, Word, PPT, CSV, VTT, Properties
 ALLOWED_EXTENSIONS = {
     'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
-    'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
+    'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
 }
 
 MAX_FILE_SIZE = 15 * 1024 * 1024  # 15MB
diff --git a/backend/app/services/ai/knowledge_analysis_v2.py b/backend/app/services/ai/knowledge_analysis_v2.py
index ea0eeef..80bb6ea 100644
--- a/backend/app/services/ai/knowledge_analysis_v2.py
+++ b/backend/app/services/ai/knowledge_analysis_v2.py
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
         """
         提取文档内容
 
-        支持:PDF、Word(docx)、Excel(xlsx/xls)、文本文件
+        支持:PDF、Word(docx)、Excel(xlsx/xls)、PPT(pptx/ppt)、文本文件
         """
         suffix = file_path.suffix.lower()
 
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
             return await self._extract_docx_content(file_path)
         elif suffix in ['.xlsx', '.xls']:
             return await self._extract_excel_content(file_path)
+        elif suffix in ['.pptx', '.ppt']:
+            return await self._extract_ppt_content(file_path)
         elif suffix in ['.txt', '.md', '.text']:
             return await self._extract_text_content(file_path)
         else:
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
             logger.error(f"Excel 文件读取失败: {e}")
             raise ValueError(f"Excel 文件读取失败: {e}")
 
+    async def _extract_ppt_content(self, file_path: Path) -> str:
+        """Extract the text of a PowerPoint (.pptx) file, slide by slide.
+
+        Returns the cleaned text: a "【幻灯片 N】" heading per slide followed by
+        its shape text and table rows. Raises ValueError for legacy binary .ppt
+        files, a missing python-pptx install, or any parsing failure.
+        """
+        # python-pptx only reads Open XML (.pptx); reject binary .ppt up front.
+        if file_path.suffix.lower() == '.ppt':
+            raise ValueError("PPT 文件读取失败: 不支持旧版 .ppt 格式,请另存为 .pptx 后重试")
+        try:
+            from pptx import Presentation
+
+            prs = Presentation(str(file_path))
+            text_parts = []
+
+            for slide_num, slide in enumerate(prs.slides, 1):
+                slide_texts = []
+                text_parts.append(f"【幻灯片 {slide_num}】")
+
+                for shape in slide.shapes:
+                    # Text boxes, titles and other text-bearing shapes.
+                    shape_text = getattr(shape, "text", "").strip()
+                    if shape_text:
+                        slide_texts.append(shape_text)
+
+                    # Tables: one "a | b" line per row, empty cells skipped.
+                    if shape.has_table:
+                        for row in shape.table.rows:
+                            cells = (cell.text.strip() for cell in row.cells)
+                            row_text = ' | '.join(c for c in cells if c)
+                            if row_text:
+                                slide_texts.append(row_text)
+
+                text_parts.append('\n'.join(slide_texts) or "(无文本内容)")
+            return self._clean_content('\n\n'.join(text_parts))
+        except ImportError as e:
+            logger.error("python-pptx 未安装,无法读取 PPT 文件")
+            raise ValueError("服务器未安装 PPT 读取组件(python-pptx)") from e
+        except Exception as e:
+            logger.error(f"PPT 文件读取失败: {e}")
+            raise ValueError(f"PPT 文件读取失败: {e}") from e
+
     def _clean_content(self, content: str) -> str:
         """清理和截断内容"""
         # 移除多余空白
diff --git a/backend/requirements.txt b/backend/requirements.txt
index d551780..ad4e27a 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -51,9 +51,10 @@ openpyxl==3.1.2
 json-repair>=0.25.0
 jsonschema>=4.0.0
 
-# PDF 文档提取
+# Document text extraction (PDF / Word / PowerPoint)
 PyPDF2>=3.0.0
 python-docx>=1.0.0
+python-pptx>=0.6.21
 
 # 证书生成
 Pillow>=10.0.0