diff --git a/backend/app/api/v1/upload.py b/backend/app/api/v1/upload.py
index 2255de2..3f91c81 100644
--- a/backend/app/api/v1/upload.py
+++ b/backend/app/api/v1/upload.py
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
 router = APIRouter(prefix="/upload")
 
 # 支持的文件类型和大小限制
-# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
+# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、PPT、CSV、VTT、Properties
 ALLOWED_EXTENSIONS = {
     'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
-    'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
+    'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
 }
 
 MAX_FILE_SIZE = 15 * 1024 * 1024  # 15MB
diff --git a/backend/app/services/ai/knowledge_analysis_v2.py b/backend/app/services/ai/knowledge_analysis_v2.py
index ea0eeef..80bb6ea 100644
--- a/backend/app/services/ai/knowledge_analysis_v2.py
+++ b/backend/app/services/ai/knowledge_analysis_v2.py
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
         """
         提取文档内容
 
-        支持:PDF、Word(docx)、Excel(xlsx/xls)、文本文件
+        支持:PDF、Word(docx)、Excel(xlsx/xls)、PPT(pptx/ppt)、文本文件
         """
         suffix = file_path.suffix.lower()
 
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
             return await self._extract_docx_content(file_path)
         elif suffix in ['.xlsx', '.xls']:
             return await self._extract_excel_content(file_path)
+        elif suffix in ['.pptx', '.ppt']:
+            return await self._extract_ppt_content(file_path)
         elif suffix in ['.txt', '.md', '.text']:
             return await self._extract_text_content(file_path)
         else:
@@ -303,6 +305,75 @@ class KnowledgeAnalysisServiceV2:
             logger.error(f"Excel 文件读取失败: {e}")
             raise ValueError(f"Excel 文件读取失败: {e}")
 
+    async def _extract_ppt_content(self, file_path: Path) -> str:
+        """Extract the text of a PowerPoint (.pptx) file, slide by slide.
+
+        Each slide contributes a "【幻灯片 N】" header followed by the text of
+        its text-bearing shapes and one ' | '-joined line per non-empty table
+        row; a slide with no text gets a placeholder line. Shapes nested in
+        group shapes are visited too. The joined text is passed through
+        ``self._clean_content`` before it is returned.
+
+        Raises:
+            ValueError: for a legacy binary ``.ppt`` file (python-pptx only
+                reads the Open XML ``.pptx`` format), when python-pptx is not
+                installed, or when the presentation cannot be parsed.
+        """
+        # Reject legacy .ppt up front: python-pptx cannot open it, and the
+        # generic parse failure it would produce is not actionable for users.
+        if file_path.suffix.lower() == '.ppt':
+            raise ValueError("暂不支持旧版 .ppt 格式,请另存为 .pptx 后重新上传")
+
+        def iter_shapes(shapes):
+            """Yield shapes depth-first, descending into group shapes."""
+            for shape in shapes:
+                nested = getattr(shape, "shapes", None)
+                if nested is not None:
+                    yield from iter_shapes(nested)
+                else:
+                    yield shape
+
+        try:
+            from pptx import Presentation
+
+            prs = Presentation(str(file_path))
+            text_parts = []
+
+            for slide_num, slide in enumerate(prs.slides, 1):
+                slide_texts = []
+                text_parts.append(f"【幻灯片 {slide_num}】")
+
+                for shape in iter_shapes(slide.shapes):
+                    # Text boxes, titles and other text-bearing shapes.
+                    if getattr(shape, "has_text_frame", False):
+                        shape_text = shape.text_frame.text.strip()
+                        if shape_text:
+                            slide_texts.append(shape_text)
+
+                    # Tables: one line per row, empty cells skipped.
+                    if getattr(shape, "has_table", False):
+                        for row in shape.table.rows:
+                            row_text = ' | '.join(
+                                cell.text.strip() for cell in row.cells if cell.text.strip()
+                            )
+                            if row_text:
+                                slide_texts.append(row_text)
+
+                if slide_texts:
+                    text_parts.append('\n'.join(slide_texts))
+                else:
+                    text_parts.append("(无文本内容)")
+
+            content = '\n\n'.join(text_parts)
+            return self._clean_content(content)
+
+        except ImportError as e:
+            logger.error("python-pptx 未安装,无法读取 PPT 文件")
+            raise ValueError("服务器未安装 PPT 读取组件(python-pptx)") from e
+        except Exception as e:
+            logger.error(f"PPT 文件读取失败: {e}")
+            raise ValueError(f"PPT 文件读取失败: {e}") from e
+
     def _clean_content(self, content: str) -> str:
         """清理和截断内容"""
         # 移除多余空白
diff --git a/backend/requirements.txt b/backend/requirements.txt
index d551780..ad4e27a 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -51,9 +51,10 @@ openpyxl==3.1.2
 json-repair>=0.25.0
 jsonschema>=4.0.0
 
-# PDF 文档提取
+# 文档提取
 PyPDF2>=3.0.0
 python-docx>=1.0.0
+python-pptx>=0.6.21
 
 # 证书生成
 Pillow>=10.0.0