feat: 添加PPT/PPTX文件类型支持

1. upload.py: 添加ppt/pptx到允许上传的文件类型
2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content
3. requirements.txt: 添加python-pptx依赖
2026-01-31 11:49:10 +08:00
parent 4e817f6eef
commit c3aa4e85e7
3 changed files with 50 additions and 4 deletions
--- a/backend/app/api/v1/upload.py
+++ b/backend/app/api/v1/upload.py
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
 router = APIRouter(prefix="/upload")

 # 支持的文件类型和大小限制
-# 支持格式：TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
+# 支持格式：TXT、Markdown、MDX、PDF、HTML、Excel、Word、PPT、CSV、VTT、Properties
 ALLOWED_EXTENSIONS = {
    'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
-    'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
+    'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
 }
 MAX_FILE_SIZE = 15 * 1024 * 1024  # 15MB

--- a/backend/app/services/ai/knowledge_analysis_v2.py
+++ b/backend/app/services/ai/knowledge_analysis_v2.py
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
        """
        提取文档内容
        
-        支持：PDF、Word（docx）、Excel（xlsx/xls）、文本文件
+        支持：PDF、Word（docx）、Excel（xlsx/xls）、PPT（pptx/ppt）、文本文件
        """
        suffix = file_path.suffix.lower()
        
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
                return await self._extract_docx_content(file_path)
            elif suffix in ['.xlsx', '.xls']:
                return await self._extract_excel_content(file_path)
+            elif suffix in ['.pptx', '.ppt']:
+                return await self._extract_ppt_content(file_path)
            elif suffix in ['.txt', '.md', '.text']:
                return await self._extract_text_content(file_path)
            else:
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
            logger.error(f"Excel 文件读取失败: {e}")
            raise ValueError(f"Excel 文件读取失败: {e}")
    
+    async def _extract_ppt_content(self, file_path: Path) -> str:
+        """Extract the text of a PowerPoint (.pptx) file, slide by slide.
+
+        Returns the joined text passed through self._clean_content. Raises
+        ValueError for legacy .ppt input, missing python-pptx, or read errors.
+        """
+        if file_path.suffix.lower() == '.ppt':
+            # python-pptx only parses Open XML (.pptx), never binary .ppt.
+            raise ValueError("不支持旧版 .ppt 格式，请另存为 .pptx 后重新上传")
+        try:
+            from pptx import Presentation
+            prs = Presentation(str(file_path))
+            text_parts = []
+            for slide_num, slide in enumerate(prs.slides, 1):
+                slide_texts = []
+                text_parts.append(f"【幻灯片 {slide_num}】")
+                pending = list(slide.shapes)
+                while pending:
+                    shape = pending.pop(0)
+                    if hasattr(shape, "shapes"):
+                        # Group shape: expand its members in place, in order.
+                        pending[:0] = list(shape.shapes)
+                        continue
+                    # Shapes without a text frame have no .text attribute.
+                    text = (getattr(shape, "text", "") or "").strip()
+                    if text:
+                        slide_texts.append(text)
+                    # Tables: one ' | '-joined line per row, empty cells dropped.
+                    if shape.has_table:
+                        for row in shape.table.rows:
+                            cells = (cell.text.strip() for cell in row.cells)
+                            row_text = ' | '.join(c for c in cells if c)
+                            if row_text:
+                                slide_texts.append(row_text)
+                text_parts.append('\n'.join(slide_texts) or "(无文本内容)")
+            return self._clean_content('\n\n'.join(text_parts))
+        except ImportError as e:
+            logger.error("python-pptx 未安装，无法读取 PPT 文件")
+            raise ValueError("服务器未安装 PPT 读取组件(python-pptx)") from e
+        except Exception as e:
+            logger.error(f"PPT 文件读取失败: {e}")
+            raise ValueError(f"PPT 文件读取失败: {e}") from e
+    
    def _clean_content(self, content: str) -> str:
        """清理和截断内容"""
        # 移除多余空白