From c3aa4e85e776d484c67af6e1ea6add4ed5e2f2a0 Mon Sep 17 00:00:00 2001
From: yuliang_guo <anonymous@qq.com>
Date: Sat, 31 Jan 2026 11:49:10 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0PPT/PPTX=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E7=B1=BB=E5=9E=8B=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. upload.py: 添加ppt/pptx到允许上传的文件类型
2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content
3. requirements.txt: 添加python-pptx依赖
---
 backend/app/api/v1/upload.py                  |  4 +-
 .../app/services/ai/knowledge_analysis_v2.py  | 47 ++++++++++++++++++-
 backend/requirements.txt                      |  3 +-
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/backend/app/api/v1/upload.py b/backend/app/api/v1/upload.py
index 2255de2..3f91c81 100644
--- a/backend/app/api/v1/upload.py
+++ b/backend/app/api/v1/upload.py
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
 router = APIRouter(prefix="/upload")
 
 # 支持的文件类型和大小限制
-# 支持格式：TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
+# 支持格式：TXT、Markdown、MDX、PDF、HTML、Excel、Word、PPT、CSV、VTT、Properties
 ALLOWED_EXTENSIONS = {
     'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
-    'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
+    'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
 }
 MAX_FILE_SIZE = 15 * 1024 * 1024  # 15MB
 
diff --git a/backend/app/services/ai/knowledge_analysis_v2.py b/backend/app/services/ai/knowledge_analysis_v2.py
index ea0eeef..80bb6ea 100644
--- a/backend/app/services/ai/knowledge_analysis_v2.py
+++ b/backend/app/services/ai/knowledge_analysis_v2.py
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
         """
         提取文档内容
         
-        支持：PDF、Word（docx）、Excel（xlsx/xls）、文本文件
+        支持：PDF、Word（docx）、Excel（xlsx/xls）、PPT（pptx/ppt）、文本文件
         """
         suffix = file_path.suffix.lower()
         
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
                 return await self._extract_docx_content(file_path)
             elif suffix in ['.xlsx', '.xls']:
                 return await self._extract_excel_content(file_path)
+            elif suffix in ['.pptx', '.ppt']:
+                return await self._extract_ppt_content(file_path)
             elif suffix in ['.txt', '.md', '.text']:
                 return await self._extract_text_content(file_path)
             else:
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
             logger.error(f"Excel 文件读取失败: {e}")
             raise ValueError(f"Excel 文件读取失败: {e}")
     
+    async def _extract_ppt_content(self, file_path: Path) -> str:
+        """Extract slide text and table rows from a PowerPoint file.
+
+        Returns the text passed through ``self._clean_content``, with one
+        headed section per slide. Raises ValueError when python-pptx is
+        not installed or the file cannot be read.
+        """
+        try:
+            from pptx import Presentation
+
+            prs = Presentation(str(file_path))
+            text_parts = []
+            for slide_num, slide in enumerate(prs.slides, 1):
+                slide_texts = []
+                text_parts.append(f"【幻灯片 {slide_num}】")
+                for shape in slide.shapes:
+                    # Not every shape has a .text attribute, hence hasattr.
+                    if hasattr(shape, "text") and shape.text.strip():
+                        slide_texts.append(shape.text.strip())
+                    # Tables: one ' | '-joined line per row, blank cells skipped.
+                    if shape.has_table:
+                        for row in shape.table.rows:
+                            row_text = ' | '.join(
+                                cell.text.strip() for cell in row.cells if cell.text.strip()
+                            )
+                            if row_text:
+                                slide_texts.append(row_text)
+                # Emit a marker so a text-free slide still has a body.
+                text_parts.append('\n'.join(slide_texts) or "(无文本内容)")
+
+            content = '\n\n'.join(text_parts)
+            return self._clean_content(content)
+        except ImportError as e:
+            logger.error("python-pptx 未安装，无法读取 PPT 文件")
+            raise ValueError("服务器未安装 PPT 读取组件(python-pptx)") from e
+        except Exception as e:
+            # python-pptx parses only OOXML; a legacy binary .ppt ends up here.
+            hint = ""
+            if file_path.suffix.lower() == '.ppt':
+                hint = "（旧版 .ppt 需另存为 .pptx）"
+            logger.error(f"PPT 文件读取失败: {e}{hint}")
+            raise ValueError(f"PPT 文件读取失败: {e}{hint}") from e
+    
     def _clean_content(self, content: str) -> str:
         """清理和截断内容"""
         # 移除多余空白
diff --git a/backend/requirements.txt b/backend/requirements.txt
index d551780..ad4e27a 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -51,9 +51,10 @@ openpyxl==3.1.2
 json-repair>=0.25.0
 jsonschema>=4.0.0
 
-# PDF 文档提取
+# 文档提取
 PyPDF2>=3.0.0
 python-docx>=1.0.0
+python-pptx>=0.6.21
 
 # 证书生成
 Pillow>=10.0.0