feat: 添加PPT/PPTX文件类型支持
All checks were successful
continuous-integration/drone/push Build is passing

1. upload.py: 添加ppt/pptx到允许上传的文件类型
2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content
3. requirements.txt: 添加python-pptx依赖
This commit is contained in:
yuliang_guo
2026-01-31 11:49:10 +08:00
parent 4e817f6eef
commit c3aa4e85e7
3 changed files with 50 additions and 4 deletions

View File

@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
"""
提取文档内容
支持PDF、Word(docx)、Excel(xlsx/xls)、文本文件
支持PDF、Word(docx)、Excel(xlsx/xls)、PPT(pptx/ppt)、文本文件
"""
suffix = file_path.suffix.lower()
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
return await self._extract_docx_content(file_path)
elif suffix in ['.xlsx', '.xls']:
return await self._extract_excel_content(file_path)
elif suffix in ['.pptx', '.ppt']:
return await self._extract_ppt_content(file_path)
elif suffix in ['.txt', '.md', '.text']:
return await self._extract_text_content(file_path)
else:
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
logger.error(f"Excel 文件读取失败: {e}")
raise ValueError(f"Excel 文件读取失败: {e}")
async def _extract_ppt_content(self, file_path: Path) -> str:
"""提取 PowerPoint 文件内容"""
try:
from pptx import Presentation
from pptx.util import Inches
prs = Presentation(str(file_path))
text_parts = []
for slide_num, slide in enumerate(prs.slides, 1):
slide_texts = []
text_parts.append(f"【幻灯片 {slide_num}")
for shape in slide.shapes:
# 提取文本框内容
if hasattr(shape, "text") and shape.text.strip():
slide_texts.append(shape.text.strip())
# 提取表格内容
if shape.has_table:
table = shape.table
for row in table.rows:
row_text = ' | '.join(
cell.text.strip() for cell in row.cells if cell.text.strip()
)
if row_text:
slide_texts.append(row_text)
if slide_texts:
text_parts.append('\n'.join(slide_texts))
else:
text_parts.append("(无文本内容)")
content = '\n\n'.join(text_parts)
return self._clean_content(content)
except ImportError:
logger.error("python-pptx 未安装,无法读取 PPT 文件")
raise ValueError("服务器未安装 PPT 读取组件(python-pptx)")
except Exception as e:
logger.error(f"PPT 文件读取失败: {e}")
raise ValueError(f"PPT 文件读取失败: {e}")
def _clean_content(self, content: str) -> str:
"""清理和截断内容"""
# 移除多余空白