feat: 添加PPT/PPTX文件类型支持
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
1. upload.py: 添加ppt/pptx到允许上传的文件类型 2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content 3. requirements.txt: 添加python-pptx依赖
This commit is contained in:
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
|
||||
router = APIRouter(prefix="/upload")
|
||||
|
||||
# 支持的文件类型和大小限制
|
||||
# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
|
||||
# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、PPT、CSV、VTT、Properties
|
||||
ALLOWED_EXTENSIONS = {
|
||||
'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
|
||||
'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
|
||||
'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
|
||||
}
|
||||
MAX_FILE_SIZE = 15 * 1024 * 1024 # 15MB
|
||||
|
||||
|
||||
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
|
||||
"""
|
||||
提取文档内容
|
||||
|
||||
支持:PDF、Word(docx)、Excel(xlsx/xls)、文本文件
|
||||
支持:PDF、Word(docx)、Excel(xlsx/xls)、PPT(pptx/ppt)、文本文件
|
||||
"""
|
||||
suffix = file_path.suffix.lower()
|
||||
|
||||
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
|
||||
return await self._extract_docx_content(file_path)
|
||||
elif suffix in ['.xlsx', '.xls']:
|
||||
return await self._extract_excel_content(file_path)
|
||||
elif suffix in ['.pptx', '.ppt']:
|
||||
return await self._extract_ppt_content(file_path)
|
||||
elif suffix in ['.txt', '.md', '.text']:
|
||||
return await self._extract_text_content(file_path)
|
||||
else:
|
||||
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
|
||||
logger.error(f"Excel 文件读取失败: {e}")
|
||||
raise ValueError(f"Excel 文件读取失败: {e}")
|
||||
|
||||
async def _extract_ppt_content(self, file_path: Path) -> str:
    """Extract the text of a PowerPoint file, slide by slide.

    Every slide contributes a ``【幻灯片 N】`` header followed by the text
    of its shapes and one ``' | '``-joined line per non-empty table row.
    A slide without any text gets a placeholder line instead. Shapes
    nested inside group shapes are visited too. The joined text is passed
    through ``self._clean_content`` before it is returned.

    Args:
        file_path: Location of the presentation on disk.

    Returns:
        The cleaned, concatenated slide text.

    Raises:
        ValueError: If python-pptx is not installed, or if the file cannot
            be read. The latter includes legacy binary ``.ppt`` files,
            which python-pptx cannot open; the message then carries a hint
            to convert the file to ``.pptx``.
    """

    def iter_shapes(shapes):
        # A group shape has no text of its own; descend into its children
        # so grouped text boxes and tables are not silently skipped.
        for shape in shapes:
            if hasattr(shape, "shapes"):
                yield from iter_shapes(shape.shapes)
            else:
                yield shape

    # Keep the optional-dependency check separate from the parsing logic so
    # only a genuinely missing package is reported as "not installed".
    try:
        from pptx import Presentation
    except ImportError as e:
        logger.error("python-pptx 未安装,无法读取 PPT 文件")
        raise ValueError("服务器未安装 PPT 读取组件(python-pptx)") from e

    try:
        prs = Presentation(str(file_path))
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            slide_texts = []
            text_parts.append(f"【幻灯片 {slide_num}】")

            for shape in iter_shapes(slide.shapes):
                # Text boxes, placeholders and other text-bearing shapes.
                if hasattr(shape, "text") and shape.text.strip():
                    slide_texts.append(shape.text.strip())

                # Tables: one output line per row, empty cells omitted.
                if getattr(shape, "has_table", False):
                    for row in shape.table.rows:
                        row_text = ' | '.join(
                            cell.text.strip()
                            for cell in row.cells
                            if cell.text.strip()
                        )
                        if row_text:
                            slide_texts.append(row_text)

            if slide_texts:
                text_parts.append('\n'.join(slide_texts))
            else:
                text_parts.append("(无文本内容)")

        content = '\n\n'.join(text_parts)
        return self._clean_content(content)

    except Exception as e:
        detail = str(e)
        # The file is still attempted first, so a .pptx that was merely
        # renamed to .ppt keeps working; only a failure gets the hint.
        if file_path.suffix.lower() == ".ppt":
            detail += "(python-pptx 仅支持 .pptx 格式,请将 .ppt 另存为 .pptx 后重试)"
        logger.error(f"PPT 文件读取失败: {detail}")
        raise ValueError(f"PPT 文件读取失败: {detail}") from e
|
||||
|
||||
def _clean_content(self, content: str) -> str:
|
||||
"""清理和截断内容"""
|
||||
# 移除多余空白
|
||||
|
||||
Reference in New Issue
Block a user