feat: 添加PPT/PPTX文件类型支持
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
1. upload.py: 添加ppt/pptx到允许上传的文件类型；2. knowledge_analysis_v2.py: 添加PPT内容提取方法_extract_ppt_content；3. requirements.txt: 添加python-pptx依赖
This commit is contained in:
@@ -23,10 +23,10 @@ logger = get_logger(__name__)
|
|||||||
router = APIRouter(prefix="/upload")
|
router = APIRouter(prefix="/upload")
|
||||||
|
|
||||||
# 支持的文件类型和大小限制
|
# 支持的文件类型和大小限制
|
||||||
# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、CSV、VTT、Properties
|
# 支持格式:TXT、Markdown、MDX、PDF、HTML、Excel、Word、PPT、CSV、VTT、Properties
|
||||||
ALLOWED_EXTENSIONS = {
|
ALLOWED_EXTENSIONS = {
|
||||||
'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
|
'txt', 'md', 'mdx', 'pdf', 'html', 'htm',
|
||||||
'xlsx', 'xls', 'docx', 'doc', 'csv', 'vtt', 'properties'
|
'xlsx', 'xls', 'docx', 'doc', 'pptx', 'ppt', 'csv', 'vtt', 'properties'
|
||||||
}
|
}
|
||||||
MAX_FILE_SIZE = 15 * 1024 * 1024 # 15MB
|
MAX_FILE_SIZE = 15 * 1024 * 1024 # 15MB
|
||||||
|
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ class KnowledgeAnalysisServiceV2:
|
|||||||
"""
|
"""
|
||||||
提取文档内容
|
提取文档内容
|
||||||
|
|
||||||
支持:PDF、Word(docx)、Excel(xlsx/xls)、文本文件
|
支持:PDF、Word(docx)、Excel(xlsx/xls)、PPT(pptx/ppt)、文本文件
|
||||||
"""
|
"""
|
||||||
suffix = file_path.suffix.lower()
|
suffix = file_path.suffix.lower()
|
||||||
|
|
||||||
@@ -187,6 +187,8 @@ class KnowledgeAnalysisServiceV2:
|
|||||||
return await self._extract_docx_content(file_path)
|
return await self._extract_docx_content(file_path)
|
||||||
elif suffix in ['.xlsx', '.xls']:
|
elif suffix in ['.xlsx', '.xls']:
|
||||||
return await self._extract_excel_content(file_path)
|
return await self._extract_excel_content(file_path)
|
||||||
|
elif suffix in ['.pptx', '.ppt']:
|
||||||
|
return await self._extract_ppt_content(file_path)
|
||||||
elif suffix in ['.txt', '.md', '.text']:
|
elif suffix in ['.txt', '.md', '.text']:
|
||||||
return await self._extract_text_content(file_path)
|
return await self._extract_text_content(file_path)
|
||||||
else:
|
else:
|
||||||
@@ -303,6 +305,49 @@ class KnowledgeAnalysisServiceV2:
|
|||||||
logger.error(f"Excel 文件读取失败: {e}")
|
logger.error(f"Excel 文件读取失败: {e}")
|
||||||
raise ValueError(f"Excel 文件读取失败: {e}")
|
raise ValueError(f"Excel 文件读取失败: {e}")
|
||||||
|
|
||||||
|
async def _extract_ppt_content(self, file_path: Path) -> str:
    """Extract the text of a PowerPoint presentation.

    Slides are visited in order. Each slide contributes a ``【幻灯片 N】``
    header followed by the text of its shapes and table rows (one line per
    row, non-empty cells joined with `` | ``). Shapes nested inside group
    shapes are visited as well. A slide with no text gets a placeholder
    line instead.

    Args:
        file_path: Path of the presentation file to read.

    Returns:
        The collected text after it has been passed through
        ``self._clean_content``.

    Raises:
        ValueError: If python-pptx is not installed, or if the file cannot
            be opened or parsed. For a ``.ppt`` suffix the message also
            tells the user to re-save the file as ``.pptx``.
    """

    def _walk(shapes):
        # Group shapes carry their children in ``.shapes``; flatten them so
        # text boxes and tables inside a group are not silently skipped.
        for shape in shapes:
            if hasattr(shape, "shapes"):
                yield from _walk(shape.shapes)
            else:
                yield shape

    try:
        # Imported lazily so a missing optional dependency is reported as a
        # ValueError at call time rather than breaking module import.
        from pptx import Presentation

        prs = Presentation(str(file_path))
        text_parts = []

        for slide_num, slide in enumerate(prs.slides, 1):
            slide_texts = []
            text_parts.append(f"【幻灯片 {slide_num}】")

            for shape in _walk(slide.shapes):
                # Text boxes, placeholders and other text-bearing shapes.
                if hasattr(shape, "text") and shape.text.strip():
                    slide_texts.append(shape.text.strip())

                # Tables: one output line per row, skipping empty cells.
                if shape.has_table:
                    for row in shape.table.rows:
                        row_text = ' | '.join(
                            cell.text.strip()
                            for cell in row.cells
                            if cell.text.strip()
                        )
                        if row_text:
                            slide_texts.append(row_text)

            if slide_texts:
                text_parts.append('\n'.join(slide_texts))
            else:
                text_parts.append("(无文本内容)")

        content = '\n\n'.join(text_parts)
        return self._clean_content(content)

    except ImportError as e:
        logger.error("python-pptx 未安装，无法读取 PPT 文件")
        raise ValueError("服务器未安装 PPT 读取组件（python-pptx）") from e
    except Exception as e:
        logger.error(f"PPT 文件读取失败: {e}")
        if file_path.suffix.lower() == '.ppt':
            # python-pptx only parses Open XML (.pptx); a legacy binary .ppt
            # fails inside the parser, so add an actionable hint for the user.
            raise ValueError(
                f"PPT 文件读取失败: {e}"
                "（python-pptx 仅支持 .pptx，旧版 .ppt 请另存为 .pptx 后重试）"
            ) from e
        raise ValueError(f"PPT 文件读取失败: {e}") from e
|
||||||
|
|
||||||
def _clean_content(self, content: str) -> str:
|
def _clean_content(self, content: str) -> str:
|
||||||
"""清理和截断内容"""
|
"""清理和截断内容"""
|
||||||
# 移除多余空白
|
# 移除多余空白
|
||||||
|
|||||||
@@ -51,9 +51,10 @@ openpyxl==3.1.2
|
|||||||
json-repair>=0.25.0
|
json-repair>=0.25.0
|
||||||
jsonschema>=4.0.0
|
jsonschema>=4.0.0
|
||||||
|
|
||||||
# PDF 文档提取
|
# 文档提取
|
||||||
PyPDF2>=3.0.0
|
PyPDF2>=3.0.0
|
||||||
python-docx>=1.0.0
|
python-docx>=1.0.0
|
||||||
|
python-pptx>=0.6.21
|
||||||
|
|
||||||
# 证书生成
|
# 证书生成
|
||||||
Pillow>=10.0.0
|
Pillow>=10.0.0
|
||||||
|
|||||||
Reference in New Issue
Block a user