Files
012-kaopeilian/backend/app/services/document_converter.py
111 998211c483 feat: 初始化考培练系统项目
- 从服务器拉取完整代码
- 按框架规范整理项目结构
- 配置 Drone CI 测试环境部署
- 包含后端(FastAPI)、前端(Vue3)、管理端

技术栈: Vue3 + TypeScript + FastAPI + MySQL
2026-01-24 19:33:28 +08:00

306 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
文档转换服务
使用 LibreOffice 将 Office 文档转换为 PDF
"""
import html
import logging
import os
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional

from app.core.config import settings
logger = logging.getLogger(__name__)
class DocumentConverterService:
    """Convert Office documents into files a browser can preview.

    Word and PowerPoint files are converted to PDF by running LibreOffice in
    headless mode. Excel workbooks are rendered to a standalone HTML page
    with openpyxl (a PDF would split wide sheets across pages). Outputs are
    cached under ``<settings.UPLOAD_PATH>/converted/<course_id>/`` and are
    regenerated when the source file is newer than the cached output.
    """

    # File extensions (lower-case, with leading dot) that can be converted.
    SUPPORTED_FORMATS = {'.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls'}
    # Excel extensions: rendered to HTML first instead of PDF.
    EXCEL_FORMATS = {'.xlsx', '.xls'}
    # Upper bounds on how much of each worksheet is rendered into the preview.
    MAX_PREVIEW_ROWS = 1000
    MAX_PREVIEW_COLS = 50

    # Static page header (styles) emitted before the sheet tabs.
    _HTML_HEAD = '''<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
body { font-family: Arial, sans-serif; padding: 20px; background: #f5f5f5; }
.sheet-tabs { display: flex; gap: 10px; margin-bottom: 20px; flex-wrap: wrap; }
.sheet-tab { padding: 8px 16px; background: #fff; border: 1px solid #ddd; border-radius: 4px; cursor: pointer; }
.sheet-tab.active { background: #409eff; color: white; border-color: #409eff; }
.sheet-content { display: none; }
.sheet-content.active { display: block; }
table { border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
th, td { border: 1px solid #e4e7ed; padding: 8px 12px; text-align: left; white-space: nowrap; }
th { background: #f5f7fa; font-weight: 600; position: sticky; top: 0; }
tr:nth-child(even) { background: #fafafa; }
tr:hover { background: #ecf5ff; }
.table-wrapper { overflow-x: auto; max-height: 80vh; }
</style>
</head>
<body>
'''

    # Static page footer: the tab-switching script and closing tags.
    _HTML_TAIL = '''
<script>
function showSheet(index) {
document.querySelectorAll('.sheet-tab').forEach((tab, i) => {
tab.classList.toggle('active', i === index);
});
document.querySelectorAll('.sheet-content').forEach((content, i) => {
content.classList.toggle('active', i === index);
});
}
</script>
</body>
</html>'''

    def __init__(self):
        """Set up the output root directory, creating it if necessary."""
        self.converted_path = Path(settings.UPLOAD_PATH) / "converted"
        self.converted_path.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _converted_url(course_id: int, file_name: str) -> str:
        """Return the static URL under which a converted file is served."""
        return f"/static/uploads/converted/{course_id}/{file_name}"

    @classmethod
    def _build_excel_html(cls, sheets) -> str:
        """Render worksheets into one self-contained HTML page.

        Args:
            sheets: Iterable of ``(sheet_name, rows)`` pairs, where ``rows``
                is an iterable of rows and each row is an iterable of cell
                values (``None`` marks an empty cell).

        Returns:
            The full HTML document. The first sheet is the active tab and
            the first row of every sheet is rendered as a header row.
        """
        sheets = list(sheets)
        parts = [cls._HTML_HEAD, '<div class="sheet-tabs">\n']

        # One clickable tab per sheet. Sheet names come from the uploaded
        # file, so they must be escaped before being embedded in markup.
        for index, (name, _rows) in enumerate(sheets):
            active = 'active' if index == 0 else ''
            label = html.escape(str(name), quote=False)
            parts.append(
                f'<div class="sheet-tab {active}" onclick="showSheet({index})">{label}</div>\n'
            )
        parts.append('</div>\n')

        # One table per sheet.
        for index, (_name, rows) in enumerate(sheets):
            active = 'active' if index == 0 else ''
            parts.append(f'<div class="sheet-content {active}" id="sheet-{index}">\n')
            parts.append('<div class="table-wrapper"><table>\n')
            for row_number, row in enumerate(rows, start=1):
                tag = 'th' if row_number == 1 else 'td'
                parts.append('<tr>')
                for value in row:
                    # Escape every value, not only str: the text form of
                    # other objects may itself contain '<' or '&'.
                    text = '' if value is None else html.escape(str(value), quote=False)
                    parts.append(f'<{tag}>{text}</{tag}>')
                parts.append('</tr>\n')
            parts.append('</table></div></div>\n')

        parts.append(cls._HTML_TAIL)
        return ''.join(parts)

    def get_converted_file_path(self, course_id: int, material_id: int) -> Path:
        """Return the path of the cached PDF for a material.

        The per-course directory is created as a side effect.

        Args:
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            ``<converted root>/<course_id>/<material_id>.pdf``.
        """
        course_dir = self.converted_path / str(course_id)
        course_dir.mkdir(parents=True, exist_ok=True)
        return course_dir / f"{material_id}.pdf"

    def need_convert(self, source_file: Path, converted_file: Path) -> bool:
        """Decide whether the converted file must be (re)generated.

        Args:
            source_file: Path of the source document.
            converted_file: Path of the cached conversion output.

        Returns:
            True when there is no cached output, or when the source was
            modified after the cached output was written. False when the
            cached output exists and the source is missing or not newer.
        """
        # No cached output yet: always convert.
        if not converted_file.exists():
            return True
        # Cached output exists but the source is gone: keep the cache.
        if not source_file.exists():
            return False
        # Reconvert only if the source changed after the last conversion.
        return source_file.stat().st_mtime > converted_file.stat().st_mtime

    def convert_excel_to_html(
        self,
        source_file: str,
        course_id: int,
        material_id: int
    ) -> Optional[str]:
        """Render an Excel workbook to an HTML preview page.

        HTML is used instead of PDF to avoid page breaks inside sheets. At
        most ``MAX_PREVIEW_ROWS`` rows and ``MAX_PREVIEW_COLS`` columns of
        each worksheet are rendered. openpyxl is imported lazily so that
        the rest of the service works without it.

        Args:
            source_file: Path of the source workbook.
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            URL of the generated (or cached) HTML file, or None on failure.
        """
        try:
            try:
                import openpyxl
            except ImportError as ie:
                logger.error(f"Excel转换依赖缺失: openpyxl 未安装。请运行 pip install openpyxl 或重建Docker镜像。错误: {str(ie)}")
                return None

            source_path = Path(source_file)
            logger.info(f"开始Excel转HTML: source={source_file}, course_id={course_id}, material_id={material_id}")

            course_dir = self.converted_path / str(course_id)
            course_dir.mkdir(parents=True, exist_ok=True)
            html_file = course_dir / f"{material_id}.html"
            html_url = self._converted_url(course_id, f"{material_id}.html")

            # Reuse the cached page unless the workbook changed since.
            if html_file.exists():
                if source_path.stat().st_mtime <= html_file.stat().st_mtime:
                    logger.info(f"使用缓存的HTML文件: {html_file}")
                    return html_url

            # data_only=True yields cached formula results, not formulas.
            wb = openpyxl.load_workbook(source_file, data_only=True)

            sheets = []
            # wb.worksheets skips chartsheets, which have no cell grid.
            for ws in wb.worksheets:
                row_limit = min(ws.max_row or 1, self.MAX_PREVIEW_ROWS)
                col_limit = min(ws.max_column or 1, self.MAX_PREVIEW_COLS)
                rows = ws.iter_rows(
                    min_row=1,
                    max_row=row_limit,
                    max_col=col_limit,
                    values_only=True,
                )
                sheets.append((ws.title, rows))

            html_content = self._build_excel_html(sheets)

            with open(html_file, 'w', encoding='utf-8') as f:
                f.write(html_content)

            logger.info(f"Excel转HTML成功: {html_file}")
            return html_url
        except Exception as e:
            logger.error(f"Excel转HTML失败: {source_file}, 错误: {str(e)}", exc_info=True)
            return None

    def convert_to_pdf(
        self,
        source_file: str,
        course_id: int,
        material_id: int
    ) -> Optional[str]:
        """Convert an Office document to a previewable file.

        Excel files are rendered to HTML first; if that fails (for example
        a legacy ``.xls`` file, which openpyxl cannot read) the method falls
        back to the LibreOffice PDF conversion used for all other formats.

        Args:
            source_file: Path of the source document (absolute or relative).
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            URL of the converted file (PDF, or HTML for Excel), or None when
            the source is missing, the format is unsupported, or the
            conversion fails or times out.
        """
        try:
            source_path = Path(source_file)

            if not source_path.exists():
                logger.error(f"源文件不存在: {source_file}")
                return None

            file_ext = source_path.suffix.lower()
            if file_ext not in self.SUPPORTED_FORMATS:
                logger.error(f"不支持的文件格式: {file_ext}")
                return None

            # Excel: prefer the HTML preview, fall back to PDF on failure.
            if file_ext in self.EXCEL_FORMATS:
                html_url = self.convert_excel_to_html(source_file, course_id, material_id)
                if html_url is not None:
                    return html_url
                logger.warning(f"Excel转HTML失败, 回退到LibreOffice转PDF: {source_file}")

            converted_file = self.get_converted_file_path(course_id, material_id)
            pdf_url = self._converted_url(course_id, f"{material_id}.pdf")

            if not self.need_convert(source_path, converted_file):
                logger.info(f"使用缓存的转换文件: {converted_file}")
                return pdf_url

            logger.info(f"开始转换文档: {source_file} -> {converted_file}")

            output_dir = converted_file.parent
            # LibreOffice names its output "<source stem>.pdf". Writing it
            # into a private scratch directory keeps it from clobbering
            # another material's cached PDF that happens to share that
            # name. The scratch directory lives inside output_dir so the
            # final move stays on the same filesystem.
            with tempfile.TemporaryDirectory(
                dir=str(output_dir), prefix=f".{material_id}-"
            ) as work_dir:
                cmd = [
                    'libreoffice',
                    '--headless',            # no GUI
                    '--convert-to', 'pdf',   # target format
                    '--outdir', work_dir,    # output directory
                    str(source_path)
                ]
                # Give up after 60 seconds; a non-zero exit status raises.
                subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=60,
                    check=True
                )

                produced = Path(work_dir) / f"{source_path.stem}.pdf"
                if not produced.exists():
                    # LibreOffice can exit 0 without writing anything.
                    # Report failure rather than serving a stale cache.
                    logger.error(f"文档转换失败,输出文件不存在: {converted_file}")
                    return None
                # replace() overwrites an existing stale PDF on every OS.
                produced.replace(converted_file)

            logger.info(f"文档转换成功: {converted_file}")
            return pdf_url
        except subprocess.TimeoutExpired:
            logger.error(f"文档转换超时: {source_file}")
            return None
        except subprocess.CalledProcessError as e:
            logger.error(f"文档转换失败: {source_file}, 错误: {e.stderr}")
            return None
        except Exception as e:
            logger.error(f"文档转换异常: {source_file}, 错误: {str(e)}", exc_info=True)
            return None

    def is_convertible(self, file_ext: str) -> bool:
        """Tell whether a file extension can be converted.

        Args:
            file_ext: File extension including the dot, e.g. ``.docx``
                (case-insensitive).

        Returns:
            True if the extension is in ``SUPPORTED_FORMATS``.
        """
        return file_ext.lower() in self.SUPPORTED_FORMATS
# Module-level shared instance. Instantiating it creates the "converted"
# directory under settings.UPLOAD_PATH if it does not exist yet.
document_converter = DocumentConverterService()