""" 文档转换服务使用 LibreOffice 将 Office 文档转换为 PDF """ import os import logging import subprocess from pathlib import Path from typing import Optional from datetime import datetime from app.core.config import settings logger = logging.getLogger(__name__) class DocumentConverterService: """文档转换服务类""" # 支持转换的文件格式 SUPPORTED_FORMATS = {'.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls'} # Excel文件格式（需要特殊处理页面布局） EXCEL_FORMATS = {'.xlsx', '.xls'} def __init__(self): """初始化转换服务""" self.converted_path = Path(settings.UPLOAD_PATH) / "converted" self.converted_path.mkdir(parents=True, exist_ok=True) def get_converted_file_path(self, course_id: int, material_id: int) -> Path: """ 获取转换后的文件路径 Args: course_id: 课程ID material_id: 资料ID Returns: 转换后的PDF文件路径 """ course_dir = self.converted_path / str(course_id) course_dir.mkdir(parents=True, exist_ok=True) return course_dir / f"{material_id}.pdf" def need_convert(self, source_file: Path, converted_file: Path) -> bool: """ 判断是否需要重新转换 Args: source_file: 源文件路径 converted_file: 转换后的文件路径 Returns: 是否需要转换 """ # 如果转换文件不存在，需要转换 if not converted_file.exists(): return True # 如果源文件不存在，不需要转换 if not source_file.exists(): return False # 如果源文件修改时间晚于转换文件，需要重新转换 source_mtime = source_file.stat().st_mtime converted_mtime = converted_file.stat().st_mtime return source_mtime > converted_mtime def convert_excel_to_html( self, source_file: str, course_id: int, material_id: int ) -> Optional[str]: """ 将Excel文件转换为HTML（避免PDF分页问题） Args: source_file: 源文件路径 course_id: 课程ID material_id: 资料ID Returns: 转换后的HTML文件URL，失败返回None """ try: try: import openpyxl from openpyxl.utils import get_column_letter except ImportError as ie: logger.error(f"Excel转换依赖缺失: openpyxl 未安装。请运行 pip install openpyxl 或重建Docker镜像。错误: {str(ie)}") return None source_path = Path(source_file) logger.info(f"开始Excel转HTML: source={source_file}, course_id={course_id}, material_id={material_id}") # 获取HTML输出路径 course_dir = self.converted_path / str(course_id) course_dir.mkdir(parents=True, exist_ok=True) html_file = course_dir / f"{material_id}.html" # 检查缓存 if html_file.exists(): source_mtime = source_path.stat().st_mtime html_mtime = html_file.stat().st_mtime if source_mtime <= html_mtime: logger.info(f"使用缓存的HTML文件: {html_file}") return f"/static/uploads/converted/{course_id}/{material_id}.html" # 读取Excel文件 wb = openpyxl.load_workbook(source_file, data_only=True) # 构建HTML html_content = ''' ''' # 生成sheet选项卡 sheet_names = wb.sheetnames html_content += '

\n' for i, name in enumerate(sheet_names): active = 'active' if i == 0 else '' html_content += f'

{name}

\n' html_content += '

\n' # 生成每个sheet的表格 for i, sheet_name in enumerate(sheet_names): ws = wb[sheet_name] active = 'active' if i == 0 else '' html_content += f'

\n' html_content += '

\n' # 获取有效数据范围 max_row = ws.max_row or 1 max_col = ws.max_column or 1 for row_idx in range(1, min(max_row + 1, 1001)): # 限制最多1000行 html_content += '' for col_idx in range(1, min(max_col + 1, 51)): # 限制最多50列 cell = ws.cell(row=row_idx, column=col_idx) value = cell.value if cell.value is not None else '' tag = 'th' if row_idx == 1 else 'td' # 转义HTML特殊字符 if isinstance(value, str): value = value.replace('&', '&').replace('<', '<').replace('>', '>') html_content += f'<{tag}>{value}' html_content += '\n' html_content += '

\n' # 添加JavaScript html_content += ''' ''' # 写入HTML文件 with open(html_file, 'w', encoding='utf-8') as f: f.write(html_content) logger.info(f"Excel转HTML成功: {html_file}") return f"/static/uploads/converted/{course_id}/{material_id}.html" except Exception as e: logger.error(f"Excel转HTML失败: {source_file}, 错误: {str(e)}", exc_info=True) return None def convert_to_pdf( self, source_file: str, course_id: int, material_id: int ) -> Optional[str]: """ 将Office文档转换为PDF Args: source_file: 源文件路径（绝对路径或相对路径） course_id: 课程ID material_id: 资料ID Returns: 转换后的PDF文件URL，失败返回None """ try: source_path = Path(source_file) # 检查源文件是否存在 if not source_path.exists(): logger.error(f"源文件不存在: {source_file}") return None # 检查文件格式是否支持 file_ext = source_path.suffix.lower() if file_ext not in self.SUPPORTED_FORMATS: logger.error(f"不支持的文件格式: {file_ext}") return None # Excel文件使用HTML预览（避免分页问题） if file_ext in self.EXCEL_FORMATS: return self.convert_excel_to_html(source_file, course_id, material_id) # 获取转换后的文件路径 converted_file = self.get_converted_file_path(course_id, material_id) # 检查是否需要转换 if not self.need_convert(source_path, converted_file): logger.info(f"使用缓存的转换文件: {converted_file}") return f"/static/uploads/converted/{course_id}/{material_id}.pdf" # 执行转换 logger.info(f"开始转换文档: {source_file} -> {converted_file}") # 使用 LibreOffice 转换 # --headless: 无界面模式 # --convert-to pdf: 转换为PDF # --outdir: 输出目录 output_dir = converted_file.parent cmd = [ 'libreoffice', '--headless', '--convert-to', 'pdf', '--outdir', str(output_dir), str(source_path) ] # 执行转换命令（设置超时时间为60秒） result = subprocess.run( cmd, capture_output=True, text=True, timeout=60, check=True ) # LibreOffice 转换后的文件名是源文件名.pdf # 需要重命名为 material_id.pdf temp_converted = output_dir / f"{source_path.stem}.pdf" if temp_converted.exists() and temp_converted != converted_file: temp_converted.rename(converted_file) # 检查转换结果 if converted_file.exists(): logger.info(f"文档转换成功: {converted_file}") return f"/static/uploads/converted/{course_id}/{material_id}.pdf" else: logger.error(f"文档转换失败，输出文件不存在: {converted_file}") return None except subprocess.TimeoutExpired: logger.error(f"文档转换超时: {source_file}") return None except subprocess.CalledProcessError as e: logger.error(f"文档转换失败: {source_file}, 错误: {e.stderr}") return None except Exception as e: logger.error(f"文档转换异常: {source_file}, 错误: {str(e)}", exc_info=True) return None def is_convertible(self, file_ext: str) -> bool: """ 判断文件格式是否可转换 Args: file_ext: 文件扩展名（带点，如 .docx） Returns: 是否可转换 """ return file_ext.lower() in self.SUPPORTED_FORMATS # 创建全局实例 document_converter = DocumentConverterService()