# 012-kaopeilian/backend/app/services/document_converter.py

"""
文档转换服务
使用 LibreOffice 将 Office 文档转换为 PDF
"""
import html
import logging
import os
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional

from app.core.config import settings

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


class DocumentConverterService:
    """Convert uploaded Office documents into files a browser can preview.

    Word / PowerPoint documents (and legacy ``.xls`` workbooks) are converted
    to PDF by running LibreOffice in headless mode. ``.xlsx`` workbooks are
    rendered with openpyxl into a single self-contained HTML page, which
    avoids the pagination problems of spreadsheet-to-PDF output.

    Outputs are cached as ``<converted_path>/<course_id>/<material_id>.<ext>``
    and re-used until the source file's modification time becomes newer than
    the cached output.
    """

    # File extensions (lower-case, leading dot) accepted by ``convert_to_pdf``.
    SUPPORTED_FORMATS = {'.docx', '.doc', '.pptx', '.ppt', '.xlsx', '.xls'}

    # Spreadsheet extensions (these need special page-layout handling).
    EXCEL_FORMATS = {'.xlsx', '.xls'}

    # Spreadsheet extensions that are rendered to HTML. openpyxl cannot read
    # the legacy binary ``.xls`` format, so ``.xls`` is converted to PDF by
    # LibreOffice instead of always failing.
    HTML_PREVIEW_FORMATS = {'.xlsx'}

    # Upper bounds on how much of each worksheet is rendered to HTML.
    MAX_HTML_ROWS = 1000
    MAX_HTML_COLS = 50

    # Maximum time, in seconds, a LibreOffice conversion may run.
    CONVERT_TIMEOUT_SECONDS = 60

    # URL prefix under which the converted directory is served.
    URL_PREFIX = "/static/uploads/converted"

    # Static page header: document head, styles and the opening <body> tag.
    _HTML_HEAD = '''<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <style>
        body { font-family: Arial, sans-serif; padding: 20px; background: #f5f5f5; }
        .sheet-tabs { display: flex; gap: 10px; margin-bottom: 20px; flex-wrap: wrap; }
        .sheet-tab { padding: 8px 16px; background: #fff; border: 1px solid #ddd; border-radius: 4px; cursor: pointer; }
        .sheet-tab.active { background: #409eff; color: white; border-color: #409eff; }
        .sheet-content { display: none; }
        .sheet-content.active { display: block; }
        table { border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.1); }
        th, td { border: 1px solid #e4e7ed; padding: 8px 12px; text-align: left; white-space: nowrap; }
        th { background: #f5f7fa; font-weight: 600; position: sticky; top: 0; }
        tr:nth-child(even) { background: #fafafa; }
        tr:hover { background: #ecf5ff; }
        .table-wrapper { overflow-x: auto; max-height: 80vh; }
    </style>
</head>
<body>
'''

    # Static page footer: the tab-switching script and closing tags.
    _HTML_TAIL = '''
<script>
function showSheet(index) {
    document.querySelectorAll('.sheet-tab').forEach((tab, i) => {
        tab.classList.toggle('active', i === index);
    });
    document.querySelectorAll('.sheet-content').forEach((content, i) => {
        content.classList.toggle('active', i === index);
    });
}
</script>
</body>
</html>'''

    def __init__(self):
        """Resolve the output root under ``settings.UPLOAD_PATH`` and create it."""
        self.converted_path = Path(settings.UPLOAD_PATH) / "converted"
        self.converted_path.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Path / URL helpers
    # ------------------------------------------------------------------

    def _output_path(self, course_id: int, material_id: int, suffix: str) -> Path:
        """Return ``<converted_path>/<course_id>/<material_id><suffix>``, creating the course directory."""
        course_dir = self.converted_path / str(course_id)
        course_dir.mkdir(parents=True, exist_ok=True)
        return course_dir / f"{material_id}{suffix}"

    def _public_url(self, course_id: int, material_id: int, suffix: str) -> str:
        """Return the URL under ``URL_PREFIX`` for a converted file."""
        return f"{self.URL_PREFIX}/{course_id}/{material_id}{suffix}"

    def get_converted_file_path(self, course_id: int, material_id: int) -> Path:
        """
        Return the path of the converted PDF for a material.

        The per-course output directory is created if it does not exist yet.

        Args:
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            Path of the converted PDF file (the file itself may not exist).
        """
        return self._output_path(course_id, material_id, ".pdf")

    def need_convert(self, source_file: Path, converted_file: Path) -> bool:
        """
        Decide whether the source must be (re)converted.

        Args:
            source_file: Path of the source document.
            converted_file: Path of the cached converted output.

        Returns:
            True when no converted file exists, or when the source was
            modified after the converted file was written. False when the
            converted file exists and the source is missing or not newer.
        """
        # Nothing cached yet: a conversion is required.
        if not converted_file.exists():
            return True

        # Cached output exists but the source is gone: keep using the cache.
        if not source_file.exists():
            return False

        # Reconvert only when the source is strictly newer than the output.
        return source_file.stat().st_mtime > converted_file.stat().st_mtime

    # ------------------------------------------------------------------
    # Excel -> HTML
    # ------------------------------------------------------------------

    @staticmethod
    def _html_cell(value, tag: str) -> str:
        """Render one table cell; ``None`` becomes empty and text is HTML-escaped."""
        text = '' if value is None else html.escape(str(value))
        return f'<{tag}>{text}</{tag}>'

    @staticmethod
    def _sheet_tab_html(index: int, title) -> str:
        """Render one sheet tab; the title comes from the workbook, so it is HTML-escaped."""
        active = 'active' if index == 0 else ''
        return (
            f'<div class="sheet-tab {active}" onclick="showSheet({index})">'
            f'{html.escape(str(title))}</div>\n'
        )

    def _render_workbook_html(self, workbook) -> str:
        """Build the complete HTML page (tabs plus one table per worksheet) for a workbook."""
        # ``worksheets`` skips chartsheets, which have no cell grid to render.
        worksheets = list(workbook.worksheets)

        # Collect fragments and join once instead of repeated string +=.
        parts = [self._HTML_HEAD, '<div class="sheet-tabs">\n']
        parts.extend(
            self._sheet_tab_html(index, ws.title)
            for index, ws in enumerate(worksheets)
        )
        parts.append('</div>\n')

        for index, ws in enumerate(worksheets):
            active = 'active' if index == 0 else ''
            parts.append(f'<div class="sheet-content {active}" id="sheet-{index}">\n')
            parts.append('<div class="table-wrapper"><table>\n')

            # Clamp the rendered range so huge sheets stay viewable.
            row_limit = min(ws.max_row or 1, self.MAX_HTML_ROWS)
            col_limit = min(ws.max_column or 1, self.MAX_HTML_COLS)

            rows = ws.iter_rows(
                min_row=1,
                max_row=row_limit,
                max_col=col_limit,
                values_only=True,
            )
            for row_idx, row in enumerate(rows, start=1):
                # The first row is rendered as the table header.
                tag = 'th' if row_idx == 1 else 'td'
                parts.append('<tr>')
                parts.extend(self._html_cell(value, tag) for value in row)
                parts.append('</tr>\n')

            parts.append('</table></div></div>\n')

        parts.append(self._HTML_TAIL)
        return ''.join(parts)

    def convert_excel_to_html(
        self,
        source_file: str,
        course_id: int,
        material_id: int
    ) -> Optional[str]:
        """
        Render an Excel workbook to a standalone HTML page (avoids PDF pagination issues).

        A cached page is re-used when it is at least as new as the source.
        Only formats readable by openpyxl (e.g. ``.xlsx``) can be rendered;
        at most ``MAX_HTML_ROWS`` rows and ``MAX_HTML_COLS`` columns of each
        worksheet are included.

        Args:
            source_file: Path of the source workbook.
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            URL of the generated HTML file, or None on any failure
            (failures are logged, never raised).
        """
        try:
            source_path = Path(source_file)
            logger.info(f"开始Excel转HTML: source={source_file}, course_id={course_id}, material_id={material_id}")

            html_file = self._output_path(course_id, material_id, ".html")
            html_url = self._public_url(course_id, material_id, ".html")

            # Serve from cache before touching openpyxl: a valid cached page
            # does not need the optional dependency at all.
            if not self.need_convert(source_path, html_file):
                logger.info(f"使用缓存的HTML文件: {html_file}")
                return html_url

            try:
                import openpyxl
            except ImportError as ie:
                logger.error(f"Excel转换依赖缺失: openpyxl 未安装。请运行 pip install openpyxl 或重建Docker镜像。错误: {str(ie)}")
                return None

            # data_only=True yields cached formula results instead of formulas.
            wb = openpyxl.load_workbook(source_file, data_only=True)
            try:
                html_content = self._render_workbook_html(wb)
            finally:
                wb.close()

            # Write to a sibling temp file and swap it in, so a failed write
            # never leaves a truncated page that would be served from cache.
            tmp_file = html_file.with_name(f"{html_file.name}.{os.getpid()}.tmp")
            try:
                tmp_file.write_text(html_content, encoding='utf-8')
                os.replace(tmp_file, html_file)
            finally:
                if tmp_file.exists():
                    tmp_file.unlink()

            logger.info(f"Excel转HTML成功: {html_file}")
            return html_url

        except Exception as e:
            logger.error(f"Excel转HTML失败: {source_file}, 错误: {str(e)}", exc_info=True)
            return None

    # ------------------------------------------------------------------
    # Office -> PDF
    # ------------------------------------------------------------------

    def convert_to_pdf(
        self,
        source_file: str,
        course_id: int,
        material_id: int
    ) -> Optional[str]:
        """
        Convert an Office document into a previewable file.

        Formats in ``HTML_PREVIEW_FORMATS`` are delegated to
        ``convert_excel_to_html`` and yield an ``.html`` URL; every other
        supported format is converted to PDF with LibreOffice. A cached
        output is re-used when it is at least as new as the source.

        Args:
            source_file: Path of the source file (absolute or relative).
            course_id: Course ID.
            material_id: Material ID.

        Returns:
            URL of the converted file, or None on any failure
            (failures are logged, never raised).
        """
        try:
            source_path = Path(source_file)

            if not source_path.exists():
                logger.error(f"源文件不存在: {source_file}")
                return None

            file_ext = source_path.suffix.lower()
            if file_ext not in self.SUPPORTED_FORMATS:
                logger.error(f"不支持的文件格式: {file_ext}")
                return None

            # Spreadsheets openpyxl can read get an HTML preview instead.
            if file_ext in self.HTML_PREVIEW_FORMATS:
                return self.convert_excel_to_html(source_file, course_id, material_id)

            converted_file = self.get_converted_file_path(course_id, material_id)
            pdf_url = self._public_url(course_id, material_id, ".pdf")

            if not self.need_convert(source_path, converted_file):
                logger.info(f"使用缓存的转换文件: {converted_file}")
                return pdf_url

            logger.info(f"开始转换文档: {source_file} -> {converted_file}")

            # LibreOffice names its output "<source stem>.pdf". Converting
            # into a private scratch directory (on the same filesystem as the
            # destination) keeps that file from overwriting another
            # material's PDF and guarantees that whatever we publish was
            # produced by this run, not left over from an earlier one.
            with tempfile.TemporaryDirectory(
                prefix=f".convert-{material_id}-",
                dir=str(converted_file.parent),
            ) as work_dir:
                cmd = [
                    'libreoffice',
                    '--headless',             # no GUI
                    '--convert-to', 'pdf',    # target format
                    '--outdir', work_dir,     # output directory
                    str(source_path)
                ]

                subprocess.run(
                    cmd,
                    capture_output=True,
                    text=True,
                    timeout=self.CONVERT_TIMEOUT_SECONDS,
                    check=True
                )

                # The scratch directory is ours alone, so any PDF in it is
                # the conversion result regardless of how it was named.
                produced = next(Path(work_dir).glob("*.pdf"), None)
                if produced is None:
                    logger.error(f"文档转换失败，输出文件不存在: {converted_file}")
                    return None

                # Atomic swap into the final <material_id>.pdf location.
                os.replace(produced, converted_file)

            logger.info(f"文档转换成功: {converted_file}")
            return pdf_url

        except subprocess.TimeoutExpired:
            logger.error(f"文档转换超时: {source_file}")
            return None
        except subprocess.CalledProcessError as e:
            logger.error(f"文档转换失败: {source_file}, 错误: {e.stderr}")
            return None
        except Exception as e:
            logger.error(f"文档转换异常: {source_file}, 错误: {str(e)}", exc_info=True)
            return None

    def is_convertible(self, file_ext: str) -> bool:
        """
        Tell whether a file extension can be converted.

        Args:
            file_ext: File extension including the dot (e.g. ``.docx``);
                matched case-insensitively.

        Returns:
            True if the extension is in ``SUPPORTED_FORMATS``.
        """
        return file_ext.lower() in self.SUPPORTED_FORMATS


# Shared module-level instance. Constructing it runs __init__, which creates
# the "converted" directory under settings.UPLOAD_PATH when this module is
# imported.
document_converter = DocumentConverterService()