feat: Initialize the exam-training-practice system project
- Pull the complete codebase from the server
- Organize the project structure per framework conventions
- Configure Drone CI deployment for the test environment
- Includes backend (FastAPI), frontend (Vue3), and admin console

Tech stack: Vue3 + TypeScript + FastAPI + MySQL
548
backend/app/services/ai/knowledge_analysis_v2.py
Normal file
@@ -0,0 +1,548 @@
"""
Knowledge point analysis service V2 - native Python implementation

Features:
- Read document content (PDF/Word/TXT)
- Call the AI to analyze and extract knowledge points
- Parse the JSON result
- Write to the database

Provides stable, reliable knowledge point analysis.
"""

import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

from sqlalchemy.ext.asyncio import AsyncSession

from app.core.config import settings
from app.core.exceptions import ExternalServiceError
from app.schemas.course import KnowledgePointCreate

from .ai_service import AIService, AIResponse
from .llm_json_parser import parse_with_fallback, clean_llm_output
from .prompts.knowledge_analysis_prompts import (
    SYSTEM_PROMPT,
    USER_PROMPT,
    KNOWLEDGE_POINT_SCHEMA,
    DEFAULT_KNOWLEDGE_TYPE,
)

logger = logging.getLogger(__name__)

# Configuration constants
STATIC_UPLOADS_PREFIX = '/static/uploads/'
MAX_CONTENT_LENGTH = 100000  # Maximum document content length (characters)
MAX_KNOWLEDGE_POINTS = 20  # Maximum number of knowledge points


class KnowledgeAnalysisServiceV2:
    """
    Knowledge point analysis service V2

    Implemented natively in Python.

    Usage example:
    ```python
    service = KnowledgeAnalysisServiceV2()
    result = await service.analyze_course_material(
        db=db_session,
        course_id=1,
        material_id=10,
        file_url="/static/uploads/courses/1/doc.pdf",
        course_title="Medical Aesthetics Product Knowledge",
        user_id=1
    )
    ```
    """

    def __init__(self):
        """Initialize the service"""
        self.ai_service = AIService(module_code="knowledge_analysis")
        self.upload_path = getattr(settings, 'UPLOAD_PATH', 'uploads')

    async def analyze_course_material(
        self,
        db: AsyncSession,
        course_id: int,
        material_id: int,
        file_url: str,
        course_title: str,
        user_id: int
    ) -> Dict[str, Any]:
        """
        Analyze course material and extract knowledge points

        Args:
            db: Database session
            course_id: Course ID
            material_id: Material ID
            file_url: File URL (relative path)
            course_title: Course title
            user_id: User ID

        Returns:
            The analysis result, including success, knowledge_points_count, and related fields
        """
        try:
            logger.info(
                f"Starting knowledge point analysis V2 - course_id: {course_id}, material_id: {material_id}, "
                f"file_url: {file_url}"
            )

            # 1. Resolve the file path
            file_path = self._resolve_file_path(file_url)
            if not file_path.exists():
                raise FileNotFoundError(f"File does not exist: {file_path}")

            logger.info(f"File path resolved successfully: {file_path}")

            # 2. Extract the document content
            content = await self._extract_document_content(file_path)
            if not content or not content.strip():
                raise ValueError("Document content is empty")

            logger.info(f"Document content extracted, length: {len(content)} characters")

            # 3. Call AI analysis
            ai_response = await self._call_ai_analysis(content, course_title)

            logger.info(
                f"AI analysis complete - provider: {ai_response.provider}, "
                f"tokens: {ai_response.total_tokens}, latency: {ai_response.latency_ms}ms"
            )

            # 4. Parse the JSON result
            knowledge_points = self._parse_knowledge_points(ai_response.content)

            logger.info(f"Knowledge points parsed, count: {len(knowledge_points)}")

            # 5. Delete the old knowledge points
            await self._delete_old_knowledge_points(db, material_id)

            # 6. Save to the database
            saved_count = await self._save_knowledge_points(
                db=db,
                course_id=course_id,
                material_id=material_id,
                knowledge_points=knowledge_points,
                user_id=user_id
            )

            logger.info(
                f"Knowledge point analysis complete - course_id: {course_id}, material_id: {material_id}, "
                f"saved_count: {saved_count}"
            )

            return {
                "success": True,
                "status": "completed",
                "knowledge_points_count": saved_count,
                "ai_provider": ai_response.provider,
                "ai_model": ai_response.model,
                "ai_tokens": ai_response.total_tokens,
                "ai_latency_ms": ai_response.latency_ms,
            }

        except FileNotFoundError as e:
            logger.error(f"File not found: {e}")
            raise ExternalServiceError(f"Analysis file not found: {e}")
        except ValueError as e:
            logger.error(f"Invalid parameter: {e}")
            raise ExternalServiceError(f"Invalid analysis parameter: {e}")
        except Exception as e:
            logger.error(
                f"Knowledge point analysis failed - course_id: {course_id}, material_id: {material_id}, "
                f"error: {e}",
                exc_info=True
            )
            raise ExternalServiceError(f"Knowledge point analysis failed: {e}")

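    # Illustrative success payload for the method above; the concrete values
    # (provider, model, counts) are hypothetical, not taken from the source:
    # {
    #     "success": True, "status": "completed", "knowledge_points_count": 8,
    #     "ai_provider": "openai", "ai_model": "gpt-4o-mini",
    #     "ai_tokens": 5321, "ai_latency_ms": 2340,
    # }
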
    def _resolve_file_path(self, file_url: str) -> Path:
        """Resolve a file URL to a local path"""
        if file_url.startswith(STATIC_UPLOADS_PREFIX):
            relative_path = file_url.replace(STATIC_UPLOADS_PREFIX, '')
            return Path(self.upload_path) / relative_path
        elif file_url.startswith('/'):
            # Absolute path
            return Path(file_url)
        else:
            # Relative path
            return Path(self.upload_path) / file_url

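    # Example mappings, a sketch assuming settings.UPLOAD_PATH == "uploads"
    # (the getattr default in __init__); the paths shown are hypothetical:
    #   "/static/uploads/courses/1/doc.pdf"  ->  uploads/courses/1/doc.pdf
    #   "/srv/data/doc.pdf"                  ->  /srv/data/doc.pdf  (absolute, used as-is)
    #   "courses/1/doc.pdf"                  ->  uploads/courses/1/doc.pdf
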
    async def _extract_document_content(self, file_path: Path) -> str:
        """
        Extract document content

        Supports: PDF, Word (docx), and plain text files
        """
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return await self._extract_pdf_content(file_path)
            elif suffix in ['.docx', '.doc']:
                return await self._extract_docx_content(file_path)
            elif suffix in ['.txt', '.md', '.text']:
                return await self._extract_text_content(file_path)
            else:
                # Try reading as plain text
                return await self._extract_text_content(file_path)
        except Exception as e:
            logger.error(f"Document content extraction failed: {file_path}, error: {e}")
            raise ValueError(f"Unable to read document content: {e}")

    async def _extract_pdf_content(self, file_path: Path) -> str:
        """Extract PDF content"""
        try:
            from PyPDF2 import PdfReader

            reader = PdfReader(str(file_path))
            text_parts = []

            for page in reader.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)

            content = '\n'.join(text_parts)

            # Clean and truncate
            content = self._clean_content(content)

            return content

        except ImportError:
            logger.error("PyPDF2 is not installed; cannot read PDF")
            raise ValueError("The PDF reading component is not installed on the server")
        except Exception as e:
            logger.error(f"PDF read failed: {e}")
            raise ValueError(f"PDF read failed: {e}")

    async def _extract_docx_content(self, file_path: Path) -> str:
        """Extract Word document content"""
        try:
            from docx import Document

            doc = Document(str(file_path))
            text_parts = []

            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)

            # Also extract table content
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip():
                            text_parts.append(cell.text)

            content = '\n'.join(text_parts)
            content = self._clean_content(content)

            return content

        except ImportError:
            logger.error("python-docx is not installed; cannot read Word documents")
            raise ValueError("The Word reading component is not installed on the server")
        except Exception as e:
            logger.error(f"Word document read failed: {e}")
            raise ValueError(f"Word document read failed: {e}")

    async def _extract_text_content(self, file_path: Path) -> str:
        """Extract plain text file content"""
        try:
            # Try multiple encodings
            encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']

            for encoding in encodings:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    return self._clean_content(content)
                except UnicodeDecodeError:
                    continue

            raise ValueError("Unable to detect the file encoding")

        except Exception as e:
            logger.error(f"Text file read failed: {e}")
            raise ValueError(f"Text file read failed: {e}")

    def _clean_content(self, content: str) -> str:
        """Clean and truncate content"""
        import re

        # Remove excess whitespace
        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r' {2,}', ' ', content)

        # Truncate overly long content
        if len(content) > MAX_CONTENT_LENGTH:
            logger.warning(f"Document content too long; truncating to {MAX_CONTENT_LENGTH} characters")
            content = content[:MAX_CONTENT_LENGTH] + "\n\n[Content truncated...]"

        return content.strip()

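    # For instance, per the two regexes above (a hedged illustration):
    #   "a\n\n\n\nb"  ->  "a\n\nb"   (runs of 3+ newlines collapse to 2)
    #   "x    y"      ->  "x y"      (runs of 2+ spaces collapse to 1)
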
    async def _call_ai_analysis(
        self,
        content: str,
        course_title: str
    ) -> AIResponse:
        """Call the AI to analyze knowledge points"""
        # Build the messages
        user_message = USER_PROMPT.format(
            course_name=course_title,
            content=content
        )

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message}
        ]

        # Call the AI
        response = await self.ai_service.chat(
            messages=messages,
            temperature=0.1,  # Low temperature to keep the output stable
            prompt_name="knowledge_analysis"
        )

        return response

    def _parse_knowledge_points(self, ai_output: str) -> List[Dict[str, Any]]:
        """
        Parse the knowledge point JSON emitted by the AI

        Uses the LLM JSON parser with multi-layer fallback parsing
        """
        # Clean the output first
        cleaned_output, rules = clean_llm_output(ai_output)
        if rules:
            logger.debug(f"AI output cleaned: {rules}")

        # Parse with schema validation
        knowledge_points = parse_with_fallback(
            cleaned_output,
            schema=KNOWLEDGE_POINT_SCHEMA,
            default=[],
            validate_schema=True,
            on_error="default"
        )

        # Post-process: ensure each knowledge point has the required fields
        processed_points = []
        for i, kp in enumerate(knowledge_points):
            if i >= MAX_KNOWLEDGE_POINTS:
                logger.warning(f"Knowledge point count exceeds the limit of {MAX_KNOWLEDGE_POINTS}; truncating")
                break

            if isinstance(kp, dict):
                # Extract fields (tolerating multiple field-name variants)
                title = (
                    kp.get('title') or
                    kp.get('name') or
                    kp.get('知识点名称') or
                    f"Knowledge point {i + 1}"
                )
                content = (
                    kp.get('content') or
                    kp.get('description') or
                    kp.get('知识点描述') or
                    ''
                )
                kp_type = (
                    kp.get('type') or
                    kp.get('知识点类型') or
                    DEFAULT_KNOWLEDGE_TYPE
                )
                topic_relation = (
                    kp.get('topic_relation') or
                    kp.get('关系描述') or
                    ''
                )

                if title and (content or topic_relation):
                    processed_points.append({
                        'title': title[:200],  # Limit length
                        'content': content,
                        'type': kp_type,
                        'topic_relation': topic_relation,
                    })

        if not processed_points:
            logger.warning("No valid knowledge points were parsed")

        return processed_points

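    # Illustrative normalization of one model-emitted item (input values are
    # hypothetical; the Chinese keys are the variants tolerated above):
    #   {"知识点名称": "A", "知识点描述": "B", "知识点类型": "C"}
    # becomes
    #   {"title": "A", "content": "B", "type": "C", "topic_relation": ""}
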
    async def _delete_old_knowledge_points(
        self,
        db: AsyncSession,
        material_id: int
    ) -> int:
        """Delete old knowledge points associated with the material"""
        try:
            from sqlalchemy import text

            result = await db.execute(
                text("DELETE FROM knowledge_points WHERE material_id = :material_id"),
                {"material_id": material_id}
            )
            await db.commit()

            deleted_count = result.rowcount
            if deleted_count > 0:
                logger.info(f"Deleted old knowledge points: material_id={material_id}, count={deleted_count}")

            return deleted_count

        except Exception as e:
            logger.error(f"Failed to delete old knowledge points: {e}")
            await db.rollback()
            raise

    async def _save_knowledge_points(
        self,
        db: AsyncSession,
        course_id: int,
        material_id: int,
        knowledge_points: List[Dict[str, Any]],
        user_id: int
    ) -> int:
        """Save knowledge points to the database"""
        from app.services.course_service import knowledge_point_service

        saved_count = 0

        for kp_data in knowledge_points:
            try:
                kp_create = KnowledgePointCreate(
                    name=kp_data['title'],
                    description=kp_data.get('content', ''),
                    type=kp_data.get('type', DEFAULT_KNOWLEDGE_TYPE),
                    source=1,  # Source: AI analysis
                    topic_relation=kp_data.get('topic_relation'),
                    material_id=material_id
                )

                await knowledge_point_service.create_knowledge_point(
                    db=db,
                    course_id=course_id,
                    point_in=kp_create,
                    created_by=user_id
                )
                saved_count += 1

            except Exception as e:
                logger.warning(
                    f"Failed to save a knowledge point: title={kp_data.get('title')}, error={e}"
                )
                continue

        return saved_count

    async def reanalyze_course_materials(
        self,
        db: AsyncSession,
        course_id: int,
        course_title: str,
        user_id: int
    ) -> Dict[str, Any]:
        """
        Re-analyze all materials of a course

        Args:
            db: Database session
            course_id: Course ID
            course_title: Course title
            user_id: User ID

        Returns:
            Aggregated analysis results
        """
        try:
            from app.services.course_service import course_service

            # Fetch all of the course's materials
            materials = await course_service.get_course_materials(db, course_id=course_id)

            if not materials:
                return {
                    "success": True,
                    "message": "This course has no materials to analyze",
                    "materials_count": 0,
                    "knowledge_points_count": 0
                }

            total_knowledge_points = 0
            analysis_results = []

            for material in materials:
                try:
                    result = await self.analyze_course_material(
                        db=db,
                        course_id=course_id,
                        material_id=material.id,
                        file_url=material.file_url,
                        course_title=course_title,
                        user_id=user_id
                    )

                    kp_count = result.get('knowledge_points_count', 0)
                    total_knowledge_points += kp_count

                    analysis_results.append({
                        "material_id": material.id,
                        "material_name": material.name,
                        "success": True,
                        "knowledge_points_count": kp_count
                    })

                except Exception as e:
                    logger.error(
                        f"Material analysis failed: material_id={material.id}, error={e}"
                    )
                    analysis_results.append({
                        "material_id": material.id,
                        "material_name": material.name,
                        "success": False,
                        "error": str(e)
                    })

            success_count = sum(1 for r in analysis_results if r['success'])

            logger.info(
                f"Course material re-analysis complete - course_id: {course_id}, "
                f"materials: {len(materials)}, success: {success_count}, "
                f"total_knowledge_points: {total_knowledge_points}"
            )

            return {
                "success": True,
                "materials_count": len(materials),
                "success_count": success_count,
                "knowledge_points_count": total_knowledge_points,
                "analysis_results": analysis_results
            }

        except Exception as e:
            logger.error(
                f"Course material re-analysis failed - course_id: {course_id}, error: {e}",
                exc_info=True
            )
            raise ExternalServiceError(f"Re-analysis failed: {e}")


# Create a global service instance
knowledge_analysis_service_v2 = KnowledgeAnalysisServiceV2()
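
# Minimal usage sketch (commented out; illustrative only). It assumes a FastAPI
# router and a `get_db` async-session dependency exist elsewhere in the project;
# the endpoint path and `get_db` are assumptions, not part of this module.
#
# from fastapi import APIRouter, Depends
#
# router = APIRouter()
#
# @router.post("/courses/{course_id}/materials/{material_id}/analyze-knowledge")
# async def analyze_material_knowledge(
#     course_id: int,
#     material_id: int,
#     file_url: str,
#     course_title: str,
#     db: AsyncSession = Depends(get_db),
# ):
#     return await knowledge_analysis_service_v2.analyze_course_material(
#         db=db,
#         course_id=course_id,
#         material_id=material_id,
#         file_url=file_url,
#         course_title=course_title,
#         user_id=1,  # placeholder; real code would derive this from auth
#     )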