feat: Initialize the exam-training-practice system project
- Pull the complete codebase from the server
- Organize the project structure per framework conventions
- Configure Drone CI deployment for the test environment
- Includes backend (FastAPI), frontend (Vue3), and admin console

Tech stack: Vue3 + TypeScript + FastAPI + MySQL
548
backend/app/services/ai/knowledge_analysis_v2.py
Normal file
@@ -0,0 +1,548 @@
"""
Knowledge point analysis service V2 - native Python implementation

Features:
- Read document content (PDF/Word/TXT)
- Call the AI to analyze and extract knowledge points
- Parse the JSON result
- Write to the database

Provides stable, reliable knowledge point analysis.
"""

import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

from sqlalchemy.ext.asyncio import AsyncSession

from app.core.config import settings
from app.core.exceptions import ExternalServiceError
from app.schemas.course import KnowledgePointCreate

from .ai_service import AIService, AIResponse
from .llm_json_parser import parse_with_fallback, clean_llm_output
from .prompts.knowledge_analysis_prompts import (
    SYSTEM_PROMPT,
    USER_PROMPT,
    KNOWLEDGE_POINT_SCHEMA,
    DEFAULT_KNOWLEDGE_TYPE,
)

logger = logging.getLogger(__name__)

# Configuration constants
STATIC_UPLOADS_PREFIX = '/static/uploads/'
MAX_CONTENT_LENGTH = 100000  # Maximum document content length (characters)
MAX_KNOWLEDGE_POINTS = 20  # Maximum number of knowledge points


class KnowledgeAnalysisServiceV2:
    """
    Knowledge point analysis service V2

    Implemented natively in Python.

    Usage example:
    ```python
    service = KnowledgeAnalysisServiceV2()
    result = await service.analyze_course_material(
        db=db_session,
        course_id=1,
        material_id=10,
        file_url="/static/uploads/courses/1/doc.pdf",
        course_title="Medical Aesthetics Product Knowledge",
        user_id=1
    )
    ```
    """

    def __init__(self):
        """Initialize the service"""
        self.ai_service = AIService(module_code="knowledge_analysis")
        self.upload_path = getattr(settings, 'UPLOAD_PATH', 'uploads')

    async def analyze_course_material(
        self,
        db: AsyncSession,
        course_id: int,
        material_id: int,
        file_url: str,
        course_title: str,
        user_id: int
    ) -> Dict[str, Any]:
        """
        Analyze course material and extract knowledge points

        Args:
            db: Database session
            course_id: Course ID
            material_id: Material ID
            file_url: File URL (relative path)
            course_title: Course title
            user_id: User ID

        Returns:
            The analysis result, including success, knowledge_points_count, and related fields
        """
        try:
            logger.info(
                f"Starting knowledge point analysis V2 - course_id: {course_id}, material_id: {material_id}, "
                f"file_url: {file_url}"
            )

            # 1. Resolve the file path
            file_path = self._resolve_file_path(file_url)
            if not file_path.exists():
                raise FileNotFoundError(f"File does not exist: {file_path}")

            logger.info(f"File path resolved successfully: {file_path}")

            # 2. Extract the document content
            content = await self._extract_document_content(file_path)
            if not content or not content.strip():
                raise ValueError("Document content is empty")

            logger.info(f"Document content extracted, length: {len(content)} characters")

            # 3. Call AI analysis
            ai_response = await self._call_ai_analysis(content, course_title)

            logger.info(
                f"AI analysis complete - provider: {ai_response.provider}, "
                f"tokens: {ai_response.total_tokens}, latency: {ai_response.latency_ms}ms"
            )

            # 4. Parse the JSON result
            knowledge_points = self._parse_knowledge_points(ai_response.content)

            logger.info(f"Knowledge points parsed, count: {len(knowledge_points)}")

            # 5. Delete the old knowledge points
            await self._delete_old_knowledge_points(db, material_id)

            # 6. Save to the database
            saved_count = await self._save_knowledge_points(
                db=db,
                course_id=course_id,
                material_id=material_id,
                knowledge_points=knowledge_points,
                user_id=user_id
            )

            logger.info(
                f"Knowledge point analysis complete - course_id: {course_id}, material_id: {material_id}, "
                f"saved_count: {saved_count}"
            )

            return {
                "success": True,
                "status": "completed",
                "knowledge_points_count": saved_count,
                "ai_provider": ai_response.provider,
                "ai_model": ai_response.model,
                "ai_tokens": ai_response.total_tokens,
                "ai_latency_ms": ai_response.latency_ms,
            }

        except FileNotFoundError as e:
            logger.error(f"File not found: {e}")
            raise ExternalServiceError(f"Analysis file not found: {e}")
        except ValueError as e:
            logger.error(f"Invalid parameter: {e}")
            raise ExternalServiceError(f"Invalid analysis parameter: {e}")
        except Exception as e:
            logger.error(
                f"Knowledge point analysis failed - course_id: {course_id}, material_id: {material_id}, "
                f"error: {e}",
                exc_info=True
            )
            raise ExternalServiceError(f"Knowledge point analysis failed: {e}")

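    # Illustrative success payload for the method above; the concrete values
    # (provider, model, counts) are hypothetical, not taken from the source:
    # {
    #     "success": True, "status": "completed", "knowledge_points_count": 8,
    #     "ai_provider": "openai", "ai_model": "gpt-4o-mini",
    #     "ai_tokens": 5321, "ai_latency_ms": 2340,
    # }
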
    def _resolve_file_path(self, file_url: str) -> Path:
        """Resolve a file URL to a local path"""
        if file_url.startswith(STATIC_UPLOADS_PREFIX):
            relative_path = file_url.replace(STATIC_UPLOADS_PREFIX, '')
            return Path(self.upload_path) / relative_path
        elif file_url.startswith('/'):
            # Absolute path
            return Path(file_url)
        else:
            # Relative path
            return Path(self.upload_path) / file_url

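    # Example mappings, a sketch assuming settings.UPLOAD_PATH == "uploads"
    # (the getattr default in __init__); the paths shown are hypothetical:
    #   "/static/uploads/courses/1/doc.pdf"  ->  uploads/courses/1/doc.pdf
    #   "/srv/data/doc.pdf"                  ->  /srv/data/doc.pdf  (absolute, used as-is)
    #   "courses/1/doc.pdf"                  ->  uploads/courses/1/doc.pdf
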
    async def _extract_document_content(self, file_path: Path) -> str:
        """
        Extract document content

        Supports: PDF, Word (docx), and plain text files
        """
        suffix = file_path.suffix.lower()

        try:
            if suffix == '.pdf':
                return await self._extract_pdf_content(file_path)
            elif suffix in ['.docx', '.doc']:
                return await self._extract_docx_content(file_path)
            elif suffix in ['.txt', '.md', '.text']:
                return await self._extract_text_content(file_path)
            else:
                # Try reading as plain text
                return await self._extract_text_content(file_path)
        except Exception as e:
            logger.error(f"Document content extraction failed: {file_path}, error: {e}")
            raise ValueError(f"Unable to read document content: {e}")

    async def _extract_pdf_content(self, file_path: Path) -> str:
        """Extract PDF content"""
        try:
            from PyPDF2 import PdfReader

            reader = PdfReader(str(file_path))
            text_parts = []

            for page in reader.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)

            content = '\n'.join(text_parts)

            # Clean and truncate
            content = self._clean_content(content)

            return content

        except ImportError:
            logger.error("PyPDF2 is not installed; cannot read PDF")
            raise ValueError("The PDF reading component is not installed on the server")
        except Exception as e:
            logger.error(f"PDF read failed: {e}")
            raise ValueError(f"PDF read failed: {e}")

    async def _extract_docx_content(self, file_path: Path) -> str:
        """Extract Word document content"""
        try:
            from docx import Document

            doc = Document(str(file_path))
            text_parts = []

            for para in doc.paragraphs:
                if para.text.strip():
                    text_parts.append(para.text)

            # Also extract table content
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        if cell.text.strip():
                            text_parts.append(cell.text)

            content = '\n'.join(text_parts)
            content = self._clean_content(content)

            return content

        except ImportError:
            logger.error("python-docx is not installed; cannot read Word documents")
            raise ValueError("The Word reading component is not installed on the server")
        except Exception as e:
            logger.error(f"Word document read failed: {e}")
            raise ValueError(f"Word document read failed: {e}")

    async def _extract_text_content(self, file_path: Path) -> str:
        """Extract plain text file content"""
        try:
            # Try multiple encodings
            encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1']

            for encoding in encodings:
                try:
                    with open(file_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    return self._clean_content(content)
                except UnicodeDecodeError:
                    continue

            raise ValueError("Unable to detect the file encoding")

        except Exception as e:
            logger.error(f"Text file read failed: {e}")
            raise ValueError(f"Text file read failed: {e}")

    def _clean_content(self, content: str) -> str:
        """Clean and truncate content"""
        import re

        # Remove excess whitespace
        content = re.sub(r'\n{3,}', '\n\n', content)
        content = re.sub(r' {2,}', ' ', content)

        # Truncate overly long content
        if len(content) > MAX_CONTENT_LENGTH:
            logger.warning(f"Document content too long; truncating to {MAX_CONTENT_LENGTH} characters")
            content = content[:MAX_CONTENT_LENGTH] + "\n\n[Content truncated...]"

        return content.strip()

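    # For instance, per the two regexes above (a hedged illustration):
    #   "a\n\n\n\nb"  ->  "a\n\nb"   (runs of 3+ newlines collapse to 2)
    #   "x    y"      ->  "x y"      (runs of 2+ spaces collapse to 1)
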
    async def _call_ai_analysis(
        self,
        content: str,
        course_title: str
    ) -> AIResponse:
        """Call the AI to analyze knowledge points"""
        # Build the messages
        user_message = USER_PROMPT.format(
            course_name=course_title,
            content=content
        )

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message}
        ]

        # Call the AI
        response = await self.ai_service.chat(
            messages=messages,
            temperature=0.1,  # Low temperature to keep the output stable
            prompt_name="knowledge_analysis"
        )

        return response

    def _parse_knowledge_points(self, ai_output: str) -> List[Dict[str, Any]]:
        """
        Parse the knowledge point JSON emitted by the AI

        Uses the LLM JSON parser with multi-layer fallback parsing
        """
        # Clean the output first
        cleaned_output, rules = clean_llm_output(ai_output)
        if rules:
            logger.debug(f"AI output cleaned: {rules}")

        # Parse with schema validation
        knowledge_points = parse_with_fallback(
            cleaned_output,
            schema=KNOWLEDGE_POINT_SCHEMA,
            default=[],
            validate_schema=True,
            on_error="default"
        )

        # Post-process: ensure each knowledge point has the required fields
        processed_points = []
        for i, kp in enumerate(knowledge_points):
            if i >= MAX_KNOWLEDGE_POINTS:
                logger.warning(f"Knowledge point count exceeds the limit of {MAX_KNOWLEDGE_POINTS}; truncating")
                break

            if isinstance(kp, dict):
                # Extract fields (tolerating multiple field-name variants)
                title = (
                    kp.get('title') or
                    kp.get('name') or
                    kp.get('知识点名称') or
                    f"Knowledge point {i + 1}"
                )
                content = (
                    kp.get('content') or
                    kp.get('description') or
                    kp.get('知识点描述') or
                    ''
                )
                kp_type = (
                    kp.get('type') or
                    kp.get('知识点类型') or
                    DEFAULT_KNOWLEDGE_TYPE
                )
                topic_relation = (
                    kp.get('topic_relation') or
                    kp.get('关系描述') or
                    ''
                )

                if title and (content or topic_relation):
                    processed_points.append({
                        'title': title[:200],  # Limit length
                        'content': content,
                        'type': kp_type,
                        'topic_relation': topic_relation,
                    })

        if not processed_points:
            logger.warning("No valid knowledge points were parsed")

        return processed_points

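    # Illustrative normalization of one model-emitted item (input values are
    # hypothetical; the Chinese keys are the variants tolerated above):
    #   {"知识点名称": "A", "知识点描述": "B", "知识点类型": "C"}
    # becomes
    #   {"title": "A", "content": "B", "type": "C", "topic_relation": ""}
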
    async def _delete_old_knowledge_points(
        self,
        db: AsyncSession,
        material_id: int
    ) -> int:
        """Delete old knowledge points associated with the material"""
        try:
            from sqlalchemy import text

            result = await db.execute(
                text("DELETE FROM knowledge_points WHERE material_id = :material_id"),
                {"material_id": material_id}
            )
            await db.commit()

            deleted_count = result.rowcount
            if deleted_count > 0:
                logger.info(f"Deleted old knowledge points: material_id={material_id}, count={deleted_count}")

            return deleted_count

        except Exception as e:
            logger.error(f"Failed to delete old knowledge points: {e}")
            await db.rollback()
            raise

    async def _save_knowledge_points(
        self,
        db: AsyncSession,
        course_id: int,
        material_id: int,
        knowledge_points: List[Dict[str, Any]],
        user_id: int
    ) -> int:
        """Save knowledge points to the database"""
        from app.services.course_service import knowledge_point_service

        saved_count = 0

        for kp_data in knowledge_points:
            try:
                kp_create = KnowledgePointCreate(
                    name=kp_data['title'],
                    description=kp_data.get('content', ''),
                    type=kp_data.get('type', DEFAULT_KNOWLEDGE_TYPE),
                    source=1,  # Source: AI analysis
                    topic_relation=kp_data.get('topic_relation'),
                    material_id=material_id
                )

                await knowledge_point_service.create_knowledge_point(
                    db=db,
                    course_id=course_id,
                    point_in=kp_create,
                    created_by=user_id
                )
                saved_count += 1

            except Exception as e:
                logger.warning(
                    f"Failed to save a knowledge point: title={kp_data.get('title')}, error={e}"
                )
                continue

        return saved_count

    async def reanalyze_course_materials(
        self,
        db: AsyncSession,
        course_id: int,
        course_title: str,
        user_id: int
    ) -> Dict[str, Any]:
        """
        Re-analyze all materials of a course

        Args:
            db: Database session
            course_id: Course ID
            course_title: Course title
            user_id: User ID

        Returns:
            Aggregated analysis results
        """
        try:
            from app.services.course_service import course_service

            # Fetch all of the course's materials
            materials = await course_service.get_course_materials(db, course_id=course_id)

            if not materials:
                return {
                    "success": True,
                    "message": "This course has no materials to analyze",
                    "materials_count": 0,
                    "knowledge_points_count": 0
                }

            total_knowledge_points = 0
            analysis_results = []

            for material in materials:
                try:
                    result = await self.analyze_course_material(
                        db=db,
                        course_id=course_id,
                        material_id=material.id,
                        file_url=material.file_url,
                        course_title=course_title,
                        user_id=user_id
                    )

                    kp_count = result.get('knowledge_points_count', 0)
                    total_knowledge_points += kp_count

                    analysis_results.append({
                        "material_id": material.id,
                        "material_name": material.name,
                        "success": True,
                        "knowledge_points_count": kp_count
                    })

                except Exception as e:
                    logger.error(
                        f"Material analysis failed: material_id={material.id}, error={e}"
                    )
                    analysis_results.append({
                        "material_id": material.id,
                        "material_name": material.name,
                        "success": False,
                        "error": str(e)
                    })

            success_count = sum(1 for r in analysis_results if r['success'])

            logger.info(
                f"Course material re-analysis complete - course_id: {course_id}, "
                f"materials: {len(materials)}, success: {success_count}, "
                f"total_knowledge_points: {total_knowledge_points}"
            )

            return {
                "success": True,
                "materials_count": len(materials),
                "success_count": success_count,
                "knowledge_points_count": total_knowledge_points,
                "analysis_results": analysis_results
            }

        except Exception as e:
            logger.error(
                f"Course material re-analysis failed - course_id: {course_id}, error: {e}",
                exc_info=True
            )
            raise ExternalServiceError(f"Re-analysis failed: {e}")


# Create a global service instance
knowledge_analysis_service_v2 = KnowledgeAnalysisServiceV2()
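
# Minimal usage sketch (commented out; illustrative only). It assumes a FastAPI
# router and a `get_db` async-session dependency exist elsewhere in the project;
# the endpoint path and `get_db` are assumptions, not part of this module.
#
# from fastapi import APIRouter, Depends
#
# router = APIRouter()
#
# @router.post("/courses/{course_id}/materials/{material_id}/analyze-knowledge")
# async def analyze_material_knowledge(
#     course_id: int,
#     material_id: int,
#     file_url: str,
#     course_title: str,
#     db: AsyncSession = Depends(get_db),
# ):
#     return await knowledge_analysis_service_v2.analyze_course_material(
#         db=db,
#         course_id=course_id,
#         material_id=material_id,
#         file_url=file_url,
#         course_title=course_title,
#         user_id=1,  # placeholder; real code would derive this from auth
#     )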