feat: 初始化考培练系统项目

- 从服务器拉取完整代码
- 按框架规范整理项目结构
- 配置 Drone CI 测试环境部署
- 包含后端(FastAPI)、前端(Vue3)、管理端

技术栈: Vue3 + TypeScript + FastAPI + MySQL
This commit is contained in:
111
2026-01-24 19:33:28 +08:00
commit 998211c483
1197 changed files with 228429 additions and 0 deletions

View File

@@ -0,0 +1,707 @@
"""
LLM JSON Parser - 大模型 JSON 输出解析器
功能:
- 使用 json-repair 库修复 AI 输出的 JSON
- 处理中文标点、尾部逗号、Python 风格等问题
- Schema 校验确保数据完整性
使用示例:
```python
from app.services.ai.llm_json_parser import parse_llm_json, parse_with_fallback
# 简单解析
result = parse_llm_json(ai_response)
# 带 Schema 校验和默认值
result = parse_with_fallback(
ai_response,
schema=MY_SCHEMA,
default=[]
)
```
"""
import json
import re
import logging
from typing import Any, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
# 尝试导入 json-repair
try:
from json_repair import loads as json_repair_loads
from json_repair import repair_json
HAS_JSON_REPAIR = True
except ImportError:
HAS_JSON_REPAIR = False
logger.warning("json-repair 未安装,将使用内置修复逻辑")
# 尝试导入 jsonschema
try:
from jsonschema import validate, ValidationError, Draft7Validator
HAS_JSONSCHEMA = True
except ImportError:
HAS_JSONSCHEMA = False
logger.warning("jsonschema 未安装,将跳过 Schema 校验")
# ==================== 异常类 ====================
class JSONParseError(Exception):
    """Base class for errors raised while parsing JSON.

    Attributes:
        raw_text: The text that was being parsed ("" when not supplied).
        issues: Issue records attached to the error; an empty list when the
            caller passes nothing (or any other falsy value).
    """

    def __init__(self, message: str, raw_text: str = "", issues: List[dict] = None):
        super(JSONParseError, self).__init__(message)
        # A falsy ``issues`` (None or an empty list) is replaced by a new list
        # so that instances never share a default container.
        self.issues = issues if issues else []
        self.raw_text = raw_text
class JSONUnrecoverableError(JSONParseError):
    """Signals JSON input that could not be recovered.

    Adds no behaviour of its own; it only gives callers a more specific type
    to catch than ``JSONParseError``.
    """
# ==================== 解析结果 ====================
@dataclass
class ParseResult:
    """Outcome of a single JSON parsing attempt."""

    # Whether parsing produced data.
    success: bool
    # The decoded JSON value; stays None when nothing was decoded.
    data: Any = None
    # Label of the strategy that produced this outcome, e.g.
    # direct / json_repair / preprocessed / fixed / completed / failed.
    method: str = ""
    # Records describing the repairs applied or the problems detected.
    issues: List[dict] = field(default_factory=list)
    # The input text the result refers to.
    raw_text: str = ""
    # Human-readable failure description; empty on success.
    error: str = ""
# ==================== 核心解析函数 ====================
def parse_llm_json(
    text: str,
    *,
    strict: bool = False,
    return_result: bool = False
) -> Union[Any, ParseResult]:
    """
    Parse JSON out of raw LLM output, repairing it when necessary.

    Strategies are tried in order and the first one that succeeds wins:

    1. ``json.loads`` on the stripped text (method ``"direct"``).
    2. The json-repair library on the stripped text, when it is installed
       (method ``"json_repair"``).
    3. ``json.loads`` and then json-repair on the output of
       ``_preprocess_text`` (methods ``"preprocessed"`` / ``"json_repair"``).
    4. ``json.loads`` on the output of ``_fix_json_format`` (method ``"fixed"``).
    5. ``json.loads`` on the output of ``_try_complete_json``
       (method ``"completed"``).

    Args:
        text: Raw text to parse.
        strict: When True, only strategy 1 is attempted.
        return_result: When True, return a ``ParseResult`` (also on failure)
            instead of the decoded data.

    Returns:
        The decoded JSON value, or a ``ParseResult`` if ``return_result=True``.

    Raises:
        JSONUnrecoverableError: The input is empty, strict mode failed, or
            every strategy failed (only when ``return_result`` is False).
    """
    if not text or not text.strip():
        if return_result:
            return ParseResult(success=False, error="Empty input")
        raise JSONUnrecoverableError("Empty input", text)

    text = text.strip()
    issues: List[dict] = []

    def _finish(data: Any, method: str) -> Union[Any, ParseResult]:
        # Single place that shapes a successful return value.
        if return_result:
            return ParseResult(success=True, data=data, method=method, issues=issues, raw_text=text)
        return data

    def _repair(candidate: str) -> Tuple[bool, Any]:
        # Run json-repair and report (succeeded, data).
        if not HAS_JSON_REPAIR:
            return False, None
        try:
            repaired = json_repair_loads(candidate)
        except Exception as e:
            logger.debug("json-repair 修复失败: %s", e)
            return False, None
        # json-repair returns an empty string instead of raising when it finds
        # nothing recoverable. A genuine JSON "" literal is already handled by
        # the direct parse, so "" here means the repair failed and the
        # remaining strategies must still get their chance.
        if repaired == "":
            return False, None
        return True, repaired

    # Layer 1: direct parse.
    try:
        return _finish(json.loads(text), "direct")
    except json.JSONDecodeError:
        pass

    if strict:
        if return_result:
            return ParseResult(success=False, error="Strict mode: direct parse failed", raw_text=text)
        raise JSONUnrecoverableError("Strict mode: direct parse failed", text)

    # Layer 2: json-repair on the raw text (preferred).
    ok, data = _repair(text)
    if ok:
        issues.append({"type": "json_repair", "action": "Auto-repaired by json-repair library"})
        return _finish(data, "json_repair")

    # Layer 3: preprocessing (code-block extraction, surrounding prose removal).
    preprocessed = _preprocess_text(text)
    if preprocessed != text:
        try:
            data = json.loads(preprocessed)
        except json.JSONDecodeError:
            pass
        else:
            issues.append({"type": "preprocessed", "action": "Extracted JSON from text"})
            return _finish(data, "preprocessed")

        # Give json-repair a second try on the extracted text.
        ok, data = _repair(preprocessed)
        if ok:
            issues.append({"type": "json_repair_preprocessed", "action": "Repaired after preprocessing"})
            return _finish(data, "json_repair")

    # Layer 4: built-in format fixes.
    fixed, fix_issues = _fix_json_format(preprocessed)
    issues.extend(fix_issues)
    if fixed != preprocessed:
        try:
            return _finish(json.loads(fixed), "fixed")
        except json.JSONDecodeError:
            pass

    # Layer 5: try to complete truncated JSON.
    completed = _try_complete_json(fixed)
    if completed:
        try:
            data = json.loads(completed)
        except json.JSONDecodeError:
            pass
        else:
            issues.append({"type": "completed", "action": "Auto-completed truncated JSON"})
            return _finish(data, "completed")

    # Every strategy failed: attach a diagnosis to the failure.
    diagnosis = diagnose_json_error(fixed)
    if return_result:
        return ParseResult(
            success=False,
            method="failed",
            issues=issues + diagnosis.get("issues", []),
            raw_text=text,
            error=f"All parse attempts failed. Issues: {diagnosis}"
        )
    raise JSONUnrecoverableError(f"All parse attempts failed: {diagnosis}", text, issues)
def parse_with_fallback(
    raw_text: str,
    schema: dict = None,
    default: Any = None,
    *,
    validate_schema: bool = True,
    on_error: str = "default"  # "default" / "raise" / "none"
) -> Any:
    """
    Parse JSON with a configurable fallback instead of a hard failure.

    Args:
        raw_text: Raw text to parse.
        schema: Optional JSON Schema the parsed data is checked against.
        default: Value returned on failure when ``on_error`` is not
            ``"raise"`` or ``"none"``.
        validate_schema: Whether to run the schema check (it also requires a
            truthy ``schema`` and an installed jsonschema package).
        on_error: ``"raise"`` re-raises, ``"none"`` returns None, anything
            else returns ``default``.

    Returns:
        The parsed data, or the fallback value selected by ``on_error``.
    """
    def _fallback() -> Any:
        # Non-raising failure value; callers handle "raise" before this.
        return None if on_error == "none" else default

    try:
        outcome = parse_llm_json(raw_text, return_result=True)
        if not outcome.success:
            logger.warning(f"JSON 解析失败: {outcome.error}")
            if on_error == "raise":
                raise JSONUnrecoverableError(outcome.error, raw_text, outcome.issues)
            return _fallback()

        parsed = outcome.data

        # Schema validation.
        if validate_schema and schema and HAS_JSONSCHEMA:
            is_valid, errors = validate_json_schema(parsed, schema)
            if not is_valid:
                logger.warning(f"Schema 校验失败: {errors}")
                if on_error == "raise":
                    raise JSONUnrecoverableError(f"Schema validation failed: {errors}", raw_text)
                return _fallback()

        # Record which strategy was needed whenever it was not a plain parse.
        if outcome.method != "direct":
            logger.info(f"JSON 解析成功: method={outcome.method}, issues={outcome.issues}")
        return parsed
    except Exception as e:
        # Errors raised above for on_error="raise" also pass through here,
        # so they are logged and then re-raised.
        logger.error(f"JSON 解析异常: {e}")
        if on_error == "raise":
            raise
        return _fallback()
# ==================== 预处理函数 ====================
def _preprocess_text(text: str) -> str:
    """Isolate the JSON payload: unwrap code fences and trim surrounding text."""
    # Drop a leading BOM, then every zero-width character anywhere in the text.
    cleaned = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text.lstrip('\ufeff'))

    # Fence patterns from most to least specific; the first one whose content
    # looks like JSON (starts with { or [) replaces the working text.
    fence_patterns = (
        r'```json\s*([\s\S]*?)\s*```',
        r'```\s*([\s\S]*?)\s*```',
        r'`([^`]+)`',
    )
    for fence in fence_patterns:
        found = re.search(fence, cleaned, re.IGNORECASE)
        if found is None:
            continue
        candidate = found.group(1).strip()
        if candidate.startswith(('{', '[')):
            cleaned = candidate
            break

    # Cut away anything before/after the JSON value itself.
    return _find_json_boundaries(cleaned).strip()
def _find_json_boundaries(text: str) -> str:
"""找到 JSON 的起止位置"""
# 找第一个 { 或 [
start = -1
for i, c in enumerate(text):
if c in '{[':
start = i
break
if start == -1:
return text
# 找最后一个匹配的 } 或 ]
depth = 0
end = -1
in_string = False
escape = False
for i in range(start, len(text)):
c = text[i]
if escape:
escape = False
continue
if c == '\\':
escape = True
continue
if c == '"':
in_string = not in_string
continue
if in_string:
continue
if c in '{[':
depth += 1
elif c in '}]':
depth -= 1
if depth == 0:
end = i + 1
break
if end == -1:
# 找最后一个 } 或 ]
for i in range(len(text) - 1, start, -1):
if text[i] in '}]':
end = i + 1
break
if end > start:
return text[start:end]
return text[start:]
# ==================== 修复函数 ====================
def _fix_json_format(text: str) -> Tuple[str, List[dict]]:
"""修复常见 JSON 格式问题"""
issues = []
# 1. 中文标点转英文
cn_punctuation = {
'': ',', '': '.', '': ':', '': ';',
'"': '"', '"': '"', ''': "'", ''': "'",
'': '[', '': ']', '': '(', '': ')',
'': '{', '': '}',
}
for cn, en in cn_punctuation.items():
if cn in text:
text = text.replace(cn, en)
issues.append({"type": "chinese_punctuation", "from": cn, "to": en})
# 2. 移除注释
if '//' in text:
text = re.sub(r'//[^\n]*', '', text)
issues.append({"type": "removed_comments", "style": "single-line"})
if '/*' in text:
text = re.sub(r'/\*[\s\S]*?\*/', '', text)
issues.append({"type": "removed_comments", "style": "multi-line"})
# 3. Python 风格转 JSON
python_replacements = [
(r'\bTrue\b', 'true'),
(r'\bFalse\b', 'false'),
(r'\bNone\b', 'null'),
]
for pattern, replacement in python_replacements:
if re.search(pattern, text):
text = re.sub(pattern, replacement, text)
issues.append({"type": "python_style", "from": pattern, "to": replacement})
# 4. 移除尾部逗号
trailing_comma_patterns = [
(r',(\s*})', r'\1'),
(r',(\s*\])', r'\1'),
]
for pattern, replacement in trailing_comma_patterns:
if re.search(pattern, text):
text = re.sub(pattern, replacement, text)
issues.append({"type": "trailing_comma", "action": "removed"})
# 5. 修复单引号(谨慎处理)
if text.count("'") > text.count('"') and re.match(r"^\s*\{?\s*'", text):
text = re.sub(r"'([^']*)'(\s*:)", r'"\1"\2', text)
text = re.sub(r":\s*'([^']*)'", r': "\1"', text)
issues.append({"type": "single_quotes", "action": "replaced"})
return text, issues
def _try_complete_json(text: str) -> Optional[str]:
"""尝试补全截断的 JSON"""
if not text:
return None
# 统计括号
stack = []
in_string = False
escape = False
for c in text:
if escape:
escape = False
continue
if c == '\\':
escape = True
continue
if c == '"':
in_string = not in_string
continue
if in_string:
continue
if c in '{[':
stack.append(c)
elif c == '}':
if stack and stack[-1] == '{':
stack.pop()
elif c == ']':
if stack and stack[-1] == '[':
stack.pop()
if not stack:
return None # 已经平衡了
# 如果在字符串中,先闭合字符串
if in_string:
text += '"'
# 补全括号
completion = ""
for bracket in reversed(stack):
if bracket == '{':
completion += '}'
elif bracket == '[':
completion += ']'
return text + completion
# ==================== Schema 校验 ====================
def validate_json_schema(data: Any, schema: dict) -> Tuple[bool, List[dict]]:
    """
    Check ``data`` against a JSON Schema using ``Draft7Validator``.

    Returns:
        ``(is_valid, errors)``. Each error record carries ``path``,
        ``message`` and ``validator``. When jsonschema is not installed the
        check is skipped and reported as valid; an exception raised during
        validation yields ``False`` with a single ``message`` record.
    """
    if not HAS_JSONSCHEMA:
        logger.warning("jsonschema 未安装,跳过校验")
        return True, []

    try:
        violations = []
        for problem in Draft7Validator(schema).iter_errors(data):
            violations.append({
                "path": list(problem.absolute_path),
                "message": problem.message,
                "validator": problem.validator
            })
    except Exception as e:
        return False, [{"message": str(e)}]

    return (not violations), violations
# ==================== 诊断函数 ====================
def diagnose_json_error(text: str) -> dict:
    """Inspect text that failed to parse and list the likely problems.

    The checks are plain substring/regex/count heuristics over the whole
    text; characters inside string values are counted as well.

    Args:
        text: The text to diagnose.

    Returns:
        For empty input: ``{"issues": [...], "fixable": False}``. Otherwise a
        dict with ``issues`` (records with ``type``, ``severity`` and
        ``suggestion``), ``issue_count``, ``fixable`` (True when every issue
        is of a type listed in ``fixable_types``) and ``severity`` (the
        highest severity found, ``"low"`` when there are no issues).
    """
    issues = []

    # Empty input: nothing else can be checked.
    if not text or not text.strip():
        issues.append({
            "type": "empty_input",
            "severity": "critical",
            "suggestion": "输入为空"
        })
        return {"issues": issues, "fixable": False}

    # Full-width / typographic punctuation.
    # NOTE(review): written as escapes because the literal characters were
    # lost from this list; reconstructed as comma, full stop (two candidates),
    # colon, semicolon and the curly quotes - confirm the intended set.
    cn_punctuation = [
        '\uff0c', '\u3002', '\uff0e', '\uff1a', '\uff1b',
        '\u201c', '\u201d', '\u2018', '\u2019',
    ]
    for p in cn_punctuation:
        if p in text:
            issues.append({
                "type": "chinese_punctuation",
                "char": p,
                "severity": "low",
                "suggestion": f"{p} 替换为对应英文标点"
            })

    # Markdown code fence.
    if '```' in text:
        issues.append({
            "type": "markdown_wrapped",
            "severity": "low",
            "suggestion": "需要提取代码块内容"
        })

    # Comments.
    if '//' in text or '/*' in text:
        issues.append({
            "type": "has_comments",
            "severity": "low",
            "suggestion": "需要移除注释"
        })

    # Python-style literals.
    if re.search(r'\b(True|False|None)\b', text):
        issues.append({
            "type": "python_style",
            "severity": "low",
            "suggestion": "将 True/False/None 转为 true/false/null"
        })

    # Trailing commas.
    if re.search(r',\s*[}\]]', text):
        issues.append({
            "type": "trailing_comma",
            "severity": "low",
            "suggestion": "移除 } 或 ] 前的逗号"
        })

    # Bracket balance (positive: closers missing, negative: closers in excess).
    open_braces = text.count('{') - text.count('}')
    open_brackets = text.count('[') - text.count(']')
    if open_braces > 0:
        issues.append({
            "type": "unclosed_brace",
            "count": open_braces,
            "severity": "medium",
            # Same wording as the bracket messages below: "<n> 个 <closer>".
            "suggestion": f"缺少 {open_braces} 个 }}"
        })
    elif open_braces < 0:
        issues.append({
            "type": "extra_brace",
            "count": -open_braces,
            "severity": "medium",
            "suggestion": f"多余 {-open_braces} 个 }}"
        })
    if open_brackets > 0:
        issues.append({
            "type": "unclosed_bracket",
            "count": open_brackets,
            "severity": "medium",
            "suggestion": f"缺少 {open_brackets} 个 ]"
        })
    elif open_brackets < 0:
        issues.append({
            "type": "extra_bracket",
            "count": -open_brackets,
            "severity": "medium",
            "suggestion": f"多余 {-open_brackets} 个 ]"
        })

    # Quote balance: an odd number of double quotes hints at an open string.
    quote_count = text.count('"')
    if quote_count % 2 != 0:
        issues.append({
            "type": "unbalanced_quotes",
            "severity": "high",
            "suggestion": "引号数量不平衡,可能有未闭合的字符串"
        })

    # The input counts as fixable only when every issue has a fixable type.
    fixable_types = {
        "chinese_punctuation", "markdown_wrapped", "has_comments",
        "python_style", "trailing_comma", "unclosed_brace", "unclosed_bracket"
    }
    fixable = all(i["type"] in fixable_types for i in issues)

    severity_rank = {"low": 1, "medium": 2, "high": 3, "critical": 4}
    return {
        "issues": issues,
        "issue_count": len(issues),
        "fixable": fixable,
        "severity": max(
            (i.get("severity", "low") for i in issues),
            key=lambda x: severity_rank.get(x, 0),
            default="low"
        )
    }
# ==================== 便捷函数 ====================
def safe_json_loads(text: str, default: Any = None) -> Any:
    """Parse ``text`` with ``parse_llm_json``; return ``default`` if it raises."""
    try:
        parsed = parse_llm_json(text)
    except Exception:
        # Any failure is swallowed: this helper exists to never raise.
        return default
    return parsed
def extract_json_from_text(text: str) -> Optional[str]:
    """Return a JSON string found in ``text`` that ``json.loads`` accepts.

    The text is preprocessed and format-fixed first; if that result does not
    load, a completed variant from ``_try_complete_json`` is tried. Returns
    None when neither candidate loads.
    """
    def _loads_ok(candidate: str) -> bool:
        # True when the candidate is accepted by json.loads.
        try:
            json.loads(candidate)
        except Exception:
            return False
        return True

    repaired, _ = _fix_json_format(_preprocess_text(text))
    if _loads_ok(repaired):
        return repaired

    completed = _try_complete_json(repaired)
    if completed and _loads_ok(completed):
        return completed
    return None
def clean_llm_output(text: str) -> Tuple[str, List[str]]:
    """
    Clean raw LLM output and report which cleaning rules were applied.

    Steps, in order: strip a leading BOM, strip ANSI colour sequences, trim
    whitespace, drop one polite opening phrase, unwrap a Markdown code block
    whose content starts with ``{`` or ``[``, and remove zero-width
    characters.

    Args:
        text: Raw output text.

    Returns:
        ``(cleaned_text, applied_rules)``. Empty input yields
        ``("", ["empty_input"])``.
    """
    if not text:
        return "", ["empty_input"]

    applied_rules = []

    # 1. Strip a leading BOM.
    if text.startswith('\ufeff'):
        text = text.lstrip('\ufeff')
        applied_rules.append("removed_bom")

    # 2. Strip ANSI escape sequences (SGR colour codes).
    ansi_pattern = re.compile(r'\x1b\[[0-9;]*m')
    if ansi_pattern.search(text):
        text = ansi_pattern.sub('', text)
        applied_rules.append("removed_ansi")

    # 3. Trim surrounding whitespace.
    text = text.strip()

    # 4. Drop one polite opening phrase. The character classes accept the
    # full-width comma (\uff0c) and colon (\uff1a) next to the ASCII forms,
    # since Chinese prose normally uses the full-width ones.
    polite_patterns = [
        r'^好的[,\uff0c。.]?\s*',
        r'^当然[,\uff0c。.]?\s*',
        r'^没问题[,\uff0c。.]?\s*',
        r'^根据您的要求[,\uff0c。.]?\s*',
        r'^以下是.*?[:\uff1a]\s*',
        r'^分析结果如下[:\uff1a]\s*',
        r'^我来为您.*?[:\uff1a]\s*',
        r'^这是.*?结果[:\uff1a]\s*',
    ]
    for pattern in polite_patterns:
        prefix = re.match(pattern, text, re.IGNORECASE)
        if prefix:
            text = text[prefix.end():]
            applied_rules.append("removed_polite_prefix")
            break

    # 5. Unwrap a Markdown code block that holds JSON.
    json_block_patterns = [
        r'```json\s*([\s\S]*?)\s*```',
        r'```\s*([\s\S]*?)\s*```',
    ]
    for pattern in json_block_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            extracted = match.group(1).strip()
            if extracted.startswith(('{', '[')):
                text = extracted
                applied_rules.append("extracted_code_block")
                break

    # 6. Remove zero-width characters.
    zero_width = re.compile(r'[\u200b\u200c\u200d\ufeff]')
    if zero_width.search(text):
        text = zero_width.sub('', text)
        applied_rules.append("removed_zero_width")

    return text.strip(), applied_rules