Files
012-kaopeilian/backend/app/core/sanitize.py
yuliang_guo 2f47193059
All checks were successful
continuous-integration/drone/push Build is passing
feat: 集成MinIO对象存储服务
- 新增storage_service.py封装MinIO操作
- 修改upload.py使用storage_service上传文件
- 修改course_service.py使用storage_service删除文件
- 适配preview.py支持从MinIO获取文件
- 适配knowledge_analysis_v2.py支持MinIO存储
- 在config.py添加MinIO配置项
- 添加minio依赖到requirements.txt

支持特性:
- 自动降级到本地存储(MinIO不可用时)
- 保持URL格式兼容(/static/uploads/)
- 文件自动缓存到本地(用于预览和分析)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-03 14:06:22 +08:00

137 lines
3.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
输入清理和XSS防护工具
"""
import re
import html
from typing import Optional
# Tag names whose opening and closing tags are removed by sanitize_html.
DANGEROUS_TAGS = [
    'script', 'iframe', 'object', 'embed', 'form', 'input',
    'textarea', 'button', 'select', 'style', 'link', 'meta',
    'base', 'applet', 'frame', 'frameset', 'layer', 'ilayer',
    'bgsound', 'xml', 'blink', 'marquee'
]

# Attribute names (mostly inline event handlers) that sanitize_html removes
# together with their value.
DANGEROUS_ATTRS = [
    'onclick', 'ondblclick', 'onmousedown', 'onmouseup', 'onmouseover',
    'onmousemove', 'onmouseout', 'onkeypress', 'onkeydown', 'onkeyup',
    'onload', 'onerror', 'onabort', 'onblur', 'onchange', 'onfocus',
    'onreset', 'onsubmit', 'onunload', 'onbeforeunload', 'onresize',
    'onscroll', 'ondrag', 'ondragend', 'ondragenter', 'ondragleave',
    'ondragover', 'ondragstart', 'ondrop', 'onmousewheel', 'onwheel',
    'oncopy', 'oncut', 'onpaste', 'oncontextmenu', 'oninput', 'oninvalid',
    'onsearch', 'onselect', 'ontoggle', 'formaction', 'xlink:href'
]

# URL scheme prefixes that sanitize_html deletes wherever they appear.
_DANGEROUS_PROTOCOL_PATTERNS = (
    r'javascript\s*:',
    r'data\s*:\s*text/html',
    r'vbscript\s*:',
)


def _sanitize_patterns():
    """Compile the removal patterns from the current blacklist contents."""
    sources = []
    # An empty alternation would match every tag/attribute, so a list that
    # has been emptied at runtime simply contributes no pattern.
    if DANGEROUS_TAGS:
        tag_names = '|'.join(re.escape(tag) for tag in DANGEROUS_TAGS)
        # One pattern covers opening and closing tags. ``\b`` keeps
        # look-alike names such as <formula> or <selection> intact, and
        # ``[^>]*`` lets closing tags like "</script >" match as well.
        sources.append(rf'</?(?:{tag_names})\b[^>]*>')
    if DANGEROUS_ATTRS:
        attr_names = '|'.join(re.escape(attr) for attr in DANGEROUS_ATTRS)
        # The look-behind rejects names that merely end with a blacklisted
        # attribute (e.g. data-onclick). The value is matched as a whole
        # double-quoted, single-quoted or unquoted token so that the other
        # quote character inside a value, or the tag's closing ">", is
        # never split off or swallowed.
        sources.append(
            rf'\s*(?<![\w-])(?:{attr_names})\s*=\s*'
            r'''(?:"[^"]*"|'[^']*'|[^\s>]*)'''
        )
    sources.extend(_DANGEROUS_PROTOCOL_PATTERNS)
    return [re.compile(source, re.IGNORECASE) for source in sources]


def sanitize_html(text: Optional[str]) -> Optional[str]:
    """
    Remove blacklisted HTML tags, attributes and URL schemes from text.

    Only the tags themselves are deleted; the content between an opening
    and a closing tag is kept. Removal is repeated until the text no longer
    changes, so fragments that only become dangerous after an inner match
    is deleted (e.g. ``<scr<script>ipt>``) are removed too.

    This is a regex blacklist, not an HTML parser; use escape_html when the
    value must never be interpreted as markup.

    Args:
        text: Input text. ``None`` and non-string values are returned
            unchanged.

    Returns:
        The cleaned text.
    """
    if text is None:
        return None
    if not isinstance(text, str):
        return text

    patterns = _sanitize_patterns()
    result = text
    previous = None
    # Every pattern needs at least one literal character, so each pass that
    # changes the text makes it strictly shorter; the loop always terminates.
    while result != previous:
        previous = result
        for pattern in patterns:
            result = pattern.sub('', result)
    return result
def escape_html(text: Optional[str]) -> Optional[str]:
    """
    Escape the HTML special characters in a string.

    Args:
        text: Input text. ``None`` and non-string values are returned
            unchanged.

    Returns:
        The text with ``&``, ``<``, ``>`` and both quote characters
        replaced by HTML entities.
    """
    if isinstance(text, str):
        # quote=True escapes " and ' as well, so the result is also safe
        # inside a quoted attribute value.
        return html.escape(text, quote=True)
    # None and any other non-string value pass straight through.
    return text
def strip_tags(text: Optional[str]) -> Optional[str]:
    """
    Remove every HTML tag from the text.

    Besides complete ``<...>`` tags, an unterminated tag at the end of the
    text (a ``<`` that starts a tag but has no closing ``>``) is removed as
    well; otherwise it could be completed by whatever markup follows the
    value once it is embedded in a page. A ``<`` that does not start a tag,
    as in ``1 < 2``, is left alone.

    Args:
        text: Input text. ``None`` and non-string values are returned
            unchanged.

    Returns:
        The text with all tags removed.
    """
    if text is None:
        return None
    if not isinstance(text, str):
        return text
    # Remove all complete tags.
    without_tags = re.sub(r'<[^>]*>', '', text)
    # After that pass no remaining "<" has a ">" behind it, so a tag opener
    # (letter, "/", "!" or "?" right after "<") runs to the end of the text.
    return re.sub(r'<[A-Za-z/!?][^>]*\Z', '', without_tags)
def sanitize_input(text: Optional[str], strict: bool = False) -> Optional[str]:
    """
    Clean a piece of user input.

    Args:
        text: Input text; ``None`` is returned as ``None``.
        strict: When true, the text is passed to strip_tags, which removes
            all HTML tags; otherwise it is passed to sanitize_html.

    Returns:
        The cleaned text.
    """
    if text is None:
        return None
    # Select the cleaner for the requested mode, then apply it once.
    cleaner = strip_tags if strict else sanitize_html
    return cleaner(text)