""" 输入清理和XSS防护工具 """ import re import html from typing import Optional # 危险的HTML标签和属性 DANGEROUS_TAGS = [ 'script', 'iframe', 'object', 'embed', 'form', 'input', 'textarea', 'button', 'select', 'style', 'link', 'meta', 'base', 'applet', 'frame', 'frameset', 'layer', 'ilayer', 'bgsound', 'xml', 'blink', 'marquee' ] DANGEROUS_ATTRS = [ 'onclick', 'ondblclick', 'onmousedown', 'onmouseup', 'onmouseover', 'onmousemove', 'onmouseout', 'onkeypress', 'onkeydown', 'onkeyup', 'onload', 'onerror', 'onabort', 'onblur', 'onchange', 'onfocus', 'onreset', 'onsubmit', 'onunload', 'onbeforeunload', 'onresize', 'onscroll', 'ondrag', 'ondragend', 'ondragenter', 'ondragleave', 'ondragover', 'ondragstart', 'ondrop', 'onmousewheel', 'onwheel', 'oncopy', 'oncut', 'onpaste', 'oncontextmenu', 'oninput', 'oninvalid', 'onsearch', 'onselect', 'ontoggle', 'formaction', 'xlink:href' ] def sanitize_html(text: Optional[str]) -> Optional[str]: """ 清理HTML内容,移除危险标签和属性 Args: text: 输入文本 Returns: 清理后的文本 """ if text is None: return None if not isinstance(text, str): return text result = text # 移除危险标签 for tag in DANGEROUS_TAGS: # 移除开标签 pattern = re.compile(rf'<{tag}[^>]*>', re.IGNORECASE) result = pattern.sub('', result) # 移除闭标签 pattern = re.compile(rf'', re.IGNORECASE) result = pattern.sub('', result) # 移除危险属性 for attr in DANGEROUS_ATTRS: pattern = re.compile(rf'\s*{attr}\s*=\s*["\'][^"\']*["\']', re.IGNORECASE) result = pattern.sub('', result) # 也处理没有引号的情况 pattern = re.compile(rf'\s*{attr}\s*=\s*\S+', re.IGNORECASE) result = pattern.sub('', result) # 移除 javascript: 协议 pattern = re.compile(r'javascript\s*:', re.IGNORECASE) result = pattern.sub('', result) # 移除 data: 协议(可能包含恶意代码) pattern = re.compile(r'data\s*:\s*text/html', re.IGNORECASE) result = pattern.sub('', result) # 移除 vbscript: 协议 pattern = re.compile(r'vbscript\s*:', re.IGNORECASE) result = pattern.sub('', result) return result def escape_html(text: Optional[str]) -> Optional[str]: """ 转义HTML特殊字符 Args: text: 输入文本 Returns: 转义后的文本 """ if text is None: return None if not isinstance(text, str): return text return html.escape(text, quote=True) def strip_tags(text: Optional[str]) -> Optional[str]: """ 完全移除所有HTML标签 Args: text: 输入文本 Returns: 移除标签后的纯文本 """ if text is None: return None if not isinstance(text, str): return text # 移除所有HTML标签 clean = re.compile('<[^>]*>') return clean.sub('', text) def sanitize_input(text: Optional[str], strict: bool = False) -> Optional[str]: """ 清理用户输入 Args: text: 输入文本 strict: 是否使用严格模式(完全移除所有HTML标签) Returns: 清理后的文本 """ if text is None: return None if strict: return strip_tags(text) else: return sanitize_html(text)