Files
000-platform/backend/app/services/alert.py
111 6c6c48cf71
All checks were successful
continuous-integration/drone/push Build is passing
feat: 新增告警、成本、配额、微信模块及缓存服务
- 新增告警模块 (alerts): 告警规则配置与触发
- 新增成本管理模块 (cost): 成本统计与分析
- 新增配额模块 (quota): 配额管理与限制
- 新增微信模块 (wechat): 微信相关功能接口
- 新增缓存服务 (cache): Redis 缓存封装
- 新增请求日志中间件 (request_logger)
- 新增异常处理和链路追踪中间件
- 更新 dashboard 前端展示
- 更新 SDK stats_client 功能
2026-01-24 16:53:47 +08:00

456 lines
17 KiB
Python

"""告警服务"""
import logging
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
import httpx
from sqlalchemy.orm import Session
from sqlalchemy import func
from ..models.alert import AlertRule, AlertRecord, NotificationChannel
from ..models.stats import AICallEvent
from ..models.logs import PlatformLog
from .cache import get_cache
logger = logging.getLogger(__name__)
class AlertService:
"""告警服务
提供告警规则检测、告警记录管理、通知发送等功能
"""
def __init__(self, db: Session):
self.db = db
self._cache = get_cache()
async def check_all_rules(self) -> List[AlertRecord]:
"""检查所有启用的告警规则
Returns:
触发的告警记录列表
"""
rules = self.db.query(AlertRule).filter(AlertRule.status == 1).all()
triggered_alerts = []
for rule in rules:
try:
alert = await self.check_rule(rule)
if alert:
triggered_alerts.append(alert)
except Exception as e:
logger.error(f"Failed to check rule {rule.id}: {e}")
return triggered_alerts
async def check_rule(self, rule: AlertRule) -> Optional[AlertRecord]:
"""检查单个告警规则
Args:
rule: 告警规则
Returns:
触发的告警记录或None
"""
# 检查冷却期
if self._is_in_cooldown(rule):
logger.debug(f"Rule {rule.id} is in cooldown")
return None
# 检查每日告警次数限制
if self._exceeds_daily_limit(rule):
logger.debug(f"Rule {rule.id} exceeds daily limit")
return None
# 根据规则类型检查
metric_value = None
threshold_value = None
triggered = False
condition = rule.condition or {}
if rule.rule_type == 'error_rate':
triggered, metric_value, threshold_value = self._check_error_rate(rule, condition)
elif rule.rule_type == 'call_count':
triggered, metric_value, threshold_value = self._check_call_count(rule, condition)
elif rule.rule_type == 'token_usage':
triggered, metric_value, threshold_value = self._check_token_usage(rule, condition)
elif rule.rule_type == 'cost_threshold':
triggered, metric_value, threshold_value = self._check_cost_threshold(rule, condition)
elif rule.rule_type == 'latency':
triggered, metric_value, threshold_value = self._check_latency(rule, condition)
if triggered:
alert = self._create_alert_record(rule, metric_value, threshold_value)
return alert
return None
def _is_in_cooldown(self, rule: AlertRule) -> bool:
"""检查规则是否在冷却期"""
cache_key = f"alert:cooldown:{rule.id}"
return self._cache.exists(cache_key)
def _set_cooldown(self, rule: AlertRule):
"""设置规则冷却期"""
cache_key = f"alert:cooldown:{rule.id}"
self._cache.set(cache_key, "1", ttl=rule.cooldown_minutes * 60)
def _exceeds_daily_limit(self, rule: AlertRule) -> bool:
"""检查是否超过每日告警次数限制"""
today = datetime.now().date()
count = self.db.query(func.count(AlertRecord.id)).filter(
AlertRecord.rule_id == rule.id,
func.date(AlertRecord.created_at) == today
).scalar()
return count >= rule.max_alerts_per_day
def _check_error_rate(self, rule: AlertRule, condition: dict) -> tuple:
"""检查错误率"""
window_minutes = self._parse_window(condition.get('window', '5m'))
threshold = condition.get('threshold', 10) # 错误次数阈值
operator = condition.get('operator', '>')
since = datetime.now() - timedelta(minutes=window_minutes)
query = self.db.query(func.count(AICallEvent.id)).filter(
AICallEvent.created_at >= since,
AICallEvent.status == 'error'
)
# 应用作用范围
if rule.scope_type == 'tenant' and rule.scope_value:
query = query.filter(AICallEvent.tenant_id == rule.scope_value)
elif rule.scope_type == 'app' and rule.scope_value:
query = query.filter(AICallEvent.app_code == rule.scope_value)
error_count = query.scalar() or 0
triggered = self._compare(error_count, threshold, operator)
return triggered, str(error_count), str(threshold)
def _check_call_count(self, rule: AlertRule, condition: dict) -> tuple:
"""检查调用次数"""
window_minutes = self._parse_window(condition.get('window', '1h'))
threshold = condition.get('threshold', 1000)
operator = condition.get('operator', '>')
since = datetime.now() - timedelta(minutes=window_minutes)
query = self.db.query(func.count(AICallEvent.id)).filter(
AICallEvent.created_at >= since
)
if rule.scope_type == 'tenant' and rule.scope_value:
query = query.filter(AICallEvent.tenant_id == rule.scope_value)
elif rule.scope_type == 'app' and rule.scope_value:
query = query.filter(AICallEvent.app_code == rule.scope_value)
call_count = query.scalar() or 0
triggered = self._compare(call_count, threshold, operator)
return triggered, str(call_count), str(threshold)
def _check_token_usage(self, rule: AlertRule, condition: dict) -> tuple:
"""检查Token使用量"""
window_minutes = self._parse_window(condition.get('window', '1d'))
threshold = condition.get('threshold', 100000)
operator = condition.get('operator', '>')
since = datetime.now() - timedelta(minutes=window_minutes)
query = self.db.query(
func.coalesce(func.sum(AICallEvent.input_tokens + AICallEvent.output_tokens), 0)
).filter(
AICallEvent.created_at >= since
)
if rule.scope_type == 'tenant' and rule.scope_value:
query = query.filter(AICallEvent.tenant_id == rule.scope_value)
elif rule.scope_type == 'app' and rule.scope_value:
query = query.filter(AICallEvent.app_code == rule.scope_value)
token_usage = query.scalar() or 0
triggered = self._compare(token_usage, threshold, operator)
return triggered, str(token_usage), str(threshold)
def _check_cost_threshold(self, rule: AlertRule, condition: dict) -> tuple:
"""检查费用阈值"""
window_minutes = self._parse_window(condition.get('window', '1d'))
threshold = condition.get('threshold', 100) # 费用阈值(元)
operator = condition.get('operator', '>')
since = datetime.now() - timedelta(minutes=window_minutes)
query = self.db.query(
func.coalesce(func.sum(AICallEvent.cost), 0)
).filter(
AICallEvent.created_at >= since
)
if rule.scope_type == 'tenant' and rule.scope_value:
query = query.filter(AICallEvent.tenant_id == rule.scope_value)
elif rule.scope_type == 'app' and rule.scope_value:
query = query.filter(AICallEvent.app_code == rule.scope_value)
total_cost = float(query.scalar() or 0)
triggered = self._compare(total_cost, threshold, operator)
return triggered, f"¥{total_cost:.2f}", f"¥{threshold:.2f}"
def _check_latency(self, rule: AlertRule, condition: dict) -> tuple:
"""检查延迟"""
window_minutes = self._parse_window(condition.get('window', '5m'))
threshold = condition.get('threshold', 5000) # 延迟阈值(ms)
operator = condition.get('operator', '>')
percentile = condition.get('percentile', 'avg') # avg, p95, p99, max
since = datetime.now() - timedelta(minutes=window_minutes)
query = self.db.query(AICallEvent.latency_ms).filter(
AICallEvent.created_at >= since,
AICallEvent.latency_ms.isnot(None)
)
if rule.scope_type == 'tenant' and rule.scope_value:
query = query.filter(AICallEvent.tenant_id == rule.scope_value)
elif rule.scope_type == 'app' and rule.scope_value:
query = query.filter(AICallEvent.app_code == rule.scope_value)
latencies = [r.latency_ms for r in query.all()]
if not latencies:
return False, "0", str(threshold)
if percentile == 'avg':
metric = sum(latencies) / len(latencies)
elif percentile == 'max':
metric = max(latencies)
elif percentile == 'p95':
latencies.sort()
idx = int(len(latencies) * 0.95)
metric = latencies[idx] if idx < len(latencies) else latencies[-1]
elif percentile == 'p99':
latencies.sort()
idx = int(len(latencies) * 0.99)
metric = latencies[idx] if idx < len(latencies) else latencies[-1]
else:
metric = sum(latencies) / len(latencies)
triggered = self._compare(metric, threshold, operator)
return triggered, f"{metric:.0f}ms", f"{threshold}ms"
def _parse_window(self, window: str) -> int:
"""解析时间窗口字符串为分钟数"""
if window.endswith('m'):
return int(window[:-1])
elif window.endswith('h'):
return int(window[:-1]) * 60
elif window.endswith('d'):
return int(window[:-1]) * 60 * 24
else:
return int(window)
def _compare(self, value: float, threshold: float, operator: str) -> bool:
"""比较值与阈值"""
if operator == '>':
return value > threshold
elif operator == '>=':
return value >= threshold
elif operator == '<':
return value < threshold
elif operator == '<=':
return value <= threshold
elif operator == '==':
return value == threshold
elif operator == '!=':
return value != threshold
return False
def _create_alert_record(
self,
rule: AlertRule,
metric_value: str,
threshold_value: str
) -> AlertRecord:
"""创建告警记录"""
title = f"[{rule.priority.upper()}] {rule.name}"
message = f"规则 '{rule.name}' 触发告警\n当前值: {metric_value}\n阈值: {threshold_value}"
if rule.scope_type == 'tenant':
message += f"\n租户: {rule.scope_value}"
elif rule.scope_type == 'app':
message += f"\n应用: {rule.scope_value}"
alert = AlertRecord(
rule_id=rule.id,
rule_name=rule.name,
alert_type=rule.rule_type,
severity=self._priority_to_severity(rule.priority),
title=title,
message=message,
tenant_id=rule.scope_value if rule.scope_type == 'tenant' else None,
app_code=rule.scope_value if rule.scope_type == 'app' else None,
metric_value=metric_value,
threshold_value=threshold_value,
notification_status='pending'
)
self.db.add(alert)
self.db.commit()
self.db.refresh(alert)
# 设置冷却期
self._set_cooldown(rule)
logger.info(f"Alert triggered: {title}")
return alert
def _priority_to_severity(self, priority: str) -> str:
"""将优先级转换为严重程度"""
mapping = {
'low': 'info',
'medium': 'warning',
'high': 'error',
'critical': 'critical'
}
return mapping.get(priority, 'warning')
async def send_notification(self, alert: AlertRecord, rule: AlertRule) -> bool:
"""发送告警通知
Args:
alert: 告警记录
rule: 告警规则
Returns:
是否发送成功
"""
if not rule.notification_channels:
alert.notification_status = 'skipped'
self.db.commit()
return True
results = []
success = True
for channel_config in rule.notification_channels:
try:
result = await self._send_to_channel(channel_config, alert)
results.append(result)
if not result.get('success'):
success = False
except Exception as e:
logger.error(f"Failed to send notification: {e}")
results.append({'success': False, 'error': str(e)})
success = False
alert.notification_status = 'sent' if success else 'failed'
alert.notification_result = results
alert.notified_at = datetime.now()
self.db.commit()
return success
async def _send_to_channel(self, channel_config: dict, alert: AlertRecord) -> dict:
"""发送到指定渠道"""
channel_type = channel_config.get('type')
if channel_type == 'wechat_bot':
return await self._send_wechat_bot(channel_config, alert)
elif channel_type == 'webhook':
return await self._send_webhook(channel_config, alert)
else:
return {'success': False, 'error': f'Unsupported channel type: {channel_type}'}
async def _send_wechat_bot(self, config: dict, alert: AlertRecord) -> dict:
"""发送到企微机器人"""
webhook = config.get('webhook')
if not webhook:
return {'success': False, 'error': 'Missing webhook URL'}
# 构建消息
content = f"**{alert.title}**\n\n{alert.message}\n\n时间: {alert.created_at}"
payload = {
"msgtype": "markdown",
"markdown": {
"content": content
}
}
try:
async with httpx.AsyncClient(timeout=10) as client:
response = await client.post(webhook, json=payload)
result = response.json()
if result.get('errcode', 0) == 0:
return {'success': True, 'channel': 'wechat_bot'}
else:
return {'success': False, 'error': result.get('errmsg')}
except Exception as e:
return {'success': False, 'error': str(e)}
async def _send_webhook(self, config: dict, alert: AlertRecord) -> dict:
"""发送到Webhook"""
url = config.get('url')
if not url:
return {'success': False, 'error': 'Missing webhook URL'}
payload = {
"alert_id": alert.id,
"title": alert.title,
"message": alert.message,
"severity": alert.severity,
"alert_type": alert.alert_type,
"metric_value": alert.metric_value,
"threshold_value": alert.threshold_value,
"created_at": alert.created_at.isoformat()
}
headers = config.get('headers', {})
method = config.get('method', 'POST')
try:
async with httpx.AsyncClient(timeout=10) as client:
if method.upper() == 'POST':
response = await client.post(url, json=payload, headers=headers)
else:
response = await client.get(url, params=payload, headers=headers)
if response.status_code < 400:
return {'success': True, 'channel': 'webhook', 'status': response.status_code}
else:
return {'success': False, 'error': f'HTTP {response.status_code}'}
except Exception as e:
return {'success': False, 'error': str(e)}
def acknowledge_alert(self, alert_id: int, acknowledged_by: str) -> Optional[AlertRecord]:
"""确认告警"""
alert = self.db.query(AlertRecord).filter(AlertRecord.id == alert_id).first()
if not alert:
return None
alert.status = 'acknowledged'
alert.acknowledged_by = acknowledged_by
alert.acknowledged_at = datetime.now()
self.db.commit()
return alert
def resolve_alert(self, alert_id: int) -> Optional[AlertRecord]:
"""解决告警"""
alert = self.db.query(AlertRecord).filter(AlertRecord.id == alert_id).first()
if not alert:
return None
alert.status = 'resolved'
alert.resolved_at = datetime.now()
self.db.commit()
return alert