""" 语音识别服务 支持多种语音识别引擎: 1. 阿里云语音识别 2. 讯飞语音识别 3. 本地 Whisper 模型 """ import os import base64 import json import hmac import hashlib import time from datetime import datetime from typing import Optional, Dict, Any import httpx from urllib.parse import urlencode class SpeechRecognitionError(Exception): """语音识别错误""" pass class AliyunSpeechRecognition: """ 阿里云智能语音交互 - 一句话识别 文档: https://help.aliyun.com/document_detail/92131.html """ def __init__( self, access_key_id: Optional[str] = None, access_key_secret: Optional[str] = None, app_key: Optional[str] = None, ): self.access_key_id = access_key_id or os.getenv("ALIYUN_ACCESS_KEY_ID") self.access_key_secret = access_key_secret or os.getenv("ALIYUN_ACCESS_KEY_SECRET") self.app_key = app_key or os.getenv("ALIYUN_NLS_APP_KEY") self.api_url = "https://nls-gateway-cn-shanghai.aliyuncs.com/stream/v1/asr" def _create_signature(self, params: Dict[str, str]) -> str: """创建签名""" sorted_params = sorted(params.items()) query_string = urlencode(sorted_params) string_to_sign = f"POST&%2F&{urlencode({query_string: ''}).split('=')[0]}" signature = hmac.new( (self.access_key_secret + "&").encode("utf-8"), string_to_sign.encode("utf-8"), hashlib.sha1, ).digest() return base64.b64encode(signature).decode("utf-8") async def recognize( self, audio_data: bytes, format: str = "wav", sample_rate: int = 16000, ) -> str: """ 识别音频 Args: audio_data: 音频数据(二进制) format: 音频格式,支持 pcm, wav, ogg, opus, mp3 sample_rate: 采样率,默认 16000 Returns: 识别出的文本 """ if not all([self.access_key_id, self.access_key_secret, self.app_key]): raise SpeechRecognitionError("阿里云语音识别配置不完整") headers = { "Content-Type": f"audio/{format}; samplerate={sample_rate}", "X-NLS-Token": await self._get_token(), } params = { "appkey": self.app_key, "format": format, "sample_rate": str(sample_rate), } try: async with httpx.AsyncClient() as client: response = await client.post( self.api_url, params=params, headers=headers, content=audio_data, timeout=30.0, ) if response.status_code != 200: raise SpeechRecognitionError( f"阿里云语音识别请求失败: {response.status_code}" ) result = response.json() if result.get("status") == 20000000: return result.get("result", "") else: raise SpeechRecognitionError( f"语音识别失败: {result.get('message', '未知错误')}" ) except httpx.RequestError as e: raise SpeechRecognitionError(f"网络请求错误: {str(e)}") async def _get_token(self) -> str: """获取访问令牌""" # 简化版:实际生产环境需要缓存 token token_url = "https://nls-meta.cn-shanghai.aliyuncs.com/" timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") params = { "AccessKeyId": self.access_key_id, "Action": "CreateToken", "Format": "JSON", "RegionId": "cn-shanghai", "SignatureMethod": "HMAC-SHA1", "SignatureNonce": str(int(time.time() * 1000)), "SignatureVersion": "1.0", "Timestamp": timestamp, "Version": "2019-02-28", } params["Signature"] = self._create_signature(params) async with httpx.AsyncClient() as client: response = await client.get(token_url, params=params, timeout=10.0) result = response.json() if "Token" in result: return result["Token"]["Id"] else: raise SpeechRecognitionError( f"获取阿里云语音识别 Token 失败: {result.get('Message', '未知错误')}" ) class XunfeiSpeechRecognition: """ 讯飞语音识别 文档: https://www.xfyun.cn/doc/asr/voicedictation/API.html """ def __init__( self, app_id: Optional[str] = None, api_key: Optional[str] = None, api_secret: Optional[str] = None, ): self.app_id = app_id or os.getenv("XUNFEI_APP_ID") self.api_key = api_key or os.getenv("XUNFEI_API_KEY") self.api_secret = api_secret or os.getenv("XUNFEI_API_SECRET") self.api_url = "wss://iat-api.xfyun.cn/v2/iat" async def recognize( self, audio_data: bytes, format: str = "audio/L16;rate=16000", ) -> str: """ 识别音频 Args: audio_data: 音频数据(二进制) format: 音频格式 Returns: 识别出的文本 """ if not all([self.app_id, self.api_key, self.api_secret]): raise SpeechRecognitionError("讯飞语音识别配置不完整") # 讯飞使用 WebSocket,这里是简化实现 # 实际需要使用 websockets 库进行实时流式识别 raise NotImplementedError("讯飞语音识别需要 WebSocket 实现") class SimpleSpeechRecognition: """ 简易语音识别实现 使用浏览器 Web Speech API 的结果直接返回 用于前端已经完成识别的情况 """ async def recognize(self, text: str) -> str: """直接返回前端传来的识别结果""" return text.strip() class SpeechRecognitionService: """ 语音识别服务统一接口 根据配置选择不同的识别引擎 """ def __init__(self, engine: str = "simple"): """ 初始化语音识别服务 Args: engine: 识别引擎,支持 aliyun, xunfei, simple """ self.engine = engine if engine == "aliyun": self._recognizer = AliyunSpeechRecognition() elif engine == "xunfei": self._recognizer = XunfeiSpeechRecognition() else: self._recognizer = SimpleSpeechRecognition() async def recognize_audio( self, audio_data: bytes, format: str = "wav", sample_rate: int = 16000, ) -> str: """ 识别音频数据 Args: audio_data: 音频二进制数据 format: 音频格式 sample_rate: 采样率 Returns: 识别出的文本 """ if self.engine == "simple": raise SpeechRecognitionError( "简易模式不支持音频识别,请使用前端 Web Speech API" ) return await self._recognizer.recognize(audio_data, format, sample_rate) async def recognize_text(self, text: str) -> str: """ 直接处理已识别的文本(用于前端已完成识别的情况) Args: text: 已识别的文本 Returns: 处理后的文本 """ return text.strip() # 创建默认服务实例 def get_speech_recognition_service(engine: str = "simple") -> SpeechRecognitionService: """获取语音识别服务实例""" return SpeechRecognitionService(engine=engine)