import requests
import json
import base64
import uuid
import logging
import datetime
import os
import oss2  # Aliyun OSS SDK, used to store synthesized audio
from .BaseAudioService import BaseAudioService
from django.conf import settings
from .AliyunAudioService import AliyunAudioService  # delegated ASR backend

logger = logging.getLogger(__name__)

# Network timeout (seconds) for TTS HTTP calls so a stuck request cannot
# hang the caller indefinitely.
_HTTP_TIMEOUT = 30


class HuoshanAudioService(BaseAudioService):
    """Audio service backed by the Huoshan (ByteDance Volcengine) TTS API.

    Text-to-speech is performed against ``openspeech.bytedance.com``; the
    resulting MP3 is either returned as raw bytes or uploaded to Aliyun OSS
    and exposed via URL.  Speech recognition is delegated to
    :class:`AliyunAudioService` when its configuration is present.
    """

    def __init__(self, config):
        """Initialize from a configuration dict.

        Args:
            config: Required keys: ``appid``, ``access_token``, ``cluster``.
                Optional keys: ``voice_type``, and an ``aliyun_asr`` sub-dict
                carrying OSS settings (``oss_key_id``, ``oss_key_secret``,
                ``oss_bucket``, ``oss_endpoint``, ``oss_host``,
                ``oss_audio_base_dir``) plus ASR credentials (``api_key``,
                ``api_secret``, ``app_id``).
        """
        # NOTE(review): BaseAudioService's constructor parameters are not
        # visible from here; the "-" placeholders are preserved from the
        # original code — confirm against the base class.
        super().__init__("-", "-")
        self.config = config
        self.appid = config['appid']
        self.access_token = config['access_token']
        self.cluster = config['cluster']
        self.host = "openspeech.bytedance.com"
        self.voice_type = config.get('voice_type', 'S_PHQ1AVPl1')

        # BUG FIX: the original read config['aliyun_asr'][...] for the OSS
        # settings *before* the `'aliyun_asr' in config` guard, so a missing
        # 'aliyun_asr' key raised KeyError and the warning branch below was
        # unreachable.  All 'aliyun_asr'-dependent setup now sits inside the
        # guard; without it the attributes are set to None.
        asr_config = config.get('aliyun_asr')
        if asr_config:
            # OSS settings (used by synthesize_speech for uploads).
            self.oss_key_id = asr_config['oss_key_id']
            self.oss_key_secret = asr_config['oss_key_secret']
            self.oss_bucket = asr_config['oss_bucket']
            self.oss_endpoint = asr_config['oss_endpoint']
            self.oss_host = asr_config['oss_host']
            self.oss_audio_base_dir = asr_config['oss_audio_base_dir']
            # Aliyun speech service instance used for speech recognition.
            self.aliyun_service = AliyunAudioService({
                'api_key': asr_config['api_key'],
                'api_secret': asr_config['api_secret'],
                'app_id': asr_config['app_id'],
                'oss_key_id': self.oss_key_id,
                'oss_key_secret': self.oss_key_secret,
                'oss_bucket': self.oss_bucket,
                'oss_endpoint': self.oss_endpoint,
                'oss_host': self.oss_host,
                'oss_audio_base_dir': self.oss_audio_base_dir,
            })
            logger.info("已初始化阿里云语音服务用于语音识别")
        else:
            self.oss_key_id = None
            self.oss_key_secret = None
            self.oss_bucket = None
            self.oss_endpoint = None
            self.oss_host = None
            self.oss_audio_base_dir = None
            self.aliyun_service = None
            logger.warning("阿里云语音识别配置未提供,语音识别功能可能不可用")

    def get_token(self):
        """Return the static access token.

        Huoshan authenticates each request via the Authorization header,
        so no separate token exchange is needed.
        """
        return self.access_token

    def _build_tts_payload(self, text):
        """Build the JSON request body for a Huoshan TTS call.

        Shared by synthesize_speech and synthesize_speech_raw so the two
        request payloads cannot drift apart.
        """
        return {
            "app": {
                "appid": self.appid,
                # Literal placeholder per the Volcengine sample code; the
                # real credential travels in the Authorization header.
                "token": "access_token",
                "cluster": self.cluster,
            },
            "user": {
                "uid": str(uuid.uuid4()).replace('-', ''),
            },
            "audio": {
                "voice_type": self.voice_type,
                "encoding": "mp3",
                "speed_ratio": 1.0,
                "volume_ratio": 1.0,
                "pitch_ratio": 1.0,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": text,
                "text_type": "plain",
                "operation": "query",
                "with_frontend": 1,
                "frontend_type": "unitTson",
            },
        }

    def _tts_headers(self):
        """Authorization header; Huoshan expects 'Bearer;<token>' (semicolon)."""
        return {"Authorization": f"Bearer;{self.access_token}"}

    def synthesize_speech(self, text, language='zh'):
        """Synthesize ``text`` to MP3, upload it to OSS, and return its URL.

        Args:
            text: Text to synthesize.
            language: Accepted for interface compatibility; not used by the
                Huoshan request itself.

        Returns:
            Public OSS URL string on success, None on any failure.
        """
        api_url = f"https://{self.host}/api/v1/tts"
        try:
            resp = requests.post(
                api_url,
                json.dumps(self._build_tts_payload(text)),
                headers=self._tts_headers(),
                timeout=_HTTP_TIMEOUT,  # fail fast instead of hanging forever
            )
            resp_json = resp.json()
            if "data" in resp_json:
                # Response body carries base64-encoded MP3 bytes.
                audio_data = base64.b64decode(resp_json["data"])
                # Date-partitioned OSS key with a random name to avoid collisions.
                mp3_name = f"{uuid.uuid4()}.mp3"
                today = datetime.datetime.now().strftime('%Y%m%d')
                oss_key = f"{self.oss_audio_base_dir}/{today}/{mp3_name}"
                auth = oss2.Auth(self.oss_key_id, self.oss_key_secret)
                bucket = oss2.Bucket(auth, self.oss_endpoint, self.oss_bucket)
                bucket.put_object(oss_key, audio_data)
                return f"{self.oss_host}/{oss_key}"
            logger.error(f"Error in Huoshan TTS response: {resp_json}")
            return None
        except Exception as e:
            logger.error(f"Error in Huoshan TTS: {str(e)}")
            return None

    def synthesize_speech_raw(self, text, language='zh'):
        """
        Generate speech from text and return the raw audio data directly
        without storing it or creating a URL.

        Returns:
            Decoded MP3 bytes on success, None on any failure.
        """
        api_url = f"https://{self.host}/api/v1/tts"
        try:
            logger.info('Synthesizing speech with raw data return (Huoshan)')
            resp = requests.post(
                api_url,
                json.dumps(self._build_tts_payload(text)),
                headers=self._tts_headers(),
                timeout=_HTTP_TIMEOUT,  # fail fast instead of hanging forever
            )
            resp_json = resp.json()
            if "data" in resp_json:
                return base64.b64decode(resp_json["data"])
            logger.error(f"Error in Huoshan TTS raw response: {resp_json}")
            return None
        except Exception as e:
            logger.error(f"Error in Huoshan TTS raw: {str(e)}")
            return None

    def recognize_speech(self, audio_data, language='zh') -> str:
        """Delegate speech recognition to the Aliyun speech service.

        Returns the recognized text, or a Chinese-language error string when
        the service is unconfigured or the call fails (never raises).
        """
        if not self.aliyun_service:
            logger.error("阿里云语音服务未初始化,无法进行语音识别")
            return "语音识别服务未配置"
        try:
            logger.info("通过阿里云语音服务进行语音识别")
            return self.aliyun_service.recognize_speech(audio_data, language)
        except Exception as e:
            logger.error(f"调用阿里云语音识别服务失败: {str(e)}")
            return f"语音识别异常: {str(e)}"