lty/qy_lty/aiapp/audio/HuoshanAudioService.py
2026-03-17 13:17:02 +08:00

166 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
import base64
import uuid
import logging
import datetime
import os
import oss2 # 新增导入
from .BaseAudioService import BaseAudioService
from django.conf import settings
from .AliyunAudioService import AliyunAudioService # 导入AliyunAudioService
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class HuoshanAudioService(BaseAudioService):
    """Audio service that synthesizes speech through the Huoshan TTS HTTP API.

    Synthesized audio is either returned as raw bytes or uploaded to Aliyun
    OSS. Speech recognition is not done by Huoshan here: it is delegated to
    an ``AliyunAudioService`` built from the ``aliyun_asr`` config section.
    """

    # Seconds to wait on the TTS endpoint when the config gives no override.
    DEFAULT_REQUEST_TIMEOUT = 30

    def __init__(self, config):
        """Build the service from a configuration mapping.

        Args:
            config: Mapping with the required keys ``appid``,
                ``access_token`` and ``cluster``. Optional keys:
                ``voice_type``, ``request_timeout`` (seconds) and the
                ``aliyun_asr`` section. When ``aliyun_asr`` is present it
                must contain ``api_key``, ``api_secret``, ``app_id`` and the
                ``oss_*`` keys read below.

        Raises:
            KeyError: If a required top-level key is missing, or if the
                ``aliyun_asr`` section is present but incomplete.
        """
        # The base-class constructor is given two placeholder arguments;
        # their meaning is defined by BaseAudioService (not visible here).
        super().__init__("-", "-")
        self.config = config
        self.appid = config['appid']
        self.access_token = config['access_token']
        self.cluster = config['cluster']
        self.host = "openspeech.bytedance.com"
        self.voice_type = config.get('voice_type', 'S_PHQ1AVPl1')
        self.request_timeout = config.get(
            'request_timeout', self.DEFAULT_REQUEST_TIMEOUT
        )

        # The OSS settings live inside the 'aliyun_asr' section, so that
        # section must be checked *before* it is indexed; otherwise a missing
        # section raises KeyError and the fallback branch can never run.
        if 'aliyun_asr' in config:
            asr_config = config['aliyun_asr']
            # OSS configuration
            self.oss_key_id = asr_config['oss_key_id']
            self.oss_key_secret = asr_config['oss_key_secret']
            self.oss_bucket = asr_config['oss_bucket']
            self.oss_endpoint = asr_config['oss_endpoint']
            self.oss_host = asr_config['oss_host']
            self.oss_audio_base_dir = asr_config['oss_audio_base_dir']
            # Aliyun speech service instance used for speech recognition
            aliyun_config = {
                'api_key': asr_config['api_key'],
                'api_secret': asr_config['api_secret'],
                'app_id': asr_config['app_id'],
                'oss_key_id': self.oss_key_id,
                'oss_key_secret': self.oss_key_secret,
                'oss_bucket': self.oss_bucket,
                'oss_endpoint': self.oss_endpoint,
                'oss_host': self.oss_host,
                'oss_audio_base_dir': self.oss_audio_base_dir,
            }
            self.aliyun_service = AliyunAudioService(aliyun_config)
            logger.info("已初始化阿里云语音服务用于语音识别")
        else:
            self.oss_key_id = None
            self.oss_key_secret = None
            self.oss_bucket = None
            self.oss_endpoint = None
            self.oss_host = None
            self.oss_audio_base_dir = None
            self.aliyun_service = None
            logger.warning("阿里云语音识别配置未提供,语音识别功能可能不可用")

    def get_token(self):
        """Return the configured access token.

        The token from the config is used directly for authentication, so no
        separate token request is made.
        """
        return self.access_token

    def _oss_configured(self):
        """Return True when every setting needed for an OSS upload is set."""
        required = (
            self.oss_key_id,
            self.oss_key_secret,
            self.oss_bucket,
            self.oss_endpoint,
            self.oss_host,
            self.oss_audio_base_dir,
        )
        return all(value is not None for value in required)

    def _post_tts_request(self, text):
        """Send one synthesis request and return the decoded JSON response.

        Args:
            text: Plain text to synthesize.

        Returns:
            The parsed JSON body of the HTTP response.

        Raises:
            Exception: Whatever ``requests`` raises on a network failure or
                timeout, or on a response body that is not valid JSON.
        """
        api_url = f"https://{self.host}/api/v1/tts"
        # NOTE(review): the "Bearer;<token>" form (semicolon, no space) is
        # kept exactly as written; it appears to be the format this API
        # expects -- confirm against the vendor docs before "fixing" it.
        header = {"Authorization": f"Bearer;{self.access_token}"}
        request_json = {
            "app": {
                "appid": self.appid,
                # NOTE(review): sent as the literal string "access_token",
                # not self.access_token; the real credential travels in the
                # Authorization header. Presumably a placeholder -- confirm.
                "token": "access_token",
                "cluster": self.cluster,
            },
            "user": {
                # Random 32-character hex id (a UUID4 without dashes).
                "uid": uuid.uuid4().hex,
            },
            "audio": {
                "voice_type": self.voice_type,
                "encoding": "mp3",
                "speed_ratio": 1.0,
                "volume_ratio": 1.0,
                "pitch_ratio": 1.0,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": text,
                "text_type": "plain",
                "operation": "query",
                "with_frontend": 1,
                "frontend_type": "unitTson",
            },
        }
        # A timeout is mandatory: without it a stalled endpoint would block
        # the calling thread forever.
        resp = requests.post(
            api_url,
            data=json.dumps(request_json),
            headers=header,
            timeout=self.request_timeout,
        )
        return resp.json()

    def synthesize_speech(self, text, language='zh'):
        """Synthesize ``text``, upload the MP3 to OSS and return its URL.

        Args:
            text: Plain text to synthesize.
            language: Accepted for interface compatibility; this
                implementation does not use it.

        Returns:
            ``"{oss_host}/{oss_audio_base_dir}/{YYYYMMDD}/{uuid}.mp3"`` on
            success, or ``None`` when OSS is not configured, the response
            carries no ``data`` field, or any error occurs (errors are
            logged, not raised).
        """
        if not self._oss_configured():
            # Checked up front so no TTS request is spent on audio that
            # could not be stored anyway.
            logger.error(
                "Error in Huoshan TTS: OSS storage is not configured, "
                "cannot upload synthesized audio"
            )
            return None
        try:
            resp_json = self._post_tts_request(text)
            if "data" not in resp_json:
                logger.error("Error in Huoshan TTS response: %s", resp_json)
                return None
            audio_data = base64.b64decode(resp_json["data"])
            # Build the file name and OSS object key; objects are grouped
            # into one folder per (local-time) day.
            mp3_name = f"{uuid.uuid4()}.mp3"
            today = datetime.datetime.now().strftime('%Y%m%d')
            oss_key = f"{self.oss_audio_base_dir}/{today}/{mp3_name}"
            # Upload to OSS
            auth = oss2.Auth(self.oss_key_id, self.oss_key_secret)
            bucket = oss2.Bucket(auth, self.oss_endpoint, self.oss_bucket)
            bucket.put_object(oss_key, audio_data)
            # Return the URL of the object on OSS
            return f"{self.oss_host}/{oss_key}"
        except Exception as e:
            # Deliberate best-effort boundary: callers get None, the log
            # gets the message plus the traceback.
            logger.exception("Error in Huoshan TTS: %s", e)
            return None

    def synthesize_speech_raw(self, text, language='zh'):
        """Synthesize ``text`` and return the raw MP3 bytes.

        Nothing is stored and no URL is created.

        Args:
            text: Plain text to synthesize.
            language: Accepted for interface compatibility; this
                implementation does not use it.

        Returns:
            The decoded audio bytes, or ``None`` when the response carries
            no ``data`` field or any error occurs (errors are logged, not
            raised).
        """
        try:
            logger.info('Synthesizing speech with raw data return (Huoshan)')
            resp_json = self._post_tts_request(text)
            if "data" not in resp_json:
                logger.error(
                    "Error in Huoshan TTS raw response: %s", resp_json
                )
                return None
            return base64.b64decode(resp_json["data"])
        except Exception as e:
            logger.exception("Error in Huoshan TTS raw: %s", e)
            return None

    def recognize_speech(self, audio_data, language='zh') -> str:
        """Recognize speech by delegating to the Aliyun speech service.

        Args:
            audio_data: Audio payload, passed through unchanged to
                ``AliyunAudioService.recognize_speech``.
            language: Language code, passed through unchanged.

        Returns:
            Whatever the Aliyun service returns. Failures are NOT raised:
            when the Aliyun service is not initialized, or the delegate
            raises, a human-readable error message is returned in place of
            the recognized text.
        """
        if not self.aliyun_service:
            logger.error("阿里云语音服务未初始化,无法进行语音识别")
            return "语音识别服务未配置"
        try:
            logger.info("通过阿里云语音服务进行语音识别")
            return self.aliyun_service.recognize_speech(audio_data, language)
        except Exception as e:
            logger.exception("调用阿里云语音识别服务失败: %s", e)
            return f"语音识别异常: {str(e)}"