# Listing metadata carried over from the code viewer: 166 lines, 6.5 KiB, Python.
import requests
|
||
import json
|
||
import base64
|
||
import uuid
|
||
import logging
|
||
import datetime
|
||
import os
|
||
import oss2 # 新增导入
|
||
from .BaseAudioService import BaseAudioService
|
||
from django.conf import settings
|
||
from .AliyunAudioService import AliyunAudioService # 导入AliyunAudioService
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
class HuoshanAudioService(BaseAudioService):
    """Audio service that synthesizes speech through the Huoshan TTS HTTP API.

    Text-to-speech requests are sent to ``https://<host>/api/v1/tts``.
    Speech recognition is not implemented here; it is delegated to an
    ``AliyunAudioService`` built from the optional ``aliyun_asr`` section of
    the config. The same section carries the OSS settings used to store
    synthesized audio.
    """

    # Default timeout (seconds) for TTS HTTP requests. Without a timeout a
    # stalled connection would block the caller forever. Override with the
    # optional ``request_timeout`` config key.
    _DEFAULT_REQUEST_TIMEOUT = 30

    def __init__(self, config):
        """Read the Huoshan credentials and the optional Aliyun/OSS settings.

        Args:
            config: Mapping with the required keys ``appid``,
                ``access_token`` and ``cluster``. Optional keys:
                ``voice_type``, ``request_timeout`` and ``aliyun_asr``
                (a mapping holding ``api_key``, ``api_secret``, ``app_id``
                and the ``oss_*`` settings).

        Raises:
            KeyError: If a required top-level key is missing, or if
                ``aliyun_asr`` is provided but lacks one of its keys.
        """
        super().__init__("-", "-")
        self.config = config
        self.appid = config['appid']
        self.access_token = config['access_token']
        self.cluster = config['cluster']
        self.host = "openspeech.bytedance.com"
        self.voice_type = config.get('voice_type', 'S_PHQ1AVPl1')
        self.request_timeout = config.get(
            'request_timeout', self._DEFAULT_REQUEST_TIMEOUT
        )

        # The original code read config['aliyun_asr'] unconditionally before
        # testing for its presence, so the "not configured" branch below
        # could never run. Look the section up once and branch on it instead.
        aliyun_asr = config.get('aliyun_asr')
        if aliyun_asr:
            # OSS settings (used to store synthesized audio).
            self.oss_key_id = aliyun_asr['oss_key_id']
            self.oss_key_secret = aliyun_asr['oss_key_secret']
            self.oss_bucket = aliyun_asr['oss_bucket']
            self.oss_endpoint = aliyun_asr['oss_endpoint']
            self.oss_host = aliyun_asr['oss_host']
            self.oss_audio_base_dir = aliyun_asr['oss_audio_base_dir']

            # Aliyun service instance used for speech recognition.
            aliyun_config = {
                'api_key': aliyun_asr['api_key'],
                'api_secret': aliyun_asr['api_secret'],
                'app_id': aliyun_asr['app_id'],
                'oss_key_id': self.oss_key_id,
                'oss_key_secret': self.oss_key_secret,
                'oss_bucket': self.oss_bucket,
                'oss_endpoint': self.oss_endpoint,
                'oss_host': self.oss_host,
                'oss_audio_base_dir': self.oss_audio_base_dir,
            }
            self.aliyun_service = AliyunAudioService(aliyun_config)
            logger.info("已初始化阿里云语音服务用于语音识别")
        else:
            # No Aliyun section: recognition and OSS upload are unavailable,
            # but synthesize_speech_raw() still works.
            self.oss_key_id = None
            self.oss_key_secret = None
            self.oss_bucket = None
            self.oss_endpoint = None
            self.oss_host = None
            self.oss_audio_base_dir = None
            self.aliyun_service = None
            logger.warning("阿里云语音识别配置未提供,语音识别功能可能不可用")

    def get_token(self):
        """Return the configured access token.

        Huoshan authenticates directly with the access token, so there is no
        separate token to fetch.
        """
        return self.access_token

    def _build_tts_request(self, text):
        """Build the JSON payload for one TTS request for ``text``."""
        return {
            "app": {
                "appid": self.appid,
                # NOTE(review): the literal string "access_token" is kept
                # exactly as in the original; the real token travels in the
                # Authorization header. Confirm against the API docs before
                # changing.
                "token": "access_token",
                "cluster": self.cluster,
            },
            "user": {
                "uid": str(uuid.uuid4()).replace('-', ''),
            },
            "audio": {
                "voice_type": self.voice_type,
                "encoding": "mp3",
                "speed_ratio": 1.0,
                "volume_ratio": 1.0,
                "pitch_ratio": 1.0,
            },
            "request": {
                "reqid": str(uuid.uuid4()),
                "text": text,
                "text_type": "plain",
                "operation": "query",
                "with_frontend": 1,
                "frontend_type": "unitTson",
            },
        }

    def _post_tts(self, text):
        """POST a TTS request for ``text`` and return the parsed JSON reply.

        Raises whatever ``requests`` raises on network failure or timeout,
        and ``ValueError`` if the response body is not valid JSON.
        """
        api_url = f"https://{self.host}/api/v1/tts"
        # NOTE(review): the "Bearer;" separator (semicolon, no space) is kept
        # exactly as in the original — presumably the format this API
        # expects; confirm before "fixing" it.
        headers = {"Authorization": f"Bearer;{self.access_token}"}
        payload = json.dumps(self._build_tts_request(text))
        resp = requests.post(
            api_url, payload, headers=headers, timeout=self.request_timeout
        )
        return resp.json()

    @staticmethod
    def _extract_audio(resp_json):
        """Return the decoded audio bytes from a TTS reply, or ``None``.

        ``None`` is returned when the reply has no ``data`` field or the
        field is empty, so callers never store or return empty audio.
        """
        data = resp_json.get("data") if isinstance(resp_json, dict) else None
        if not data:
            return None
        return base64.b64decode(data)

    def synthesize_speech(self, text, language='zh'):
        """Synthesize ``text`` to MP3, upload it to OSS and return its URL.

        Args:
            text: Plain text to synthesize.
            language: Accepted for interface compatibility; this
                implementation does not use it.

        Returns:
            The URL ``<oss_host>/<oss_audio_base_dir>/<YYYYMMDD>/<uuid>.mp3``
            on success, or ``None`` on any failure (the error is logged).
        """
        if self.oss_bucket is None:
            # Fail fast with a clear message instead of spending a TTS call
            # and then failing inside oss2.
            logger.error("Error in Huoshan TTS: OSS storage is not configured")
            return None

        # Broad catch is deliberate: callers rely on "None on any failure".
        try:
            resp_json = self._post_tts(text)
            audio_data = self._extract_audio(resp_json)
            if audio_data is None:
                logger.error(f"Error in Huoshan TTS response: {resp_json}")
                return None

            # Build the file name and OSS object key.
            mp3_name = f"{uuid.uuid4()}.mp3"
            today = datetime.datetime.now().strftime('%Y%m%d')
            oss_key = f"{self.oss_audio_base_dir}/{today}/{mp3_name}"

            # Upload to OSS.
            auth = oss2.Auth(self.oss_key_id, self.oss_key_secret)
            bucket = oss2.Bucket(auth, self.oss_endpoint, self.oss_bucket)
            bucket.put_object(oss_key, audio_data)

            # Return the URL of the uploaded object.
            return f"{self.oss_host}/{oss_key}"
        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception("Error in Huoshan TTS: %s", e)
            return None

    def synthesize_speech_raw(self, text, language='zh'):
        """Synthesize ``text`` and return the raw MP3 bytes.

        Nothing is stored and no URL is created.

        Args:
            text: Plain text to synthesize.
            language: Accepted for interface compatibility; this
                implementation does not use it.

        Returns:
            The decoded audio bytes, or ``None`` on any failure (the error
            is logged).
        """
        # Broad catch is deliberate: callers rely on "None on any failure".
        try:
            logger.info('Synthesizing speech with raw data return (Huoshan)')
            resp_json = self._post_tts(text)
            audio_data = self._extract_audio(resp_json)
            if audio_data is None:
                logger.error(f"Error in Huoshan TTS raw response: {resp_json}")
                return None
            return audio_data
        except Exception as e:
            logger.exception("Error in Huoshan TTS raw: %s", e)
            return None

    def recognize_speech(self, audio_data, language='zh') -> str:
        """Recognize speech by delegating to the Aliyun audio service.

        Args:
            audio_data: Audio payload, passed through unchanged to
                ``AliyunAudioService.recognize_speech``.
            language: Language hint, passed through unchanged.

        Returns:
            The delegate's result. If the Aliyun service is not configured
            or the call raises, a human-readable error string is returned
            instead of an exception (existing contract, kept for callers).
        """
        if not self.aliyun_service:
            logger.error("阿里云语音服务未初始化,无法进行语音识别")
            return "语音识别服务未配置"

        try:
            logger.info("通过阿里云语音服务进行语音识别")
            return self.aliyun_service.recognize_speech(audio_data, language)
        except Exception as e:
            logger.exception("调用阿里云语音识别服务失败: %s", e)
            return f"语音识别异常: {str(e)}"