lty/qy_lty/aiapp/audio/AliyunAudioService.py
2026-03-17 13:17:02 +08:00

219 lines
7.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import json
from .BaseAudioService import BaseAudioService
import oss2
import uuid
import logging
import datetime
import http.client
from django.conf import settings
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class AliyunAudioService(BaseAudioService):
    """Speech synthesis (TTS) and recognition (ASR) backed by Aliyun NLS.

    Access tokens come from the ``CreateToken`` action on the NLS meta
    endpoint; audio is exchanged with the ``cn-shanghai`` NLS gateway over
    HTTPS, and synthesized MP3 files can be stored in an OSS bucket.

    ``config`` keys read by this class: ``api_key``, ``api_secret``,
    ``oss_key_id``, ``oss_key_secret``, ``app_id``, ``oss_endpoint``,
    ``oss_bucket``, ``oss_audio_base_dir`` and ``oss_host``.
    """

    NLS_REGION = "cn-shanghai"
    NLS_TOKEN_DOMAIN = 'nls-meta.cn-shanghai.aliyuncs.com'
    NLS_TOKEN_API_VERSION = '2019-02-28'
    NLS_GATEWAY_HOST = 'nls-gateway-cn-shanghai.aliyuncs.com'
    TTS_URL = 'https://' + NLS_GATEWAY_HOST + '/stream/v1/tts'
    ASR_URL = 'https://' + NLS_GATEWAY_HOST + '/stream/v1/asr'
    # Content-Type the TTS endpoint answers with when it returns audio.
    AUDIO_CONTENT_TYPE = 'audio/mpeg'
    # Value of the ``status`` field that the ASR response uses for success.
    ASR_SUCCESS_STATUS = 20000000

    def __init__(self, config):
        """Store ``config`` and create the SDK client used to fetch tokens.

        Args:
            config (dict): Service settings; see the class docstring for the
                keys that are read.
        """
        super().__init__(config['api_key'], config['api_secret'])
        self.config = config
        self.oss_key_id = config['oss_key_id']
        self.oss_key_secret = config['oss_key_secret']
        # SDK client used only for the CreateToken call.
        self.client = AcsClient(
            self.config['api_key'],
            self.config['api_secret'],
            self.NLS_REGION,
        )

    def get_token(self):
        """Request a fresh NLS access token.

        Returns:
            str | None: The token id, or ``None`` when the request fails or
            the response carries no ``Token.Id``. Failures are logged, not
            raised.
        """
        request = CommonRequest()
        request.set_method('POST')
        request.set_domain(self.NLS_TOKEN_DOMAIN)
        request.set_version(self.NLS_TOKEN_API_VERSION)
        request.set_action_name('CreateToken')
        try:
            response = self.client.do_action_with_exception(request)
            token_info = json.loads(response).get('Token') or {}
            token = token_info.get('Id')
            expire_time = token_info.get('ExpireTime')
        except Exception:
            # Deliberately broad: callers rely on a None return instead of an
            # exception, whatever the SDK or the JSON parsing raised.
            logger.exception('Failed to obtain an NLS token')
            return None
        if not token:
            logger.error('NLS token response did not contain Token.Id')
            return None
        # The token itself is a credential, so only its expiry is logged.
        logger.debug('Obtained NLS token, expireTime = %s', expire_time)
        return token

    def _post_to_gateway(self, url, body, headers):
        """POST to the NLS gateway; return ``(status, reason, content_type, payload)`` and always close the connection."""
        conn = http.client.HTTPSConnection(self.NLS_GATEWAY_HOST)
        try:
            conn.request(method='POST', url=url, body=body, headers=headers)
            response = conn.getresponse()
            return (
                response.status,
                response.reason,
                response.getheader('Content-Type'),
                response.read(),
            )
        finally:
            conn.close()

    def _request_tts(self, text):
        """Send ``text`` to the TTS endpoint; return ``(ok, payload)`` where ``ok`` means MP3 audio came back."""
        token = self.get_token()
        if token is None:
            # Without a token the request cannot succeed; skip the round trip.
            return False, b'no NLS token available'
        body = json.dumps({
            'appkey': self.config['app_id'],
            'token': token,
            'text': text,
            'format': 'mp3',
            'sample_rate': 16000,
        }).encode('utf-8')
        status, reason, content_type, payload = self._post_to_gateway(
            self.TTS_URL, body, {'Content-Type': 'application/json'}
        )
        logger.debug('TTS response: %s %s (%s)', status, reason, content_type)
        return content_type == self.AUDIO_CONTENT_TYPE, payload

    def synthesize_speech(self, text, language='en'):
        """Synthesize ``text`` to MP3, upload it to OSS and return its URL.

        The object key is ``<oss_audio_base_dir>/<YYYYMMDD>/<uuid4>.mp3``.

        Args:
            text (str): The text to convert to speech.
            language (str, optional): Currently unused by this
                implementation. Defaults to 'en'.

        Returns:
            str | None: ``<oss_host>/<object key>`` on success, ``None`` when
            the TTS request did not return audio.
        """
        ok, payload = self._request_tts(text)
        if not ok:
            logger.error('The POST request failed: %s', payload)
            return None

        auth = oss2.Auth(self.config['oss_key_id'], self.config['oss_key_secret'])
        bucket = oss2.Bucket(auth, self.config['oss_endpoint'], self.config['oss_bucket'])
        # Use today's date as a sub-directory and a random name for the file.
        today = datetime.datetime.now().strftime('%Y%m%d')
        mp3_name = f"{uuid.uuid4()}.mp3"
        oss_key = f"{self.config['oss_audio_base_dir']}/{today}/{mp3_name}"
        bucket.put_object(oss_key, payload)
        return '{}/{}'.format(self.config['oss_host'], oss_key)

    def synthesize_speech_raw(self, text, language='en'):
        """Synthesize ``text`` and return the raw MP3 bytes.

        Nothing is stored and no URL is created.

        Args:
            text (str): The text to convert to speech.
            language (str, optional): Currently unused by this
                implementation. Defaults to 'en'.

        Returns:
            bytes | None: Raw audio data, or ``None`` when the TTS request
            did not return audio.
        """
        logger.info('Synthesizing speech with raw data return')
        ok, payload = self._request_tts(text)
        if not ok:
            logger.error('Failed to generate raw audio data: %s', payload)
            return None
        logger.info('Successfully generated raw audio data')
        return payload

    def recognize_speech(self, audio_data, language='en') -> str:
        """Send ``audio_data`` to the ASR endpoint and return the transcript.

        Args:
            audio_data (bytes): Audio payload, posted as
                ``application/octet-stream``.
            language (str, optional): Currently unused by this
                implementation. Defaults to 'en'.

        Returns:
            str: The recognized text on success; ``'识别失败'`` when no token
            is available or the service reports a non-success status;
            ``'识别失败2'`` when the response body is not valid JSON.
        """
        token = self.get_token()
        if token is None:
            # A None header value would make http.client raise TypeError.
            logger.error('Recognizer failed: no NLS token available')
            return '识别失败'

        # NOTE(review): only ``appkey`` is sent in the query string; format and
        # sample rate are presumably left to the service defaults - confirm
        # this matches the audio that callers actually provide.
        request_url = self.ASR_URL + '?appkey=' + self.config['app_id']
        headers = {
            'X-NLS-Token': token,
            'Content-type': 'application/octet-stream',
            'Content-Length': len(audio_data),
        }
        _status, _reason, _content_type, raw = self._post_to_gateway(
            request_url, audio_data, headers
        )

        try:
            payload = json.loads(raw)
        except ValueError:
            logger.error('The response is not json format string')
            return '识别失败2'

        logger.debug('Recognize response is: %s', payload)
        if (
            isinstance(payload, dict)
            and payload.get('status') == self.ASR_SUCCESS_STATUS
            and 'result' in payload
        ):
            result = payload['result']
            logger.info('Recognize result: %s', result)
            return result
        logger.error('Recognizer failed!')
        return '识别失败'
if __name__ == '__main__':
    # Manual smoke test. The service cannot be built without a config dict,
    # so each key is read from an environment variable named
    # ALIYUN_<KEY IN UPPER CASE>, e.g. ALIYUN_API_KEY or ALIYUN_OSS_BUCKET.
    import os
    import sys

    _config_keys = (
        'api_key',
        'api_secret',
        'app_id',
        'oss_key_id',
        'oss_key_secret',
        'oss_endpoint',
        'oss_bucket',
        'oss_audio_base_dir',
        'oss_host',
    )
    _missing = [
        'ALIYUN_' + key.upper()
        for key in _config_keys
        if 'ALIYUN_' + key.upper() not in os.environ
    ]
    if _missing:
        sys.exit('Missing environment variables: ' + ', '.join(_missing))

    audio_ser = AliyunAudioService(
        {key: os.environ['ALIYUN_' + key.upper()] for key in _config_keys}
    )
    print(audio_ser.synthesize_speech('你好,你是谁啊'))