當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

java实现微软文本转语音（TTS）经验总结

發布時間：2023/12/20 编程问答 25 豆豆

生活随笔收集整理的這篇文章主要介紹了 java实现微软文本转语音（TTS）经验总结，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

一、使用背景

公司項目之前一直是采用人工錄音，然而上線一段時間之后發現，人工錄音成本太高，而且每周上線的音頻不多，老板發現問題后，甚至把音頻功能裸停了一段時間。直到最近項目要向海外擴展，需要內容做國際化，就想到了用機器合成語音（TTS）。目前語音合成已經相對成熟，做的好的國內有科大訊飛，國外有微軟。既然項目主要面對海外用戶，就決定采用微軟的TTS。（PS：這里不是打廣告，微軟的TTS是真的不錯，自己可以去官網試聽下，雖然無法像人一樣很有感情的朗讀詩歌什么的，但是朗讀新聞資訊類文章還是抑揚頓挫的。）

二、上代碼

使用背景已經啰嗦了一大堆，我覺得讀者還是會關注的，但是作為資深CV碼農，我想你們更關注的還是如何應用，所以還是老規矩，簡簡單單的上代碼。（申請賬號這些就不介紹了）

1.依賴

<!-- Maven coordinates of the Microsoft Cognitive Services Speech client SDK, pinned to version 1.12.1. -->
<dependency><groupId>com.microsoft.cognitiveservices.speech</groupId><artifactId>client-sdk</artifactId><version>1.12.1</version> </dependency>

2.配置常量

/**
 * Constants used by the text-to-speech (TTS) integration: service endpoints,
 * the subscription key placeholder, the requested audio format and the
 * lifetime of the cached access token.
 */
public class TtsConst {

    // ---- Service endpoints -------------------------------------------------

    /** URL of the endpoint that issues access tokens. */
    public static final String ACCESS_TOKEN_URI =
            "https://eastasia.api.cognitive.microsoft.com/sts/v1.0/issuetoken";

    /** URL of the TTS synthesis endpoint. */
    public static final String TTS_SERVICE_URI =
            "https://eastasia.tts.speech.microsoft.com/cognitiveservices/v1";

    // ---- Credentials -------------------------------------------------------

    /** Subscription API key. The value is a placeholder; replace it with your own key. */
    public static final String API_KEY = "你自己的 api key";

    /** Lifetime of the cached access token: 9 minutes, expressed as 9 * 60. */
    public static final Integer ACCESS_TOKEN_EXPIRE_TIME = 9 * 60;

    // ---- Synthesis options -------------------------------------------------

    /**
     * Audio output format requested from the service. The original author
     * reports this one gave the best results; other formats were not evaluated.
     */
    public static final String AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3";

    /** Gender value for a male voice. */
    public static final String MALE = "Male";
}

3.https連接

/**
 * Small factory for {@link HttpsURLConnection} instances.
 */
public class HttpsConnection {

    /**
     * Creates an HTTPS connection object for the given URL. The connection is
     * only created here; callers configure it and call {@code connect()} themselves.
     *
     * @param connectingUrl absolute URL to connect to; must use the https scheme
     * @return the connection object for {@code connectingUrl}
     * @throws java.io.IOException if the URL is malformed or the connection object cannot be created
     * @throws IllegalArgumentException if the URL does not use the https scheme
     */
    public static HttpsURLConnection getHttpsConnection(String connectingUrl) throws java.io.IOException {
        URL url = new URL(connectingUrl);
        // A non-https URL yields a different connection type, so the cast below would
        // fail with an opaque ClassCastException; reject it up front with a clear message.
        if (!"https".equalsIgnoreCase(url.getProtocol())) {
            throw new IllegalArgumentException("Expected an https URL but got: " + connectingUrl);
        }
        return (HttpsURLConnection) url.openConnection();
    }
}

3.授權

@Component @Slf4j public class Authentication {@Resourceprivate RedisCache redisCache;public String genAccessToken() {InputStream inSt;HttpsURLConnection webRequest;try {String accessToken = redisCache.get(RedisKey.KEY_TTS_ACCESS_TOKEN);if (StringUtils.isEmpty(accessToken)) {webRequest = HttpsConnection.getHttpsConnection(TtsConst.ACCESS_TOKEN_URI);webRequest.setDoInput(true);webRequest.setDoOutput(true);webRequest.setConnectTimeout(5000);webRequest.setReadTimeout(5000);webRequest.setRequestMethod("POST");byte[] bytes = new byte[0];webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));webRequest.setRequestProperty("Ocp-Apim-Subscription-Key", TtsConst.API_KEY);webRequest.connect();DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());dop.write(bytes);dop.flush();dop.close();inSt = webRequest.getInputStream();InputStreamReader in = new InputStreamReader(inSt);BufferedReader bufferedReader = new BufferedReader(in);StringBuilder strBuffer = new StringBuilder();String line = null;while ((line = bufferedReader.readLine()) != null) {strBuffer.append(line);}bufferedReader.close();in.close();inSt.close();webRequest.disconnect();accessToken = strBuffer.toString();//設置accessToken的過期時間為9分鐘redisCache.set(RedisKey.KEY_TTS_ACCESS_TOKEN, accessToken, TtsConst.ACCESS_TOKEN_EXPIRE_TIME);log.info("New tts access token {}", accessToken);}return accessToken;} catch (Exception e) {log.error("Generate tts access token failed {}", e.getMessage());}return null;} }

4.字節數組處理

/**
 * A growable byte buffer: bytes are appended with {@code cat} and the
 * accumulated content is retrieved with {@code getArray}.
 */
public class ByteArray {

    /** Backing storage; only the first {@code length} bytes are meaningful. */
    private byte[] data;

    /** Number of valid bytes currently held in {@code data}. */
    private int length;

    /** Creates an empty buffer. */
    public ByteArray() {
        this.length = 0;
        this.data = new byte[0];
    }

    /**
     * Creates a buffer whose content is the given array. The array is used
     * directly as the backing storage, not copied.
     *
     * @param ba initial content
     */
    public ByteArray(byte[] ba) {
        this.data = ba;
        this.length = ba.length;
    }

    /**
     * Appends {@code length} bytes of {@code second}, starting at {@code offset}.
     *
     * @param second source array
     * @param offset index of the first byte to append
     * @param length number of bytes to append
     */
    public void cat(byte[] second, int offset, int length) {
        int newLength = this.length + length;
        byte[] target = data;
        if (newLength > data.length) {
            // Not enough room: allocate twice the larger of the current capacity
            // and the appended size, then carry the existing content over.
            int base = Math.max(data.length, length);
            target = new byte[base << 1];
            System.arraycopy(data, 0, target, 0, this.length);
        }
        System.arraycopy(second, offset, target, this.length, length);
        // Publish the (possibly new) storage only after both copies succeeded.
        data = target;
        this.length = newLength;
    }

    /**
     * Appends the whole of {@code second}.
     *
     * @param second bytes to append
     */
    public void cat(byte[] second) {
        cat(second, 0, second.length);
    }

    /**
     * Returns the accumulated bytes as an array of exactly {@code getLength()}
     * elements. Spare capacity is trimmed first; the returned array is the
     * buffer's own backing storage.
     *
     * @return the buffer content
     */
    public byte[] getArray() {
        if (length != data.length) {
            byte[] trimmed = new byte[length];
            System.arraycopy(data, 0, trimmed, 0, this.length);
            data = trimmed;
        }
        return data;
    }

    /**
     * @return the number of bytes appended so far
     */
    public int getLength() {
        return length;
    }
}

5.創建SSML文件

@Slf4j public class XmlDom {public static String createDom(String locale, String genderName, String voiceName, String textToSynthesize){Document doc = null;Element speak, voice;try {DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();DocumentBuilder builder = dbf.newDocumentBuilder();doc = builder.newDocument();if (doc != null){speak = doc.createElement("speak");speak.setAttribute("version", "1.0");speak.setAttribute("xml:lang", "en-us");voice = doc.createElement("voice");voice.setAttribute("xml:lang", locale);voice.setAttribute("xml:gender", genderName);voice.setAttribute("name", voiceName);voice.appendChild(doc.createTextNode(textToSynthesize));speak.appendChild(voice);doc.appendChild(speak);}} catch (ParserConfigurationException e) {log.error("Create ssml document failed: {}",e.getMessage());return null;}return transformDom(doc);}private static String transformDom(Document doc){StringWriter writer = new StringWriter();try {TransformerFactory tf = TransformerFactory.newInstance();Transformer transformer;transformer = tf.newTransformer();transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");transformer.transform(new DOMSource(doc), new StreamResult(writer));} catch (TransformerException e) {log.error("Transform ssml document failed: {}",e.getMessage());return null;}return writer.getBuffer().toString().replaceAll("\n|\r", "");} }

6.正主來了！TTS服務

@Slf4j @Component public class TtsService {@Resourceprivate Authentication authentication;/*** 合成音頻*/public byte[] genAudioBytes(String textToSynthesize, String locale, String gender, String voiceName) {String accessToken = authentication.genAccessToken();if (StringUtils.isEmpty(accessToken)) {return new byte[0];}try {HttpsURLConnection webRequest = HttpsConnection.getHttpsConnection(TtsConst.TTS_SERVICE_URI);webRequest.setDoInput(true);webRequest.setDoOutput(true);webRequest.setConnectTimeout(5000);webRequest.setReadTimeout(300000);webRequest.setRequestMethod("POST");webRequest.setRequestProperty("Content-Type", "application/ssml+xml");webRequest.setRequestProperty("X-Microsoft-OutputFormat", TtsConst.AUDIO_24KHZ_48KBITRATE_MONO_MP3);webRequest.setRequestProperty("Authorization", "Bearer " + accessToken);webRequest.setRequestProperty("X-Search-AppId", "07D3234E49CE426DAA29772419F436CC");webRequest.setRequestProperty("X-Search-ClientID", "1ECFAE91408841A480F00935DC390962");webRequest.setRequestProperty("User-Agent", "TTSAndroid");webRequest.setRequestProperty("Accept", "*/*");String body = XmlDom.createDom(locale, gender, voiceName, textToSynthesize);if (StringUtils.isEmpty(body)) {return new byte[0];}byte[] bytes = body.getBytes();webRequest.setRequestProperty("content-length", String.valueOf(bytes.length));webRequest.connect();DataOutputStream dop = new DataOutputStream(webRequest.getOutputStream());dop.write(bytes);dop.flush();dop.close();InputStream inSt = webRequest.getInputStream();ByteArray ba = new ByteArray();int rn2 = 0;int bufferLength = 4096;byte[] buf2 = new byte[bufferLength];while ((rn2 = inSt.read(buf2, 0, bufferLength)) > 0) {ba.cat(buf2, 0, rn2);}inSt.close();webRequest.disconnect();return ba.getArray();} catch (Exception e) {log.error("Synthesis tts speech failed {}", e.getMessage());}return null;} }

由于項目中需要將音頻上傳到OSS，所以這里生成的是字節數組，你也可以選擇下載或保存為音頻文件。

三、問題及總結

1.問題

由于項目中需要生成超過10分鐘的音頻，我在調試中發現tts不能生成超過10分鐘的音頻，尷尬了呀，在微軟官網中摸索了半天也沒找到生成超過10分鐘音頻的辦法，放棄了嗎？不可能的。在我感覺到無計可施的時候，我的腦海中蹦出了四個字，那就是“斷點續傳”。我就想能不能通過tts把內容分成兩段分別生成字節數組，然后拼接后再上傳到OSS。說干就干，沒想到最后真的可行。成功那一瞬間的感覺無法言喻呀。不廢話了，嗯，上大媽，哦不是，上代碼。太激動了。

/**
 * Generates audio for Chinese text.
 *
 * <p>Text of at most 2600 characters is synthesized in a single request.
 * Longer text is cut into as many near-equal parts as are needed for every
 * part to stay within that limit; the parts are synthesized in order and
 * their audio bytes are concatenated. For text up to twice the limit this
 * yields the same two halves as a split at {@code length / 2}.
 *
 * @param gender         gender passed to the TTS service
 * @param chapterContent text to synthesize
 * @param locale         locale passed to the TTS service
 * @param voiceName      voice name passed to the TTS service
 * @return the audio bytes; if synthesis of any part yields {@code null} or an
 *         empty array, that result is returned as-is so the caller sees the
 *         same failure signal as for a single request
 */
public byte[] getZHAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
    int maxChars = 2600;
    int total = chapterContent.length();
    if (total <= maxChars) {
        return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
    }

    int parts = (total + maxChars - 1) / maxChars;
    ByteArray joined = new ByteArray();
    int start = 0;
    for (int i = 1; i <= parts; i++) {
        int end = (int) ((long) total * i / parts);
        // Never cut between the two halves of a surrogate pair; move the cut one
        // char back (the following part then becomes one char longer).
        if (end < total
                && Character.isHighSurrogate(chapterContent.charAt(end - 1))
                && Character.isLowSurrogate(chapterContent.charAt(end))) {
            end--;
        }
        byte[] audio = ttsService.genAudioBytes(chapterContent.substring(start, end), locale, gender, voiceName);
        if (audio == null || audio.length == 0) {
            // A failed part would otherwise cause an NPE or silently truncated audio.
            return audio;
        }
        joined.cat(audio);
        start = end;
    }
    return joined.getArray();
}

/**
 * Generates audio for English text.
 *
 * <p>The text is split on single spaces into words. Text of at most 1500
 * words is synthesized in a single request; longer text is synthesized in
 * consecutive groups of at most 1500 words whose audio bytes are
 * concatenated.
 *
 * @param gender         gender passed to the TTS service
 * @param chapterContent text to synthesize
 * @param locale         locale passed to the TTS service
 * @param voiceName      voice name passed to the TTS service
 * @return the audio bytes; if synthesis of any group yields {@code null} or an
 *         empty array, that result is returned as-is so the caller sees the
 *         same failure signal as for a single request
 */
public byte[] getUSAudioBuffer(String gender, String chapterContent, String locale, String voiceName) {
    String[] words = chapterContent.split(" ");
    int maxLength = 1500;
    if (words.length <= maxLength) {
        return ttsService.genAudioBytes(chapterContent, locale, gender, voiceName);
    }

    ByteArray joined = new ByteArray();
    for (int start = 0; start < words.length; start += maxLength) {
        int count = Math.min(maxLength, words.length - start);
        String[] part = new String[count];
        System.arraycopy(words, start, part, 0, count);
        byte[] audio = ttsService.genAudioBytes(String.join(" ", part), locale, gender, voiceName);
        if (audio == null || audio.length == 0) {
            // A failed group would otherwise cause an NPE or silently truncated audio.
            return audio;
        }
        joined.cat(audio);
    }
    return joined.getArray();
}

我要說的都在代碼里了，你細品。（PS：中文的2600個字符和英文的1500個單詞，是我調試出來的，生成的音頻肯定是在10分鐘以內的）

2.總結

微軟TTS還是挺香的，嗯，總結很到位，我繼續摸索其他功能去了。

總結

以上是生活随笔為你收集整理的java实现微软文本转语音（TTS）经验总结的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：算法养成：弱鸡大学生浅谈c++stl
下一篇：华为荣耀U8860刷MIUI刷机教程全解