由于公司最初项目立项时需要使用微软的实时语音识别,所以研究了一下微软的官方SDK和API。前端和Java的交互相对简单:前端页面通过HZRecorder采集实时音频流,处理成二进制,后端用netty+websocket接收消息。难点在于微软的翻译。微软官方提供了几种识别方式:一种是针对一段完整的音频片段做识别,另一种是从硬件设备直接获取流做识别。我们的服务需要部署到服务器上,所以无法使用第二种方式。第一种方式中,微软的SpeechRecognizer对象可以接收一个特殊的流回调对象PullAudioInputStreamCallback作为数据源;如果传入了这个对象,SpeechRecognizer会主动从该流对象里读取数据。但是SpeechRecognizer在流中读取到0个字节后就会停止识别。在我们的场景中,默认的流类型无法满足需求:当没有数据可读时,它们无法block住。我们期望的效果是,只有当流明确结束时,read()方法才返回0。因此需要定义我们自己的音频流对象:
package ********;
import com.microsoft.cognitiveservices.speech.audio.PullAudioInputStreamCallback;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.io.InputStream;
@Slf4j
public class VoiceAudioStream extends PullAudioInputStreamCallback {
private EchoStream _dataStream = new EchoStream();
private ManualResetEvent _waitForEmptyDataStream = null;
private InputStream stream;
@Override
public int read(byte[] dataBuffer) //S2T服务从PullAudioInputStream中读取数据, 读到0个字节并不会关闭流
{
long ret = 0;
if (_waitForEmptyDataStream != null && !_dataStream.DataAvailable())
{//用户主动close时可以关闭流
_waitForEmptyDataStream.Set();
return 0;
}
try {
if(this.stream != null){
//log.info("1前:{}",dataBuffer);
ret = this.stream.read(dataBuffer,0, dataBuffer.length);
//log.info("1后:{}",dataBuffer);
if((int)ret < 1){
// log.info("2");
this.stream = _dataStream.Read(dataBuffer, 0, dataBuffer.length);
ret = this.stream.read(dataBuffer,0, dataBuffer.length);
}
}else{
//log.info("3");
this.stream = _dataStream.Read(dataBuffer, 0, dataBuffer.length);
ret = this.stream.read(dataBuffer,0, dataBuffer.length);
}
} catch (IOException e) {
e.printStackTrace();
}
return (int)Math.max(0, ret);
}
public void write(byte[] buffer, int offset, int count) //Client向PullAudioInputStream写入数据
{
_dataStream.write(buffer, offset, count);
}
@Override
public void close(){
if (_dataStream.DataAvailable())
{
log.info("进到close里面了");
_waitForEmptyDataStream = new ManualResetEvent(false); //通过ManualResetEvent强制流的使用者必须调用close来手动关闭流
try {
_waitForEmptyDataStream.WaitOne();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
log.info("等待了吗?");
_waitForEmptyDataStream.close();
try {
_dataStream.close();
} catch (IOException e) {
e.printStackTrace();
}
//_dataStream.close();
try {
this.stream.close();
} catch (IOException ex) {
// ignored
}
}
}
package *****;
import lombok.extern.slf4j.Slf4j;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.concurrent.ConcurrentLinkedDeque;
@Slf4j
public class EchoStream extends InputStream {
private ManualResetEvent _DataReady = new ManualResetEvent(false);
private ConcurrentLinkedDeque<byte[]> _Buffers = new ConcurrentLinkedDeque<>();
public Boolean DataAvailable(){
return !_Buffers.isEmpty();
}
public void write(byte[] buffer, int offset, int count)
{
// log.info("开始write,EchoStream");
_Buffers.addLast(buffer);
if(_Buffers.size()>1){
_DataReady.Set();
}
}
@Override
public int read() throws IOException {
return 0;
}
public byte[] getLBuffer(){
if(_Buffers.size() != 0){
return _Buffers.pollFirst();
}
return new byte[0];
}
public InputStream Read(byte[] buffer, int offset, int count)
{
//log.info("开始read,EchoStream");
try {
if(_Buffers.size() == 0){
_DataReady.WaitOne();
}
} catch (InterruptedException e) {
e.printStackTrace();
}
//log.info("开始read,lBuffer前:{}",_Buffers.size());
byte[] lBuffer = _Buffers.pollFirst();
//log.info("开始read,lBuffer后:{}",lBuffer);
if (lBuffer == null || lBuffer.length == 0)
{
// log.info("读取到的lBuffer为空,进入等待");
_DataReady.Reset();
//return -1;
}
if (!DataAvailable()) {
// log.info("此时dataready为空");
_DataReady.Reset();
}
//buffer = Arrays.copyOf(lBuffer, lBuffer.length);
buffer = lBuffer.clone();
//log.info("buffer:{},lBuffer.length:{}",buffer,lBuffer.length);
//buffer = lBuffer;
//return buffer.length;
return new ByteArrayInputStream(buffer);
}
@Override
public void close() throws IOException {
super.close();
}
}
前端建立websocket连接时,后端同步创建与微软翻译服务的连接:
/**
 * 为一个会话建立微软语音翻译连接:配置源/目标语言,绑定(或复用)自定义音频流,
 * 注册 recognizing/recognized/canceled/sessionStopped 监听器,并启动连续识别。
 *
 * @param sessionEntity 会话实体,携带语言设置、房间号以及(可能已存在的)音频流
 * @return true 当连续识别成功启动;否则 false
 */
public static Boolean buildConnect(SessionEntity sessionEntity)
{
    try{
        stopTranslationWithAudioStreamSemaphore = new Semaphore(0);
        // Configure recognition source language and translation target language.
        speechTranslationConfig.setSpeechRecognitionLanguage(sessionEntity.getFromLanguage());
        speechTranslationConfig.addTargetLanguage(sessionEntity.getToLanguage());
        // Reuse the session's audio stream if one already exists, otherwise create it.
        // (The original also tested !audioStream.equals(""), which is always true
        // for a non-String object — reduced to the null check.)
        VoiceAudioStream audioStream;
        if(sessionEntity.getAudioStream() != null){
            log.info("进来了?");
            audioStream = sessionEntity.getAudioStream();
        }else{
            audioStream = new VoiceAudioStream();
            sessionEntity.setAudioStream(audioStream);
            sessionEntity.setVoiceData(new ArrayList<>());
        }
        AudioStreamFormat audioFormat = AudioStreamFormat.getWaveFormatPCM(Settings.getEmbeddedSpeechSamplesPerSecond(), Settings.getEmbeddedSpeechBitsPerSample(), Settings.getEmbeddedSpeechChannels());
        PullAudioInputStream pullStream = PullAudioInputStream.createPullStream(audioStream, audioFormat);
        AudioConfig audioConfig = AudioConfig.fromStreamInput(pullStream);
        TranslationRecognizer translationRecognizer = new TranslationRecognizer(speechTranslationConfig, audioConfig);
        sessionEntity.setTranslationRecognizer(translationRecognizer);
        // Intermediate (partial) results: pushed to the room on every event.
        translationRecognizer.recognizing.addEventListener((s, e) -> {
            log.info("tempResult:{}", e.getResult());
            AsrResult asrResult = Constant.asrResultMap.get(sessionEntity.getSessionID());
            Integer asrIndex = 0;
            if(asrResult == null){
                // First partial result for this session: allocate the room's next text index.
                asrIndex = Constant.roomIndex.get(sessionEntity.getRoomId());
                if(asrIndex == null){
                    asrIndex = 0;
                }
                Constant.roomIndex.put(sessionEntity.getRoomId(),asrIndex+1);
                asrResult = new AsrResult();
                asrResult.setStartTime(DateUtils.now());
                asrResult.setEndTime(DateUtils.now());
                asrResult.setAsrIndex(asrIndex);
            }else{
                // Subsequent partial result: if the gap since the last one exceeds
                // 2 seconds, treat it as a new utterance and allocate a new index.
                Long thisTime = System.currentTimeMillis();
                Long lastTime = DateUtils.gettimeStemp(asrResult.getEndTime(),"yyyy-MM-dd HH:mm:ss");
                if((thisTime - lastTime) > 2000L) {
                    asrIndex = Constant.roomIndex.get(sessionEntity.getRoomId());
                    // BUGFIX: the first branch guards a missing room entry but this
                    // one did not — asrIndex + 1 would NPE on an unboxed null.
                    if(asrIndex == null){
                        asrIndex = 0;
                    }
                    asrResult.setAsrIndex(asrIndex);
                    Constant.roomIndex.put(sessionEntity.getRoomId(),asrIndex+1);
                    asrResult.setStartTime(DateUtils.now());
                }
                asrResult.setEndTime(DateUtils.now());
            }
            asrResult.setAsrText(e.getResult().getText());
            asrResult.setTempText(e.getResult().getText());
            asrResult.setUserId(sessionEntity.getExtNum());
            asrResult.setTranslation(e.getResult().getTranslations());
            asrResult.setTempType(1);
            Constant.asrResultMap.put(sessionEntity.getSessionID(),asrResult);
            WebSocketServerHandler.sendReplyToRoom(sessionEntity.getRoomId(), sessionEntity.getSessionID(), asrResult);
        });
        // Final result for an utterance: pushed once, then the session's index entry is cleared.
        translationRecognizer.recognized.addEventListener((s, e) -> {
            log.info("RECOGNIZEDResult:{}", e.getResult());
            if (e.getResult().getReason() == ResultReason.RecognizedSpeech) {
                if(!e.getResult().getText().equals("")){
                    AsrResult asrResult = new AsrResult();
                    Integer asrIndex = Constant.asrIndexMap.get(sessionEntity.getSessionID());
                    asrResult.setAsrIndex(asrIndex);
                    asrResult.setAsrText(e.getResult().getText());
                    asrResult.setEndTime(DateUtils.now());
                    asrResult.setTempText(e.getResult().getText());
                    asrResult.setUserId(sessionEntity.getExtNum());
                    asrResult.setTranslation(e.getResult().getTranslations());
                    asrResult.setTempType(0);
                    Constant.asrIndexMap.remove(sessionEntity.getSessionID());
                    WebSocketServerHandler.sendReplyToRoom(sessionEntity.getRoomId(), sessionEntity.getSessionID(), asrResult);
                }
            }
            else if (e.getResult().getReason() == ResultReason.NoMatch) {
                log.info("NOMATCH: Speech could not be recognized.");
            }
        });
        // Cancellation (including errors): clean up session state and release the stop semaphore.
        translationRecognizer.canceled.addEventListener((s, e) -> {
            if (e.getReason() == CancellationReason.Error) {
                log.info("CANCELED: ErrorCode=" + e.getErrorCode());
                log.info("CANCELED: ErrorDetails=" + e.getErrorDetails());
            }
            Constant.asrResultMap.remove(sessionEntity.getSessionID());
            stopTranslationWithAudioStreamSemaphore.release();
        });
        translationRecognizer.sessionStopped.addEventListener((s, e) -> {
            log.info("\n Session stopped event.");
            Constant.asrResultMap.remove(sessionEntity.getSessionID());
            stopTranslationWithAudioStreamSemaphore.release();
        });
        try {
            log.info("开始异步识别");
            translationRecognizer.startContinuousRecognitionAsync().get();
            return true;
        } catch (InterruptedException e) {
            // BUGFIX: restore interrupt status instead of swallowing it.
            Thread.currentThread().interrupt();
            log.error("startContinuousRecognitionAsync interrupted", e);
        } catch (ExecutionException e) {
            log.error("startContinuousRecognitionAsync failed", e);
        }
    }catch (Exception e){
        // BUGFIX: the original logged e.getStackTrace() (an array toString, useless);
        // pass the throwable so SLF4J prints the full stack trace.
        log.error("微软buildConnect异常", e);
    }
    return false;
}
前端推送音频数据时,后端接收语音流并直接写入会话的自定义音频流对象:
/**
 * 接收前端推送的音频数据块,写入会话的自定义音频流,供识别器拉取。
 *
 * @param sessionEntity 会话实体,其音频流应已在 buildConnect 中初始化
 * @param audioChunk    原始 PCM 字节;为 null 或空时直接忽略
 */
public static void ReceiveAudio(SessionEntity sessionEntity,byte[] audioChunk)
{
    // BUGFIX: guard a null chunk and a session whose stream was never initialised
    // (both would NPE in the original).
    if(audioChunk != null && audioChunk.length > 0
            && sessionEntity != null && sessionEntity.getAudioStream() != null){
        sessionEntity.getAudioStream().write(audioChunk,0,audioChunk.length);
    }
}