这个国庆节作死,答应别人在七天内做2个项目。智能客服项目是其中一个,当时是答应给隔壁兄弟团队做的。他们说实在搞不定了,节后要上线,我就因为随口说了一句:“这有什么难的”,结果祸从口出,这事情就落我头上了。

录音识别是智能客服大项目计划里面的一部分,简单说就是客服在跟用户聊天的时候,实时从声卡上抓取音频数据,然后发送给阿里云-智能语音交互,识别成句子文本后,再发送给我们的智能客服助手服务器,生成提示展示给客服人员。听起来很简单,我当时也这么认为,但做完这个之后,我觉得以后没啥事还是少去隔壁团队那儿串门,^^。

这个小程序,步骤上分3步:录音、识别、提交。

1. 录音

本来是想着用java来写,开发能快点,而且java也有相应的AudioSystem音频处理模块,应该能轻松搞定。但是后来发现AudioSystem只能从麦克风这种音频输入设备读取数据,要想从声卡抓取音频数据只能用C++调用WASAPI来获取。网上也有不少例子,可以拿来借鉴。但这儿当时遇到的最大问题是,我抓取的音频格式是PCM FLOAT 32位,做音频处理的时候不方便,而且有些音频处理程序不支持(比如: java的AudioSystem),所以我转换成PCM SIGNED 16位。另外,音频数据在发送给识别服务之前要按照要求转换成单声道,采样率也要调整成16000Hz。音频处理比较头疼,开始打算用ffmpeg来处理,但是这个库太重,用起来太复杂,后来就自己上网查资料,自己写算法转换的。

另外,网上也有种说法,就是说要在windows系统上进行录音,需要打开立体声混音(Stereo Mix)设备。其实是不需要的,除非你想同时进行声卡和麦克风录音。仅仅从声卡录音,通过WASAPI就够了。

2. 识别

将声卡抓取的音频数据调整成能被识别的格式之后,就可以调用阿里云提供的SDK进行发送了。因为文档比较齐全,所以这块儿还是挺顺利的。

3. 提交

因为智能服务接口是RESTful的,所以我这儿只用libcurl来进行HTTP处理。libcurl当时在编译成静态库后,链接时总是报找不到函数入口。开始以为是忘记加extern "C",加了之后还是报错。最后,查了下资料,按照这个网页()的指导操作了下就好了。

4. 参考代码

其他模块都没什么难度,主要是音频处理部分,包括:位深转换、采样率转换、单轨调整。主要还是基于网上的代码做了少量的修改。

Capture.h
#pragma once

#pragma comment(lib,"avrt.lib")
#include <Audioclient.h>
#include <mmdeviceapi.h>
#include<iostream>
#include<avrt.h>
#include <vector>

/// Leading RIFF chunk descriptor of a WAV stream.
struct WAVE_HEADER {
	char          fccID[4];    // chunk tag, filled with "RIFF"
	unsigned long dwSize;      // size of the WAVE content; filled in last
	char          fccType[4];  // form type, filled with "WAVE"
};

/// "fmt " sub-chunk describing the PCM sample layout.
struct WAVE_FMT {
	char           fccID[4];          // chunk tag, filled with "fmt "
	unsigned long  dwSize;            // byte count of the format fields that follow; 16
	unsigned short wFormatTag;        // 1 when the payload is PCM
	unsigned short wChannels;         // channel count: mono = 1, stereo = 2
	unsigned long  dwSamplesPerSec;   // sampling frequency
	unsigned long  dwAvgBytesPerSec;  // == dwSamplesPerSec * wChannels * uiBitsPerSample / 8
	unsigned short wBlockAlign;       // == wChannels * uiBitsPerSample / 8
	unsigned short uiBitsPerSample;   // bits per sample point: 8 or 16
};

/// "data" sub-chunk header that precedes the raw samples.
struct WAVE_DATA {
	char          fccID[4];  // chunk tag, filled with "data"
	unsigned long dwSize;    // == NumSamples * wChannels * uiBitsPerSample / 8
};

// Captures what the default render endpoint is playing (WASAPI loopback) and
// converts it for a speech recognizer: 16-bit PCM, resampled, one channel.
class Capture
{
public:
	// Sets every pointer member to NULL, the frame size to 0 and the cached
	// sample rate / channel count to -1.
	Capture();

	// Opens the default render endpoint in shared loopback mode and starts the
	// stream. Returns 0 on success, -1 when any step fails.
	int start();
	// Stops the stream and releases the audio client, the mix format and the
	// capture client. Returns 0.
	int stop();
	// Reads captured audio into `buffer`, resamples it to `rate` and keeps a
	// single channel. `channels` is not used by the implementation.
	int cap(std::vector<BYTE> &buffer, int rate, int channels);
	// Prepends RIFF / "fmt " / "data" headers (16-bit PCM, `rate`, `channels`)
	// to `buffer` and returns the new size of `buffer`.
	int wav(std::vector<BYTE>& buffer, int rate, int channels);
private:
	// Rewrites an IEEE-float format description into 16-bit PCM in place.
	// Returns true when the description was changed.
	bool adjustFormatTo16Bits(WAVEFORMATEX *pwfx);
	// Appends one packet from the capture client to `buffer`.
	int read(std::vector<BYTE> &buffer);
	// Converts the samples in `buffer` from the capture rate to `rate`.
	int resample(std::vector<BYTE> &buffer, int rate);
	// Reduces interleaved samples in `buffer` to one channel.
	int singleChannel(std::vector<BYTE> &buffer);
	

	// Capture service obtained from m_pAudioClient in start().
	IAudioCaptureClient * m_pAudioCaptureClient;
	// Audio client activated on m_pMMDevice in start().
	IAudioClient * m_pAudioClient;
	// Mix format returned by GetMixFormat(), adjusted to 16-bit PCM.
	WAVEFORMATEX * m_pwfx;
	// Default render endpoint obtained in start().
	IMMDevice* m_pMMDevice;
	// Bytes per frame: (bits per sample / 8) * channel count.
	size_t m_FrameSize;

	// Sample rate and channel count of the capture format; -1 until start()
	// succeeds.
	int m_SampleRate;
	int m_Channels;
};
Capture.cpp
#include "Capture.h"

// Early-exit helpers for functions that called CoInitialize() and return int:
// each one balances the COM initialization and returns -1.
//
// The bodies are wrapped in do { ... } while (0) so that `MACRO(x);` is a
// single statement and stays valid as the body of an if/else. The arguments
// are parenthesized so that expressions such as `a == b` are tested as a
// whole instead of being split by operator precedence.
#define RETURN_ON_ERROR(hr) do { if (FAILED(hr)) { CoUninitialize(); return -1; } } while (0)
#define RETURN_ON_NULL(p)   do { if ((p) == nullptr) { CoUninitialize(); return -1; } } while (0)
#define RETURN_ON_FALSE(b)  do { if (!(b)) { CoUninitialize(); return -1; } } while (0)

/**
 * Rewrites an IEEE-float format description as 16-bit PCM, in place.
 *
 * Two layouts are recognized: a plain WAVE_FORMAT_IEEE_FLOAT tag, and
 * WAVE_FORMAT_EXTENSIBLE whose SubFormat is KSDATAFORMAT_SUBTYPE_IEEE_FLOAT.
 * Any other format is left untouched.
 *
 * @param pwfx format description to modify; must not be null.
 * @return true when the description was converted, false otherwise.
 */
bool Capture::adjustFormatTo16Bits(WAVEFORMATEX *pwfx)
{
	if (pwfx->wFormatTag == WAVE_FORMAT_IEEE_FLOAT)
	{
		// Plain float: only the format tag identifies the encoding.
		pwfx->wFormatTag = WAVE_FORMAT_PCM;
	}
	else if (pwfx->wFormatTag == WAVE_FORMAT_EXTENSIBLE)
	{
		// Extensible: the encoding lives in SubFormat; the tag stays as is.
		PWAVEFORMATEXTENSIBLE extended = reinterpret_cast<PWAVEFORMATEXTENSIBLE>(pwfx);
		if (!IsEqualGUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, extended->SubFormat))
		{
			return false;
		}
		extended->SubFormat = KSDATAFORMAT_SUBTYPE_PCM;
		extended->Samples.wValidBitsPerSample = 16;
	}
	else
	{
		return false;
	}

	// Fields derived from the sample width are the same for both layouts.
	pwfx->wBitsPerSample = 16;
	pwfx->nBlockAlign = pwfx->nChannels * pwfx->wBitsPerSample / 8;
	pwfx->nAvgBytesPerSec = pwfx->nBlockAlign * pwfx->nSamplesPerSec;
	return true;
}

/**
 * Creates an idle capture object: no COM objects held, frame size 0, and the
 * cached sample rate / channel count marked unknown (-1).
 */
Capture::Capture()
	: m_pAudioCaptureClient(NULL),
	  m_pAudioClient(NULL),
	  m_pwfx(NULL),
	  m_pMMDevice(NULL),
	  m_FrameSize(0),
	  m_SampleRate(-1),
	  m_Channels(-1)
{
}

int Capture::start() {
	CoInitialize(NULL);
	IMMDeviceEnumerator *pMMDeviceEnumerator = NULL;
	HRESULT hr = CoCreateInstance(__uuidof(MMDeviceEnumerator), NULL, CLSCTX_ALL,
		__uuidof(IMMDeviceEnumerator), (void**)&pMMDeviceEnumerator);
	RETURN_ON_ERROR(hr);

	hr = pMMDeviceEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &m_pMMDevice);
	RETURN_ON_ERROR(hr);

	pMMDeviceEnumerator->Release();

	hr = m_pMMDevice->Activate(__uuidof(IAudioClient), CLSCTX_ALL, NULL, (void**)&m_pAudioClient);
	RETURN_ON_ERROR(hr);

	REFERENCE_TIME hnsDefaultDevicePeriod(0);
	hr = m_pAudioClient->GetDevicePeriod(&hnsDefaultDevicePeriod, NULL);
	RETURN_ON_ERROR(hr);

	hr = m_pAudioClient->GetMixFormat(&m_pwfx);
	RETURN_ON_ERROR(hr);

	/*转换成signed 16位编码*/
	adjustFormatTo16Bits(m_pwfx);

	m_FrameSize = (m_pwfx->wBitsPerSample / 8)*m_pwfx->nChannels;	
	
	hr = m_pAudioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK, 0, 0, m_pwfx, 0);
	RETURN_ON_ERROR(hr);


	hr = m_pAudioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&m_pAudioCaptureClient);
	RETURN_ON_ERROR(hr);

	hr = m_pAudioClient->Start();
	RETURN_ON_ERROR(hr);

	CoUninitialize();

	m_Channels = m_pwfx->nChannels;
	m_SampleRate = m_pwfx->nSamplesPerSec;
	return 0;
}

int Capture::stop() {
	if (m_pAudioClient)
	{
		m_pAudioClient->Stop();
		m_pAudioClient->Release();
		m_pAudioClient = NULL;
	}
	if (m_pwfx != NULL)
	{
		CoTaskMemFree(m_pwfx);
		m_pwfx = NULL;
	}
	if (m_pAudioCaptureClient != NULL)
	{
		m_pAudioCaptureClient->Release();
		m_pAudioCaptureClient = NULL;
	}
	return 0;
}

int Capture::cap(std::vector<BYTE> &buffer, int rate, int channels)
{
	read(buffer);
	resample(buffer, rate);
	singleChannel(buffer);
	return buffer.size();
}

int Capture::read(std::vector<BYTE> &buffer) {
	DWORD dwWaitResult;
	UINT32 nNextPacketSize(0);
	BYTE *pData = NULL;
	UINT32 framesAvailable;
	DWORD flags;
	CoInitialize(NULL);

	HRESULT hr = m_pAudioCaptureClient->GetBuffer(&pData, &framesAvailable, &flags, NULL, NULL);
	RETURN_ON_ERROR(hr);

	if (0 != framesAvailable)
	{
		buffer.insert(buffer.end(), pData, pData+framesAvailable * m_FrameSize);
	}
	m_pAudioCaptureClient->ReleaseBuffer(framesAvailable);

	CoUninitialize();
	return framesAvailable * m_FrameSize;
}

int Capture::resample(std::vector<BYTE>& buffer, int rate)
{
	if (m_SampleRate == rate)return buffer.size();
	if (m_pwfx == nullptr)return -1;
	std::vector<BYTE> resultBuffer;
	int bytes = m_pwfx->wBitsPerSample/8;
	int sampleCount = buffer.size() / bytes;
	int srcRate = m_pwfx->nSamplesPerSec;
	int dstRate = rate;
	int rateLen = srcRate / dstRate;
	
	if (rateLen == 1) return buffer.size();

	if (rateLen > 0) {
		short tempRead = 0;
		short tempSum = 0;
		int flag = 0;
		
		for (int i = 0; i < sampleCount; i++) {
			memcpy(&tempRead, buffer.data()+i*bytes, bytes);
			tempSum = tempSum + tempRead;
			flag++;
			if (flag == rateLen)
			{
				flag = 0;
				tempSum = tempSum / rateLen;
				resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempSum), ((BYTE*)&tempSum) + bytes);
				tempSum = 0;
			}
		}
	}
	else {
		rateLen = dstRate / srcRate;
		int tempRead1;
		int tempRead2;
		int tempSum;
		int tempAvgDiff;
		int tempWrite;
		int flag;

		for (int i = 0; i < (sampleCount-1); i++) {			
			memcpy(&tempRead1, buffer.data() + i * bytes, bytes);
			memcpy(&tempRead2, buffer.data() + i * bytes+ bytes, bytes);
			tempSum = tempRead2 - tempRead1;
			tempAvgDiff = tempSum / rateLen;
			tempWrite = tempRead1;
			flag = rateLen;
			do
			{
				tempWrite += tempAvgDiff;
				resultBuffer.insert(resultBuffer.end(), ((BYTE*)&tempWrite), ((BYTE*)&tempWrite) + bytes);
			} while (--flag);
		}
	}
	buffer.swap(resultBuffer);	
	return buffer.size();
}

int Capture::singleChannel(std::vector<BYTE>& buffer)
{
	if (m_Channels == 1) return buffer.size();

	size_t len = buffer.size() / 2;
	int bytes = m_pwfx->wBitsPerSample / 8;
	//std::vector<BYTE> singleBuffer(len);
	BYTE *singleBuffer = new BYTE[len];
	//singleBuffer.reserve(len);
	for (int i = 0; i < len/bytes; i++) {
		//singleBuffer.insert(singleBuffer.end(), buffer.data() + i*bytes * 2, buffer.data() + i*bytes * 2 + bytes);
		memcpy(singleBuffer+i*bytes, buffer.data()+i*(2*bytes), bytes);
	}
	
	buffer.assign(singleBuffer, singleBuffer + len);
	delete[] singleBuffer;
	return buffer.size();
}

int Capture::wav(std::vector<BYTE>& buffer, int rate, int channels)
{
	std::vector<BYTE> wavBuffer;

	WAVE_HEADER pcmHEADER;
	WAVE_FMT    pcmFMT;
	WAVE_DATA   pcmDATA;

	unsigned short m_pcmData;
	int dataSize = buffer.size();

	/* WAVE_HEADER */
	memcpy(pcmHEADER.fccID, "RIFF", strlen("RIFF"));
	memcpy(pcmHEADER.fccType, "WAVE", strlen("WAVE"));
	pcmHEADER.dwSize = 36 + dataSize;

	/* WAVE_FMT */
	memcpy(pcmFMT.fccID, "fmt ", strlen("fmt "));
	pcmFMT.dwSize = 16;
	pcmFMT.wFormatTag = 1;
	pcmFMT.wChannels = channels;
	pcmFMT.dwSamplesPerSec = rate;
	pcmFMT.uiBitsPerSample = 16;
	/* ==dwSamplesPerSec*wChannels*uiBitsPerSample/8 */
	pcmFMT.dwAvgBytesPerSec = pcmFMT.dwSamplesPerSec*pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;
	/* ==wChannels*uiBitsPerSample/8 */
	pcmFMT.wBlockAlign = pcmFMT.wChannels*pcmFMT.uiBitsPerSample / 8;

	/* WAVE_DATA */
	memcpy(pcmDATA.fccID, "data", strlen("data"));
	pcmDATA.dwSize = dataSize;

	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmHEADER, ((BYTE*)&pcmHEADER) + sizeof(WAVE_HEADER));
	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmFMT, ((BYTE*)&pcmFMT) + sizeof(WAVE_FMT));
	wavBuffer.insert(wavBuffer.end(), (BYTE*)&pcmDATA, ((BYTE*)&pcmDATA) + sizeof(WAVE_DATA));
	wavBuffer.insert(wavBuffer.end(), buffer.begin(), buffer.end());
	buffer.swap(wavBuffer);
	return buffer.size();
}