A previous project required a dual-recording feature: while a customer handles business at the counter, audio and video are recorded and archived as evidence. That implementation used FFmpeg to capture data from an external USB camera, encode it with H.264 and AAC, and write the result to an MKV file. It was exposed as a Windows OCX interface for the HTML application that uploads the recordings; below is the test page used in that project.

[Screenshots: test page from the previous project]

The live-streaming industry is growing fast, and since I am interested in video I recently looked into it, cleaned up my earlier work, and rewrote a test demo for verification and experimentation. FFmpeg's power needs no introduction: with the ffmpeg command line alone you can publish your computer's camera feed over UDP, RTP, RTMP and so on. So how do you do the same in code?
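
For reference, the pure command-line equivalent looks roughly like this (the DirectShow device names are placeholders; substitute the names printed by "ffmpeg -list_devices true -f dshow -i dummy"):

ffmpeg -f dshow -i video="Your Camera Name":audio="Your Microphone Name" -vcodec libx264 -preset fast -tune zerolatency -acodec aac -f rtp_mpegts rtp://127.0.0.1:9999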

First, a look at the structure of the test demo. The projects below were built with VS2010. GoldenMediaStreamPS is the DLL that implements the streaming logic, including recording the camera and audio to a file, pushing the camera stream, and pushing/playing a video file. GoldenMediaStreamPSDemo is an MFC dialog used to test the DLL.

[Screenshots: solution structure of GoldenMediaStreamPS and GoldenMediaStreamPSDemo in Visual Studio]

The public interface of GoldenMediaStreamPS is shown below. CreateKernel initializes an instance according to the mode passed in, for example recording the camera to a file, pushing a video file, or pushing the camera stream. The example below pushes the camera stream:

typedef enum E_KernelWorkMode
{
    E_PUSL_MODE_FILE            = 0,       // push module: output to a video file
    E_PUSL_MODE_RTP_MPEGTS      = 0x01,    // push module: output as rtp_mpegts over UDP
    E_PUSL_MODE_RTMP            = 0x02,    // push module: output as RTMP
    E_PULL_MODE_FILE            = 0x03,    // pull module: play a video file
    E_PULL_MODE_RTP_MPEGTS      = 0x04     // pull module: rtp_mpegts over UDP
}KernelWorkMode;

typedef struct T_VideoCodecParm
{
    UINT32 m_nVideoCodecId;
    UINT32 m_nWidth;
    UINT32 m_nHeight;
    UINT32 m_nFramerate;
    UINT32 m_nVideoBitrate;
    UINT32 m_nGopsize;
}VideoCodecParm;

typedef struct T_AudioCodecParm
{
    UINT32 m_nAudioCodecId;
    UINT32 m_nSamplerate;
    UINT32 m_nChannels;
    UINT32 m_nAudioBitrate;
}AudioCodecParm;

class IKernelProxy
{
public:
    IKernelProxy(void){}
    virtual ~IKernelProxy(void){}

public:
    virtual int CreateKernel(KernelWorkMode eMode,CString csOutPutPath) =0;  // initialize and create the instance
    virtual int OpenInputStream(CString csFileName) =0;  // set the input to push: a file name, e.g. a video file
    virtual int OpenInputStream(CString csVideoName,CString csAudioName) =0; // set the input to push: the camera and audio device names
    virtual int Start() =0; // start running
    virtual int ReleaseKernel() =0;
    
    virtual void SetVideoCodecParm(const VideoCodecParm& tVideoParm) =0;
    virtual void SetAudioCodecParm(const AudioCodecParm& tAudioParm) =0;
    virtual void GetVideoCodecParm(VideoCodecParm& tOutVideoParm) =0;
    virtual void GetAudioCodecParm(AudioCodecParm& tOutAudioParm) =0;
    virtual void GetCaptureDevicesList(WSVte &vOutVideoDevices,WSVte &vOutAudioDevices) =0;
    
};
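
How the demo obtains its IKernelProxy pointer is not shown in this post; a common pattern (only a sketch: the exported function name CreateKernelProxyInstance and the concrete class CKernelProxy are assumptions, not the project's actual names) is a plain C factory exported from the DLL:

// DLL side (sketch): export a factory that returns the interface
extern "C" __declspec(dllexport) IKernelProxy* CreateKernelProxyInstance()
{
    return new CKernelProxy();   // CKernelProxy: the concrete implementation inside GoldenMediaStreamPS
}

// Demo side (sketch): load the DLL and resolve the factory at runtime
typedef IKernelProxy* (*PFN_CreateKernelProxy)();
HMODULE hDll = LoadLibrary(_T("GoldenMediaStreamPS.dll"));
PFN_CreateKernelProxy pfnCreate = (PFN_CreateKernelProxy)GetProcAddress(hDll, "CreateKernelProxyInstance");
IKernelProxy* pKernelProxy = pfnCreate ? pfnCreate() : NULL;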

Part 1: Initializing GoldenMediaStreamPS

void CGoldenMediaStreamPSDemoDlg::OnBnClickedButton1()
{
    // TODO: Add your control notification handler code here
    vector<wstring> videoDev;
    vector<wstring> audioDev;
    if(m_pIKernelProxy!=NULL)
    {
        m_pIKernelProxy->GetCaptureDevicesList(videoDev,audioDev);
        if(videoDev.empty() || audioDev.empty())
            return;   // no capture device found, nothing to push
        m_pIKernelProxy->CreateKernel(E_PUSL_MODE_RTP_MPEGTS,_T("rtp://127.0.0.1:9999"));
        m_pIKernelProxy->OpenInputStream(videoDev[0].c_str(),audioDev[0].c_str());
        m_pIKernelProxy->Start();
    }
}

Part 2: Enumerating the capture devices

First, of course, we need the hardware devices, i.e. the computer's camera and audio input. Devices are enumerated through DirectShow's COM interfaces; the EnumDevice code in the project does exactly this, and GetVideoDevices / GetAudioDevices store the names of the cameras and audio devices found on the system into vector containers.

 

HRESULT CaptureDevices::Enumerate()
{
	HRESULT hr = S_OK;

	ICreateDevEnum *enumDev;

	hr = CoCreateInstance(CLSID_SystemDeviceEnum, NULL, CLSCTX_INPROC_SERVER, IID_PPV_ARGS(&enumDev));

	if(FAILED(hr))
	{
		lastError = L"Could not create device enumerator";
		return hr;
	}

	hr = enumDev->CreateClassEnumerator(CLSID_VideoInputDeviceCategory, &enumMonikerVideo, NULL);

	if (FAILED(hr))
	{
		printf("No video capture devices available");
	}

	hr = enumDev->CreateClassEnumerator(CLSID_AudioInputDeviceCategory, &enumMonikerAudio, NULL);

	if (FAILED(hr))
	{
		printf("No audio capture devices available");
	}

	enumDev->Release();

	return hr;
}

HRESULT CaptureDevices::GetVideoDevices(vector<wstring> *videoDevices)
{
	if (!enumMonikerVideo)
		return E_FAIL;

	IMoniker *pMoniker = NULL;
	wstring name;
	
	while (enumMonikerVideo->Next(1, &pMoniker, NULL) == S_OK)
	{
		IPropertyBag *pPropBag;
		HRESULT hr = pMoniker->BindToStorage(0, 0, IID_PPV_ARGS(&pPropBag));
		if (FAILED(hr))
		{
			pMoniker->Release();
			continue;  
		} 

		VARIANT var;
		VariantInit(&var);

		hr = pPropBag->Read(L"FriendlyName", &var, 0);
		if (SUCCEEDED(hr))
		{
			name = var.bstrVal;
			VariantClear(&var); 
		}

		pPropBag->Release();
		pMoniker->Release();

		if (!name.empty())
			videoDevices->push_back(name);
	}
	return 0;
}

HRESULT CaptureDevices::GetAudioDevices(vector<wstring> *audioDevices)
{
	if (!enumMonikerAudio)
		return E_FAIL;

	IMoniker *pMoniker = NULL;
	wstring name;

	while (enumMonikerAudio->Next(1, &pMoniker, NULL) == S_OK)
	{
		IPropertyBag *pPropBag;
		HRESULT hr = pMoniker->BindToStorage(0, 0, IID_PPV_ARGS(&pPropBag));
		if (FAILED(hr))
		{
			pMoniker->Release();
			continue;  
		} 

		VARIANT var;
		VariantInit(&var);

		hr = pPropBag->Read(L"FriendlyName", &var, 0);
		if (SUCCEEDED(hr))
		{
			name = var.bstrVal;
			VariantClear(&var); 
		}

		pPropBag->Release();
		pMoniker->Release();

		if (!name.empty())
			audioDevices->push_back(name);
	}
    return 0;
}

Part 3: Opening the input devices

GetVideoDevices and GetAudioDevices give us the names of the system's input devices; OpenInputStream passes those names into GoldenMediaStreamPS to open them. Opening an input device uses FFmpeg's avformat_open_input, with the obtained device name passed in as the "URL". Note that the name must be converted to UTF-8, otherwise the call fails for names containing non-ASCII characters. We then call avformat_find_stream_info to read the stream information and get the index of the video or audio stream, which is later used to locate the corresponding Stream. In effect, a device can be treated just like an ordinary file source: a file is usually wrapped in some container format, so to play it you demux it into elementary streams and then decode the video and audio streams separately. Although the frames captured from the device are raw (unencoded), we still keep the "decode" step to stay within FFmpeg's normal processing pipeline. The only difference from opening a media file is passing av_find_input_format("dshow") as the input format.
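
PUBLICFUN::AnsiToUTF8 used below is not listed in this post; a minimal sketch of such a helper with the Win32 API (assuming the input string is in the system ANSI code page) could look like this:

#include <windows.h>
#include <string>

// ANSI (system code page) -> UTF-16 -> UTF-8
std::string AnsiToUTF8(const char* pszAnsi, int nLen)
{
    if (pszAnsi == NULL || nLen <= 0)
        return std::string();

    int nWide = MultiByteToWideChar(CP_ACP, 0, pszAnsi, nLen, NULL, 0);
    std::wstring wide(nWide, L'\0');
    MultiByteToWideChar(CP_ACP, 0, pszAnsi, nLen, &wide[0], nWide);

    int nUtf8 = WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), nWide, NULL, 0, NULL, NULL);
    std::string utf8(nUtf8, '\0');
    WideCharToMultiByte(CP_UTF8, 0, wide.c_str(), nWide, &utf8[0], nUtf8, NULL, NULL);
    return utf8;
}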

int CameraInPutStream::OpenInputStream(CString csVDevName,CString csADevName)
{
    string strVideoDevName;
    string strAudioDevName;
    strVideoDevName = PUBLICFUN::WStringToString(csVDevName.GetString());
    strAudioDevName = PUBLICFUN::WStringToString(csADevName.GetString());

    if(strVideoDevName.empty() && strAudioDevName.empty())
    {
        ATLTRACE("you have not set any capture device \n");
        return false;
    }


    int i;

    //FFmpeg's avdevice_register_all must be called before opening a DirectShow device, otherwise the call below fails
    m_pInputFormat = av_find_input_format("dshow");
    ASSERT(m_pInputFormat != NULL);

    // Set device params
    AVDictionary *device_param = 0;
    //if not setting rtbufsize, error messages will be shown in cmd, but you can still watch or record the stream correctly in most time
    //setting rtbufsize will erase those error messages, however, larger rtbufsize will bring latency
    //av_dict_set(&device_param, "rtbufsize", "10M", 0);

    if(!strVideoDevName.empty())
    {
        int res = 0;

        string device_name = "video=" + strVideoDevName;

        string device_name_utf8 = PUBLICFUN::AnsiToUTF8(device_name.c_str(), device_name.length());  //convert to UTF-8 so that device names containing Chinese characters are not garbled

        //Set own video device's name
        if ((res = avformat_open_input(&m_pVideoInFormatCtx, device_name_utf8.c_str(), m_pInputFormat, &device_param)) != 0)
        {
            ATLTRACE("Couldn't open input video stream.(无法打开输入流)\n");
            return false;
        }
        //input video initialize
        if (avformat_find_stream_info(m_pVideoInFormatCtx, NULL) < 0)
        {
            ATLTRACE("Couldn't find video stream information.(无法获取流信息)\n");
            return false;
        }
        m_nInVideoIndex = -1;
        for (i = 0; i < m_pVideoInFormatCtx->nb_streams; i++)
        {
            if (m_pVideoInFormatCtx->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO)
            {
                m_nInVideoIndex = i;
                break;
            }
        }

        if (m_nInVideoIndex == -1)
        {
            ATLTRACE("Couldn't find a video stream.(没有找到视频流)\n");
            return false;
        }
        if (avcodec_open2(m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec, avcodec_find_decoder(m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec->codec_id), NULL) < 0)
        {
            ATLTRACE("Could not open video codec.(无法打开解码器)\n");
            return false;
        }
    }

    //

    if(!strAudioDevName.empty())
    {
        string device_name = "audio=" + strAudioDevName;

        string device_name_utf8 = PUBLICFUN::AnsiToUTF8(device_name.c_str(), device_name.length());  //convert to UTF-8 so that device names containing Chinese characters are not garbled

        //Set own audio device's name
        if (avformat_open_input(&m_pAudioInFormatCtx, device_name_utf8.c_str(), m_pInputFormat, &device_param) != 0){

            ATLTRACE("Couldn't open input audio stream.(无法打开输入流)\n");
            return false;
        }

        //input audio initialize
        if (avformat_find_stream_info(m_pAudioInFormatCtx, NULL) < 0)
        {
            ATLTRACE("Couldn't find audio stream information.(无法获取流信息)\n");
            return false;
        }
        m_nInAudioIndex = -1;
        for (i = 0; i < m_pAudioInFormatCtx->nb_streams; i++)
        {
            if (m_pAudioInFormatCtx->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)
            {
                m_nInAudioIndex = i;
                break;
            }
        }
        if (m_nInAudioIndex == -1)
        {
            ATLTRACE("Couldn't find a audio stream.(没有找到音频流)\n");
            return false;
        }
        if (avcodec_open2(m_pAudioInFormatCtx->streams[m_nInAudioIndex]->codec, avcodec_find_decoder(m_pAudioInFormatCtx->streams[m_nInAudioIndex]->codec->codec_id), NULL) < 0)
        {
            ATLTRACE("Could not open audio codec.(无法打开解码器)\n");
            return false;
        }
    }
    return 0;
}

int CameraInPutStream::OpenOutputStream(KernelWorkMode eMode,CString csOutPutPath)
{
    
    LPWSTR p_lpszOutPutPath =  (LPWSTR)(LPCTSTR)csOutPutPath;
    if (p_lpszOutPutPath==NULL)
    {

        return -1;
    }
    USES_CONVERSION;
    LPSTR lpStrOutPutPath = W2A (p_lpszOutPutPath);
    m_Loger->InfoMethodLog("OpenInputStream",1,(char*)lpStrOutPutPath);

    int nResult = 0;
    do 
    {
        //create the output format context
        if(m_pOutFormatCtx!=NULL)break;
        if(eMode == E_PUSL_MODE_RTMP)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,"rtmp",NULL); 
        else if(eMode == E_PUSL_MODE_RTP_MPEGTS)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,"rtp_mpegts",NULL);
        else if(eMode == E_PUSL_MODE_FILE)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,NULL,NULL);

        if(0!=nResult)
        {
            m_Loger->InfoMethodLog("avformat_alloc_output_context2",1,"Error = %d",nResult);
            break;
        }
    } while (FALSE);
    

	if(m_videoCodecId != 0)
	{
		//output video encoder initialize
		m_pCodec_v = avcodec_find_encoder(m_videoCodecId);
		if (!m_pCodec_v)
		{
			ATLTRACE("Can not find output video encoder! (没有找到合适的编码器!)\n");
			return false;
		}
		m_pCodecCtx_v = avcodec_alloc_context3(m_pCodec_v);
		m_pCodecCtx_v->pix_fmt = AV_PIX_FMT_YUV420P;
		m_pCodecCtx_v->width = m_width;
		m_pCodecCtx_v->height = m_height;
		m_pCodecCtx_v->time_base.num = 1;
		m_pCodecCtx_v->time_base.den = m_framerate;
		m_pCodecCtx_v->bit_rate = m_video_bitrate;
		m_pCodecCtx_v->gop_size = m_gopsize;

		//m_pCodecCtx_v->delay = 10;
		//m_pCodecCtx_v->global_quality = 90;


		/* Some formats want stream headers to be separate. */
		if (m_pOutFormatCtx->oformat->flags & AVFMT_GLOBALHEADER)
			m_pCodecCtx_v->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;


		AVDictionary *param = 0;

		//set H264 codec param
		if(m_videoCodecId == AV_CODEC_ID_H264)
		{
			//m_pCodecCtx_v->me_range = 16;
			//m_pCodecCtx_v->max_qdiff = 4;
			//m_pCodecCtx_v->qcompress = 0.6;
			m_pCodecCtx_v->qmin = 10;
			m_pCodecCtx_v->qmax = 51;
			//Optional Param
			m_pCodecCtx_v->max_b_frames = 0;
			

#if 1
			 // the two settings below affect encoding latency; if they are not set, the encoder buffers a lot of frames by default
			// Set H264 preset and tune
			av_dict_set(&param, "preset", "fast", 0);
			av_dict_set(&param, "tune", "zerolatency", 0);
#else
			 /**
			  * ultrafast, superfast, veryfast, faster, fast, medium,
			  * slow, slower, veryslow, placebo. 
			  Note: these are x264 encoding-speed presets; setting this parameter can reduce encoding latency.
			  */
			av_opt_set(m_pCodecCtx_v->priv_data,"preset","superfast",0);
#endif

		}

		if (avcodec_open2(m_pCodecCtx_v, m_pCodec_v, &param) < 0)
		{
			ATLTRACE("Failed to open output video encoder! (编码器打开失败!)\n");
			return false;
		}
        m_pOutFormatCtx->max_interleave_delta = 2;
		//Add a new stream to output,should be called by the user before avformat_write_header() for muxing
		m_VideoSt = avformat_new_stream(m_pOutFormatCtx, m_pCodec_v);
		if (m_VideoSt == NULL)
		{
			return false;
		}
		m_VideoSt->time_base.num = 1;
		m_VideoSt->time_base.den = m_framerate;
		m_VideoSt->codec = m_pCodecCtx_v;
		//Initialize the buffer to store YUV frames to be encoded.
		m_pFrameYUV = av_frame_alloc();
		m_pOutBuffer = (uint8_t *)av_malloc(avpicture_get_size(AV_PIX_FMT_YUV420P, m_pCodecCtx_v->width, m_pCodecCtx_v->height));
		avpicture_fill((AVPicture *)m_pFrameYUV, m_pOutBuffer, AV_PIX_FMT_YUV420P, m_pCodecCtx_v->width, m_pCodecCtx_v->height);
	}

	if(m_audioCodecId != 0)
	{
		//output audio encoder initialize
		m_pCodec_a = avcodec_find_encoder(m_audioCodecId);
		if (!m_pCodec_a)
		{
			ATLTRACE("Can not find output audio encoder! (没有找到合适的编码器!)\n");
			return false;
		}
		m_pCodecCtx_a = avcodec_alloc_context3(m_pCodec_a);
		m_pCodecCtx_a->channels = m_channels;
		m_pCodecCtx_a->channel_layout = av_get_default_channel_layout(m_channels);
		m_pCodecCtx_a->sample_rate = m_samplerate;
		m_pCodecCtx_a->sample_fmt = m_pCodec_a->sample_fmts[0];
		m_pCodecCtx_a->bit_rate = m_audio_bitrate;
		m_pCodecCtx_a->time_base.num = 1;
		m_pCodecCtx_a->time_base.den = m_pCodecCtx_a->sample_rate;

		if(m_audioCodecId == AV_CODEC_ID_AAC)
		{
			/** Allow the use of the experimental AAC encoder */
			m_pCodecCtx_a->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
		}

		/* Some formats want stream headers to be separate. */
		if (m_pOutFormatCtx->oformat->flags & AVFMT_GLOBALHEADER)
			m_pCodecCtx_a->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;

		if (avcodec_open2(m_pCodecCtx_a, m_pCodec_a, NULL) < 0)
		{
			ATLTRACE("Failed to open ouput audio encoder! (编码器打开失败!)\n");
			return false;
		}

		//Add a new stream to output,should be called by the user before avformat_write_header() for muxing
		m_AudioSt = avformat_new_stream(m_pOutFormatCtx, m_pCodec_a);
		if (m_AudioSt == NULL)
		{
			return false;
		}
		m_AudioSt->time_base.num = 1;
		m_AudioSt->time_base.den = m_pCodecCtx_a->sample_rate;
		m_AudioSt->codec = m_pCodecCtx_a;

		//Initialize the FIFO buffer to store audio samples to be encoded. 

		m_pFifo = av_audio_fifo_alloc(m_pCodecCtx_a->sample_fmt, m_pCodecCtx_a->channels, 1);

		//Initialize the buffer to store converted samples to be encoded.
		m_pConvertedInputSamples = NULL;
		/**
		* Allocate as many pointers as there are audio channels.
		* Each pointer will later point to the audio samples of the corresponding
		* channels (although it may be NULL for interleaved formats).
		*/
		if (!(m_pConvertedInputSamples = (uint8_t**)calloc(m_pCodecCtx_a->channels, sizeof(*m_pConvertedInputSamples)))) 
		{
			ATLTRACE("Could not allocate converted input sample pointers\n");
			return false;
		}
		m_pConvertedInputSamples[0] = NULL;
	}

	//Open output URL,set before avformat_write_header() for muxing
	if (avio_open(&m_pOutFormatCtx->pb, lpStrOutPutPath, AVIO_FLAG_WRITE) < 0)
	{
		ATLTRACE("Failed to open output file! (输出文件打开失败!)\n");
		return false;
	}

	// add a little metadata	
	av_dict_set(&(m_pOutFormatCtx->metadata), "title", "FKDualRec4", 0);
	// av_dict_set(&(m_pOutFormatCtx->metadata), "tags", "FKDualRec", 0);
	char l_buf[256];
	sprintf(l_buf,"timestamp=%lld",(long long)time(NULL));
	av_dict_set(&(m_pOutFormatCtx->metadata), "comment", l_buf, 0);


	//Show some Information
	av_dump_format(m_pOutFormatCtx, 0, lpStrOutPutPath, 1);

	AVDictionary* options = NULL;    
	// av_dict_set(&options, "movflags", "rtphint+faststart", 0);     
	av_dict_set(&options, "movflags", "faststart", 0);     

	//Write File Header
	avformat_write_header(m_pOutFormatCtx, &options);

	m_vid_framecnt = 0;
	m_aud_framecnt = 0;
    m_nb_samples = 0;
    m_nLastAudioPresentationTime = 0;
    m_next_vid_time = 0;
	m_next_aud_time = 0;
    m_first_vid_time1 = m_first_vid_time2 = -1;
	m_first_aud_time = -1;

    return 0;
}

Part 4: Opening the output

The output is created with avformat_alloc_output_context2 according to the mode passed to CreateKernel. For recording you pass a file path; for streaming you pass the push address. The demo currently pushes in FFmpeg's rtp_mpegts format, so only the output format string needs to change, which is very convenient. Of course you could also packetize and send UDP yourself according to the protocol; I plan to look into other push formats later.

//create the output format context
        if(m_pOutFormatCtx!=NULL)break;
        if(eMode == E_PUSL_MODE_RTMP)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,"rtmp",NULL); 
        else if(eMode == E_PUSL_MODE_RTP_MPEGTS)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,"rtp_mpegts",NULL);
        else if(eMode == E_PUSL_MODE_FILE)
            nResult = avformat_alloc_output_context2(&m_pOutFormatCtx,NULL,NULL,NULL);

Part 5: Capturing and encoding the data

With the input devices open, two threads are started to read and encode data: one reads camera frames, the other captures audio (a minimal sketch of the thread start-up is shown below). Reading the data is no different from reading a video file: find the video and audio stream indices, get the corresponding AVStream objects, call av_read_frame in a loop, hand the data to the encoder, and stamp the encoded packets with timestamps for playback synchronization. The encoders are created in OpenOutputStream(), shown above. Next, reading the input data and encoding it:
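
Start() itself is not listed here; a minimal sketch of the thread start-up (the member names m_hVideoThread/m_hAudioThread and the use of _beginthreadex from <process.h> are assumptions, not the project's exact code) could look like this:

// Each worker thread simply loops on av_read_frame + encode until m_bExitThread is set
static unsigned __stdcall VideoThreadProc(void* pParam)
{
    ((CameraInPutStream*)pParam)->ReadVideoPackets();
    return 0;
}
static unsigned __stdcall AudioThreadProc(void* pParam)
{
    ((CameraInPutStream*)pParam)->ReadAudioPackets();
    return 0;
}

int CameraInPutStream::Start()
{
    m_bExitThread  = false;
    m_nStartTime   = av_gettime();   // reference point for the pts_time/now_time pacing in write_video_frame
    m_hVideoThread = (HANDLE)_beginthreadex(NULL, 0, VideoThreadProc, this, 0, NULL);
    m_hAudioThread = (HANDLE)_beginthreadex(NULL, 0, AudioThreadProc, this, 0, NULL);
    return (m_hVideoThread && m_hAudioThread) ? 0 : -1;
}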

We use H.264 for video and AAC for audio. Every encoded frame carries a DTS and a PTS, which give its decode time and presentation time. The DTS is stamped automatically by the encoder, while the PTS has to be written by us. I won't go into DTS/PTS theory here; the short version is that B-frames are the reason PTS and DTS can differ. For the PTS, we simply take each frame's capture timestamp and rescale it into the stream's time base before writing, and the DTS is filled in automatically by the encoder after encoding.
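
As a concrete illustration of rescaling a capture timestamp into the stream time base (this matches the commented-out variant inside write_video_frame below; the code actually in use derives the pts from a frame counter and the nominal frame duration instead):

// lTimeStamp: capture time in microseconds, i.e. time base {1, AV_TIME_BASE}
AVRational time_base_q = { 1, AV_TIME_BASE };
enc_pkt.pts = av_rescale_q(lTimeStamp, time_base_q, m_VideoSt->time_base);
// e.g. lTimeStamp = 400000 (0.4 s) with an output time_base of {1, 90000}  ->  pts = 36000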

int CameraInPutStream::ReadVideoPackets()
{
	if(m_VideoPacket == NULL)
	{
		// prepare before decode and encode
		m_VideoPacket = (AVPacket *)av_malloc(sizeof(AVPacket));
	}

	int encode_video = 1;
	int ret;

	//start decode and encode

	while (encode_video)
	{
		if (m_bExitThread)
			break;

		AVFrame * pframe = NULL;
		if ((ret = av_read_frame(m_pVideoInFormatCtx, m_VideoPacket)) >= 0)
		{
			pframe = av_frame_alloc();
			if (!pframe) 
			{
				ret = AVERROR(ENOMEM);
				return ret;
			}
			int dec_got_frame = 0;
			ret = avcodec_decode_video2(m_pVideoInFormatCtx->streams[m_VideoPacket->stream_index]->codec, pframe, &dec_got_frame, m_VideoPacket);
			if (ret < 0) 
			{
				av_frame_free(&pframe);
				av_log(NULL, AV_LOG_ERROR, "Decoding failed\n");
				break;
			}
			if (dec_got_frame)
			{
                
				/*
				With USB 2.0's 480 Mbit/s bandwidth at roughly 60%~80% utilisation, about 288~384 Mbit/s is usable.
				One YUV frame is 640*480*2*8 = 4,915,200 bits, i.e. about 4.68 Mbit.
				4.68 / band = 0.012~0.016 s, so the video frame was captured roughly 0.014 s before we handle it here.
				*/
                EnterCriticalSection(&cs_lock);

				int64_t l_timestamp;
				GetCurrentFrameTimeStamp(l_timestamp);

                write_video_frame(m_pVideoInFormatCtx->streams[m_VideoPacket->stream_index], m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec->pix_fmt, pframe, l_timestamp);
                LeaveCriticalSection(&cs_lock);


				av_frame_free(&pframe);
			}
			else 
			{
				av_frame_free(&pframe);
			}

			av_free_packet(m_VideoPacket);
		}
		else
		{
			if (ret == AVERROR_EOF)
				encode_video = 0;
			else
			{
				ATLTRACE("Could not read video frame\n");
				break;
			}
		}
	}

	return 0;
}

int CameraInPutStream::write_video_frame(AVStream * input_st, enum AVPixelFormat pix_fmt, AVFrame *pframe, INT64 lTimeStamp)
{
    if(m_VideoSt == NULL)
    return -1;

    //ATLTRACE("Video timestamp: %ld \n", lTimeStamp);

    if(m_first_vid_time1 == -1)
    {
        TRACE("First Video timestamp: %ld \n", lTimeStamp);
        m_first_vid_time1 = lTimeStamp;
    }

    AVRational time_base_q = { 1, AV_TIME_BASE };

    if(img_convert_ctx == NULL)
    {
        //camera data may have a pix fmt of RGB or something else, convert it to YUV420
        img_convert_ctx = sws_getContext(m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec->width, m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec->height,
            m_pVideoInFormatCtx->streams[m_nInVideoIndex]->codec->pix_fmt, m_pCodecCtx_v->width, m_pCodecCtx_v->height, AV_PIX_FMT_YUV420P, SWS_BICUBIC, NULL, NULL, NULL);
    }

    sws_scale(img_convert_ctx, (const uint8_t* const*)pframe->data, pframe->linesize, 0, m_pCodecCtx_v->height, m_pFrameYUV->data, m_pFrameYUV->linesize);

    m_pFrameYUV->width = m_pCodecCtx_v->width;
    m_pFrameYUV->height = m_pCodecCtx_v->height;    
    m_pFrameYUV->format = AV_PIX_FMT_YUV420P;

    enc_pkt.data = NULL;
    enc_pkt.size = 0;
    av_init_packet(&enc_pkt);

    int ret;
    int enc_got_frame = 0;
    ret = avcodec_encode_video2(m_pCodecCtx_v, &enc_pkt, m_pFrameYUV, &enc_got_frame);

    if (enc_got_frame == 1)
    {
        //printf("Succeed to encode frame: %5d\tsize:%5d\n", framecnt, enc_pkt.size);

        if(m_first_vid_time2 == -1)
        {
            m_first_vid_time2 = lTimeStamp;
        }

        enc_pkt.stream_index = m_VideoSt->index;

        m_Loger->InfoMethodLog("write video: ",1,"first_time1 = %d,first_time2=%d,stream_index=%d",m_first_vid_time1,m_first_vid_time2,enc_pkt.stream_index);
#if 0
        //Write PTS
        AVRational time_base = m_VideoSt->time_base;//{ 1, 1000 };
        AVRational r_framerate1 = input_st->r_frame_rate;//{ 50, 2 }; 
        //Duration between 2 frames (us)
        // int64_t calc_duration = (double)(AV_TIME_BASE)*(1 / av_q2d(r_framerate1));	//内部时间戳
        int64_t calc_pts = (double)m_vid_framecnt * (AV_TIME_BASE)*(1 / av_q2d(r_framerate1));

        //Parameters
        enc_pkt.pts = av_rescale_q(calc_pts, time_base_q, time_base);  //enc_pkt.pts = (double)(framecnt*calc_duration)*(double)(av_q2d(time_base_q)) / (double)(av_q2d(time_base));
        enc_pkt.dts = enc_pkt.pts;
        //enc_pkt.duration = av_rescale_q(calc_duration, time_base_q, time_base); //(double)(calc_duration)*(double)(av_q2d(time_base_q)) / (double)(av_q2d(time_base));
        //enc_pkt.pos = -1;
#else

        //enc_pkt.pts= av_rescale_q(lTimeStamp, time_base_q, m_VideoSt->time_base);
        
        //20200221//enc_pkt.pts = (INT64)m_VideoSt->time_base.den * lTimeStamp/AV_TIME_BASE;

#endif

        m_vid_framecnt++;
        AVRational time_base = m_VideoSt->time_base;//{ 1, 1000 };	// output stream time base
        AVRational r_framerate1 = input_st->r_frame_rate;// { 50, 2 };				
        AVRational time_base_q = { 1, AV_TIME_BASE };				
        //Duration between 2 frames (us)				
        int64_t calc_duration = (double)(AV_TIME_BASE)*(1 / av_q2d(r_framerate1));	// internal timestamp in AV_TIME_BASE units				
        //Parameters				
        //enc_pkt.pts = (double)(framecnt*calc_duration)*(double)(av_q2d(time_base_q)) / (double)(av_q2d(time_base));				
        enc_pkt.pts = av_rescale_q(m_vid_framecnt*calc_duration, time_base_q, time_base);				
        enc_pkt.dts = enc_pkt.pts;				
        enc_pkt.duration = av_rescale_q(calc_duration, time_base_q, time_base); 
        //(double)(calc_duration)*(double)(av_q2d(time_base_q)) / (double)(av_q2d(time_base));				
        enc_pkt.pos = -1;								
        //Delay				
        int64_t pts_time = av_rescale_q(enc_pkt.dts, time_base, time_base_q);				
        int64_t now_time = av_gettime() - m_nStartTime;	

        m_Loger->InfoMethodLog("write video: ",1,"pts = %d,dts=%d,duration=%d,pts_time=%d,now_time=%d",enc_pkt.pts,enc_pkt.dts,enc_pkt.duration,pts_time,now_time);
        
        if (pts_time > now_time)
        {
            m_Loger->InfoMethodLog("av_usleep: ",1,"%d",pts_time - now_time);
            av_usleep(pts_time - now_time);
        }
             				
        ret = av_interleaved_write_frame(m_pOutFormatCtx, &enc_pkt);
        if(ret < 0)	
        {
            char tmpErrString[128] = {0};
            ATLTRACE("Could not write video frame, error: %s\n", av_make_error_string(tmpErrString, AV_ERROR_MAX_STRING_SIZE, ret));
            av_packet_unref(&enc_pkt);
            return ret;
        }

        av_free_packet(&enc_pkt);
    }
    else if(ret == 0)
    {
        ATLTRACE("Buffer video frame, timestamp: %I64d.\n", lTimeStamp); //编码器缓冲帧
    }

    return 0;
}

Reading the audio data:

int CameraInPutStream::ReadAudioPackets()
{
	//audio trancoding here
    int ret;

	int encode_audio = 1;
    int dec_got_frame_a = 0;

	//start decode and encode
	while (encode_audio)
	{
		if (m_bExitThread)
		   break;

		AVFrame *input_frame = av_frame_alloc();
		if (!input_frame)
		{
			ret = AVERROR(ENOMEM);
			return ret;
		}			

		/** Decode one frame worth of audio samples. */
		/** Packet used for temporary storage. */
		AVPacket input_packet;
		av_init_packet(&input_packet);
		input_packet.data = NULL;
		input_packet.size = 0;

		
		int64_t l_timestamp;
		if(!GetCurrentFrameTimeStamp(l_timestamp))
		{
			//Sleep(100);
			//continue;
		}		

		/** Read one audio frame from the input file into a temporary packet. */
		if ((ret = av_read_frame(m_pAudioInFormatCtx, &input_packet)) < 0) 
		{
			/** If we are at the end of the file, flush the decoder below. */
			if (ret == AVERROR_EOF)
			{
				encode_audio = 0;
			}
			else
			{
				ATLTRACE("Could not read audio frame\n");
				return ret;
			}					
		}

		/**
		* Decode the audio frame stored in the temporary packet.
		* The input audio stream decoder is used to do this.
		* If we are at the end of the file, pass an empty packet to the decoder
		* to flush it.
		*/
		if ((ret = avcodec_decode_audio4(m_pAudioInFormatCtx->streams[m_nInAudioIndex]->codec, input_frame, &dec_got_frame_a, &input_packet)) < 0)
		{
			ATLTRACE("Could not decode audio frame\n");
			return ret;
		}
		av_packet_unref(&input_packet);
		/** If there is decoded data, convert and store it */
		if (dec_got_frame_a) 
		{
            LeaveCriticalSection(&cs_lock);

            write_audio_frame(m_pAudioInFormatCtx->streams[m_nInAudioIndex], input_frame, l_timestamp);

            DeleteCriticalSection(&cs_lock);
				//m_pAudioCBFunc(m_pAudioContext,m_pAudFmtCtx->streams[m_audioindex], input_frame, l_timestamp);
	
		}

		av_frame_free(&input_frame);
		

	}//while

	return 0;
}

int CameraInPutStream::write_audio_frame(AVStream *input_st, AVFrame *input_frame, INT64 lTimeStamp)
{
	if(m_AudioSt == NULL)
		return -1;

	if(m_first_aud_time == -1)
	{
		TRACE("First Audio timestamp: %ld \n", lTimeStamp);
		m_first_aud_time = lTimeStamp;
	}

	const int output_frame_size = m_pCodecCtx_a->frame_size;

	AVRational time_base_q = { 1, AV_TIME_BASE };
	int ret;

	//if((INT64)(av_audio_fifo_size(m_pFifo) + input_frame->nb_samples) * AV_TIME_BASE /(INT64)(input_st->codec->sample_rate) - lTimeStamp > AV_TIME_BASE/10)
	//{
	//	TRACE("audio data is overflow \n");
	//	return 0;
	//}

	int nFifoSamples = av_audio_fifo_size(m_pFifo);
	INT64 timeshift = (INT64)nFifoSamples * AV_TIME_BASE /(INT64)(input_st->codec->sample_rate); // the FIFO still holds samples that were not read out last time, so the timestamp of the first audio packet taken from the FIFO equals the current time minus the duration of that buffered part


	TRACE("audio time diff: %I64d \n", lTimeStamp - timeshift - m_nLastAudioPresentationTime); // in theory this difference stays at a stable level; if it keeps growing (observed on some capture devices) audio and video drift out of sync, and the exact cause is unclear
    m_aud_framecnt += input_frame->nb_samples;


	if(aud_convert_ctx == NULL)
	{
		// Initialize the resampler to be able to convert audio sample formats
		aud_convert_ctx = swr_alloc_set_opts(NULL,
			av_get_default_channel_layout(m_pCodecCtx_a->channels),
			m_pCodecCtx_a->sample_fmt,
			m_pCodecCtx_a->sample_rate,
			av_get_default_channel_layout(input_st->codec->channels),
			input_st->codec->sample_fmt,
			input_st->codec->sample_rate,
			0, NULL);

		/**
		* Perform a sanity check so that the number of converted samples is
		* not greater than the number of samples to be converted.
		* If the sample rates differ, this case has to be handled differently
		*/
		ATLASSERT(m_pCodecCtx_a->sample_rate == input_st->codec->sample_rate);

		swr_init(aud_convert_ctx);
	}

	/**
	* Allocate memory for the samples of all channels in one consecutive
	* block for convenience.
	*/

	if ((ret = av_samples_alloc(m_pConvertedInputSamples, NULL, m_pCodecCtx_a->channels, input_frame->nb_samples, m_pCodecCtx_a->sample_fmt, 0)) < 0)
	{
		ATLTRACE("Could not allocate converted input samples\n");
		// nothing was allocated on failure, so there is nothing to free here
		return ret;
	}
	

	/**
	* Convert the input samples to the desired output sample format.
	* This requires a temporary storage provided by converted_input_samples.
	*/
	/** Convert the samples using the resampler. */
	if ((ret = swr_convert(aud_convert_ctx,
		m_pConvertedInputSamples, input_frame->nb_samples,
		(const uint8_t**)input_frame->extended_data, input_frame->nb_samples)) < 0)
	{
		ATLTRACE("Could not convert input samples\n");
		return ret;
	}

	/** Add the converted input samples to the FIFO buffer for later processing. */
	/**
	* Make the FIFO as large as it needs to be to hold both,
	* the old and the new samples.
	*/
	if ((ret = av_audio_fifo_realloc(m_pFifo, av_audio_fifo_size(m_pFifo) + input_frame->nb_samples)) < 0)
	{
		ATLTRACE("Could not reallocate FIFO\n");
		return ret;
	}

	/** Store the new samples in the FIFO buffer (av_audio_fifo_write copies the data). */
	if (av_audio_fifo_write(m_pFifo, (void **)m_pConvertedInputSamples, input_frame->nb_samples) < input_frame->nb_samples) 
	{
		ATLTRACE("Could not write data to FIFO\n");
		av_freep(&m_pConvertedInputSamples[0]);
		return AVERROR_EXIT;
	}
	av_freep(&m_pConvertedInputSamples[0]);  // free the per-frame buffer from av_samples_alloc, otherwise it leaks on every call


	INT64 timeinc = (INT64)m_pCodecCtx_a->frame_size * AV_TIME_BASE /(INT64)(input_st->codec->sample_rate);
    
    // the current frame's timestamp must not be smaller than the previous frame's
	if(lTimeStamp - timeshift > m_nLastAudioPresentationTime )
	{
		m_nLastAudioPresentationTime = lTimeStamp - timeshift; 
	}
	
	while (av_audio_fifo_size(m_pFifo) >= output_frame_size)
		/**
		* Take one frame worth of audio samples from the FIFO buffer,
		* encode it and write it to the output file.
		*/
	{
		/** Temporary storage of the output samples of the frame written to the file. */
		AVFrame *output_frame = av_frame_alloc();
		if (!output_frame)
		{
			ret = AVERROR(ENOMEM);
			return ret;
		}
		/**
		* Use the maximum number of possible samples per frame.
		* If there is less than the maximum possible frame size in the FIFO
		* buffer use this number. Otherwise, use the maximum possible frame size
		*/
		const int frame_size = FFMIN(av_audio_fifo_size(m_pFifo), m_pCodecCtx_a->frame_size);


		/** Initialize temporary storage for one output frame. */
		/**
		* Set the frame's parameters, especially its size and format.
		* av_frame_get_buffer needs this to allocate memory for the
		* audio samples of the frame.
		* Default channel layouts based on the number of channels
		* are assumed for simplicity.
		*/
		output_frame->nb_samples = frame_size;
		output_frame->channel_layout = m_pCodecCtx_a->channel_layout;
		output_frame->format = m_pCodecCtx_a->sample_fmt;
		output_frame->sample_rate = m_pCodecCtx_a->sample_rate;

		/**
		* Allocate the samples of the created frame. This call will make
		* sure that the audio frame can hold as many samples as specified.
		*/
		if ((ret = av_frame_get_buffer(output_frame, 0)) < 0) 
		{
			ATLTRACE("Could not allocate output frame samples\n");
			av_frame_free(&output_frame);
			return ret;
		}

		/**
		* Read as many samples from the FIFO buffer as required to fill the frame.
		* The samples are stored in the frame temporarily.
		*/
		if (av_audio_fifo_read(m_pFifo, (void **)output_frame->data, frame_size) < frame_size) 
		{
			ATLTRACE("Could not read data from FIFO\n");
			return AVERROR_EXIT;
		}

		/** Encode one frame worth of audio samples. */
		/** Packet used for temporary storage. */
		AVPacket output_packet;
		av_init_packet(&output_packet);
		output_packet.data = NULL;
		output_packet.size = 0;

		int enc_got_frame_a = 0;

		/**
		* Encode the audio frame and store it in the temporary packet.
		* The output audio stream encoder is used to do this.
		*/
		if ((ret = avcodec_encode_audio2(m_pCodecCtx_a, &output_packet, output_frame, &enc_got_frame_a)) < 0) 
		{
			ATLTRACE("Could not encode frame\n");
			av_packet_unref(&output_packet);
			return ret;
		}


		/** Write one audio frame from the temporary packet to the output file. */
		if (enc_got_frame_a)
		{
			//output_packet.flags |= AV_PKT_FLAG_KEY;
			output_packet.stream_index = m_AudioSt->index;

#if 0
			AVRational r_framerate1 = { input_st->codec->sample_rate, 1 };// { 44100, 1};
			//int64_t calc_duration = (double)(AV_TIME_BASE)*(1 / av_q2d(r_framerate1));  //内部时间戳
			int64_t calc_pts = (double)m_nb_samples * (AV_TIME_BASE)*(1 / av_q2d(r_framerate1));

			output_packet.pts = av_rescale_q(calc_pts, time_base_q, m_AudioSt->time_base);
			//output_packet.dts = output_packet.pts;
			//output_packet.duration = output_frame->nb_samples;
#else
			output_packet.pts = av_rescale_q(m_nLastAudioPresentationTime, time_base_q, m_AudioSt->time_base);

#endif

			//ATLTRACE("audio pts : %ld\n", output_packet.pts);

			//int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
			//int64_t now_time = av_gettime() - start_time;
			//if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time)<vid_next_pts))
			//	av_usleep(pts_time - now_time);

			if ((ret = av_interleaved_write_frame(m_pOutFormatCtx, &output_packet)) < 0) 
			{
				char tmpErrString[128] = {0};
				ATLTRACE("Could not write audio frame, error: %s\n", av_make_error_string(tmpErrString, AV_ERROR_MAX_STRING_SIZE, ret));
				av_packet_unref(&output_packet);
				return ret;
			}

			av_packet_unref(&output_packet);
		}//if (enc_got_frame_a)


		m_nb_samples += output_frame->nb_samples;

		m_nLastAudioPresentationTime += timeinc;

		av_frame_free(&output_frame);		
	}//while


	return 0;
}

Pushing a media file is implemented in a separate file, FileInPutStream.cpp; the approach and flow are much the same, so I won't repeat them here. Now let's look at the result:

I tested it directly with FFmpeg's ffplay; you can also set up a streaming server and simply change the push address.

Test command: ffplay.exe rtp://127.0.0.1:9999

[Screenshot: ffplay playing the pushed rtp_mpegts stream]