Basic workflow
- Environment setup
  - Client environment
    - Recording module
      pip install pyaudio
  - Server environment
    - flask
- Client
  - Recording module
    - Hardware
      - Recognition quality depends heavily on the microphone
    - Buffer
  - Sending data
    - The buffered audio data
    - Buffer optimization
  - Receiving data
    - The recognition result
- Server
  - Receive the buffered audio data
  - Call the recognition interface
    - Pass in the buffered data
    - Get back the recognized text
  - Send the recognized text back to the client (an end-to-end sketch of this flow follows the outline)
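A minimal end-to-end sketch of the flow above, not the final client: it assumes the Flask server described later is running locally on port 5000 and accepts raw WAV bytes POSTed to "/". SERVER_URL, record_buffer and the two-second buffer length are illustrative choices only.

import io
import wave

import pyaudio
import requests

CHUNK = 1024
CHANNELS = 1
SAMPLE_RATE = 16000
SERVER_URL = "http://127.0.0.1:5000/"   # hypothetical endpoint

def record_buffer(seconds=2):
    # Record a short buffer from the microphone and return it as WAV bytes
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=CHANNELS,
                    rate=SAMPLE_RATE, input=True, frames_per_buffer=CHUNK)
    frames = [stream.read(CHUNK) for _ in range(int(SAMPLE_RATE / CHUNK * seconds))]
    stream.stop_stream()
    stream.close()
    p.terminate()
    buf = io.BytesIO()
    wf = wave.open(buf, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    return buf.getvalue()

if __name__ == "__main__":
    wav_bytes = record_buffer()
    # Send the buffered audio to the server and print the recognized text
    resp = requests.post(SERVER_URL, data=wav_bytes)
    print(resp.text)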
Environment setup
Install the related packages:
pip install pygame SpeechRecognition playsound librosa
Server side
Project layout
Initial directory structure
├── cache        # audio cache (buffer)
│   └── temp.wav
├── client
│   ├── client.py
│   └── __init__.py
├── decoder      # decoder
│   ├── create_data_list.sh
│   ├── datalist # where the data lists are generated
│   ├── recognize.py
│   └── wenet -> /home/asr/data/wenet/wenet
├── model        # where the models are stored
│   ├── 20210815_unified_conformer_exp
│   │   ├── final.pt
│   │   ├── global_cmvn
│   │   ├── train.yaml
│   │   └── words.txt
└── server
    ├── __init__.py
    └── server.py
Directory structure after a recognition test
├── cache
│   ├── temp1.wav
│   ├── temp2.wav
│   └── temp.wav
├── client
│   ├── client.py
│   └── __init__.py
├── decoder
│   ├── create_data_list.sh
│   ├── datalist
│   │   ├── temp
│   │   ├── temp1
│   │   └── temp2
│   ├── recognize.py
│   └── wenet -> /home/asr/data/wenet/wenet
├── model
│   ├── 20210618_u2pp_conformer_exp.tar.gz
│   ├── 20210815_unified_conformer_exp
│   │   ├── final.pt
│   │   ├── global_cmvn
│   │   ├── train.yaml
│   │   └── words.txt
│   └── 20210815_unified_conformer_exp.tar.gz
└── server
    ├── __init__.py
    └── server.py
Server functional modules [to be completed]
Server environment setup
pip install flask
from flask import Flask

app = Flask(__name__)
# Load the model here and pass in its parameters

@app.route("/")
def getdata():
    # Call recognition: write the received audio into the cache
    save_wav(data, save_path)

if __name__ == "__main__":
    app.run()
Receiving data
# Save the microphone data to a wav file
def save_wav(frames, save_path):
    wf = wave.open(save_path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    print('\033[93m' + "written to buffer" + '\033[0m')
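One way the receiving route might be wired up, shown only as a sketch: it assumes the client POSTs the raw audio bytes to "/" and reuses the save_wav function above; CACHE_PATH and the constants are illustrative.

from flask import Flask, request
import wave

app = Flask(__name__)
CHANNELS = 1
SAMPLE_RATE = 16000
CACHE_PATH = "../cache/temp.wav"   # hypothetical cache location

@app.route("/", methods=["POST"])
def getdata():
    # request.data holds the raw audio bytes sent by the client
    save_wav([request.data], CACHE_PATH)   # save_wav as defined above
    return "ok"

if __name__ == "__main__":
    app.run()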
Returning the recognition result to the client
Load the model when the service starts,
and call the recognition interface whenever data comes in.
from flask import Flask
from recognize import recognize

app = Flask(__name__)
# Load the model once at startup and pass in its parameters
model = recognize()

@app.route("/")
def run_recognize():
    # Call recognition
    result = model.get_recognize()
    return result

if __name__ == "__main__":
    app.run()
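A quick way to exercise this route from the client side, assuming the app above is running locally on Flask's default port 5000:

import requests

result = requests.get("http://127.0.0.1:5000/").text
print(result)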
Building the recognition engine [completed]
Generating the data_list
wenet needs this file for recognition and reads its input from it; each line must have the following format:
{"key":"temp","wav":"/home/sunao/data/StreamAIzimu/cache/temp.wav","txt":""}
#!/usr/bin/bash
root=..
data=${root}/cache/temp.wav
echo "{\"key\":\"temp\",\"wav\":\"${data}\",\"txt\":\"\"}" > online_data.list
Modify wenet's recognition script recognize.py so that the model is loaded only once, and drop the default way of passing arguments in from a bash script.
recognize.py
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import copy
import logging
import os
import sys
import time
import torch
import yaml
from torch.utils.data import DataLoader
from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols
from wenet.utils.config import override_config
class recognize():
def __init__(self, ):
self.root_path = os.pardir
self.batch_size = 1
self.beam_size = 10
self.bpe_model = None
self.checkpoint = '../model/20210815_unified_conformer_exp/final.pt'
self.config = '../model/20210815_unified_conformer_exp/train.yaml'
self.ctc_weight = 0.5
self.data_type = 'raw'
self.decoding_chunk_size = -1
self.dict = '../model/20210815_unified_conformer_exp/words.txt'
self.gpu = -1
self.mode = 'attention_rescoring'
self.non_lang_syms = None
self.num_decoding_left_chunks = -1
self.override_config = []
self.penalty = 0.0
self.result_file = 'online_text'
self.reverse_weight = 0.0
self.simulate_streaming = False
self.test_data = 'online_data.list'
self.use_cuda = self.gpu >= 0 and torch.cuda.is_available()
self.device = torch.device('cuda' if self.use_cuda else 'cpu')
self.load_configs() # load the configuration
self.test_data_conf()
self.loadmodel() # load the model
def load_configs(self):
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.gpu)
if self.mode in ['ctc_prefix_beam_search', 'attention_rescoring'
] and self.batch_size > 1:
logging.fatal(
'decoding mode {} must be running with batch_size == 1'.format(
self.mode))
sys.exit(1)
with open(self.config, 'r') as fin:
self.configs = yaml.load(fin, Loader=yaml.FullLoader)
if len(self.override_config) > 0:
self.configs = override_config(self.configs, self.override_config)
# Load the dictionary (symbol table)
self.symbol_table = read_symbol_table(self.dict)
def loadmodel(self):
# Init asr model from configs
model = init_asr_model(self.configs)
# Load dict
self.char_dict = {v: k for k, v in self.symbol_table.items()}
self.eos = len(self.char_dict) - 1
load_checkpoint(model, self.checkpoint)
self.model = model.to(self.device)
self.model.eval()
def test_data_conf(self):
'''
Test-data configuration
'''
self.test_conf = copy.deepcopy(self.configs['dataset_conf'])
self.test_conf['filter_conf']['max_length'] = 102400
self.test_conf['filter_conf']['min_length'] = 0
self.test_conf['filter_conf']['token_max_length'] = 102400
self.test_conf['filter_conf']['token_min_length'] = 0
self.test_conf['filter_conf']['max_output_input_ratio'] = 102400
self.test_conf['filter_conf']['min_output_input_ratio'] = 0
self.test_conf['speed_perturb'] = False
self.test_conf['spec_aug'] = False
self.test_conf['shuffle'] = False
self.test_conf['sort'] = False
if 'fbank_conf' in self.test_conf:
self.test_conf['fbank_conf']['dither'] = 0.0
elif 'mfcc_conf' in self.test_conf:
self.test_conf['mfcc_conf']['dither'] = 0.0
self.test_conf['batch_conf']['batch_type'] = "static"
self.test_conf['batch_conf']['batch_size'] = self.batch_size
self.non_lang_syms = read_non_lang_symbols(self.non_lang_syms)
def get_test_data_loader(self):
test_dataset = Dataset(self.data_type,
self.test_data,
self.symbol_table,
self.test_conf,
self.bpe_model,
self.non_lang_syms,
partition=False)
return DataLoader(test_dataset, batch_size=None, num_workers=0)
def get_recognize(self):
test_data_loader = self.get_test_data_loader()
with torch.no_grad():
for batch_idx, batch in enumerate(test_data_loader):
keys, feats, target, feats_lengths, target_lengths = batch
feats = feats.to(self.device)
feats_lengths = feats_lengths.to(self.device)
assert (feats.size(0) == 1)
if self.mode == 'attention':
hyps, _ = self.model.recognize(
feats,
feats_lengths,
beam_size=self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
hyps = [hyp.tolist() for hyp in hyps]
elif self.mode == 'ctc_greedy_search':
hyps, _ = self.model.ctc_greedy_search(
feats,
feats_lengths,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
# ctc_prefix_beam_search and attention_rescoring only return one
# result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode
elif self.mode == 'ctc_prefix_beam_search':
assert (feats.size(0) == 1)
hyp, _ = self.model.ctc_prefix_beam_search(
feats,
feats_lengths,
self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
hyps = [hyp]
elif self.mode == 'attention_rescoring':
assert (feats.size(0) == 1)
hyp, _ = self.model.attention_rescoring(
feats,
feats_lengths,
self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
ctc_weight=self.ctc_weight,
simulate_streaming=self.simulate_streaming,
reverse_weight=self.reverse_weight)
hyps = [hyp]
content = ''
for w in hyps[0]:
if w == self.eos:
break
content += self.char_dict[w]
return content
if __name__ == '__main__':
# Load the model
recog = recognize()
# Receive data in real time
result = recog.get_recognize()
print(result)
# Return the recognition result in real time
Modifying it to recognize a specified audio file
With this change several recognition streams can share one loaded model.
# Copyright (c) 2020 Mobvoi Inc. (authors: Binbin Zhang, Xiaoyu Chen, Di Wu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import copy
import logging
import os
import sys
import time
import torch
import yaml
from torch.utils.data import DataLoader
from wenet.dataset.dataset import Dataset
from wenet.transformer.asr_model import init_asr_model
from wenet.utils.checkpoint import load_checkpoint
from wenet.utils.file_utils import read_symbol_table, read_non_lang_symbols
from wenet.utils.config import override_config
class recognize():
def __init__(self, ):
self.root_path = os.pardir
self.batch_size = 1
self.beam_size = 10
self.bpe_model = None
self.checkpoint = '../model/20210815_unified_conformer_exp/final.pt'
self.config = '../model/20210815_unified_conformer_exp/train.yaml'
self.ctc_weight = 0.5
self.data_type = 'raw'
self.decoding_chunk_size = -1
self.dict = '../model/20210815_unified_conformer_exp/words.txt'
self.gpu = -1
self.mode = 'attention_rescoring'
self.non_lang_syms = None
self.num_decoding_left_chunks = -1
self.override_config = []
self.penalty = 0.0
self.result_file = 'online_text'
self.reverse_weight = 0.0
self.simulate_streaming = False
self.test_data = 'online_data.list'
self.use_cuda = self.gpu >= 0 and torch.cuda.is_available()
self.device = torch.device('cuda' if self.use_cuda else 'cpu')
self.load_configs() # load the configuration
self.test_data_conf()
self.loadmodel() # load the model
def load_configs(self):
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.environ['CUDA_VISIBLE_DEVICES'] = str(self.gpu)
if self.mode in ['ctc_prefix_beam_search', 'attention_rescoring'
] and self.batch_size > 1:
logging.fatal(
'decoding mode {} must be running with batch_size == 1'.format(
self.mode))
sys.exit(1)
with open(self.config, 'r') as fin:
self.configs = yaml.load(fin, Loader=yaml.FullLoader)
if len(self.override_config) > 0:
self.configs = override_config(self.configs, self.override_config)
# Load the dictionary (symbol table)
self.symbol_table = read_symbol_table(self.dict)
def loadmodel(self):
# Init asr model from configs
model = init_asr_model(self.configs)
# Load dict
self.char_dict = {v: k for k, v in self.symbol_table.items()}
self.eos = len(self.char_dict) - 1
load_checkpoint(model, self.checkpoint)
self.model = model.to(self.device)
self.model.eval()
def test_data_conf(self):
'''
Test-data configuration
'''
self.test_conf = copy.deepcopy(self.configs['dataset_conf'])
self.test_conf['filter_conf']['max_length'] = 102400
self.test_conf['filter_conf']['min_length'] = 0
self.test_conf['filter_conf']['token_max_length'] = 102400
self.test_conf['filter_conf']['token_min_length'] = 0
self.test_conf['filter_conf']['max_output_input_ratio'] = 102400
self.test_conf['filter_conf']['min_output_input_ratio'] = 0
self.test_conf['speed_perturb'] = False
self.test_conf['spec_aug'] = False
self.test_conf['shuffle'] = False
self.test_conf['sort'] = False
if 'fbank_conf' in self.test_conf:
self.test_conf['fbank_conf']['dither'] = 0.0
elif 'mfcc_conf' in self.test_conf:
self.test_conf['mfcc_conf']['dither'] = 0.0
self.test_conf['batch_conf']['batch_type'] = "static"
self.test_conf['batch_conf']['batch_size'] = self.batch_size
self.non_lang_syms = read_non_lang_symbols(self.non_lang_syms)
def get_test_data_loader(self,path):
self.test_data=path
test_dataset = Dataset(self.data_type,
self.test_data,
self.symbol_table,
self.test_conf,
self.bpe_model,
self.non_lang_syms,
partition=False)
return DataLoader(test_dataset, batch_size=None, num_workers=0)
def create_data_list(self,path):
file_name = path.split("/")[-1].split(".")[0]
filepath = "./datalist/"+file_name
if not os.path.exists(filepath):
with open(filepath,'w',encoding="utf-8") as file:
file.write('{"key":"%s","wav":"/home/sunao/data/StreamAIzimu/cache/%s.wav","txt":""}'%(file_name,file_name))
return filepath
def get_recognize(self , path):
path = self.create_data_list(path)
test_data_loader = self.get_test_data_loader(path)
with torch.no_grad():
for batch_idx, batch in enumerate(test_data_loader):
keys, feats, target, feats_lengths, target_lengths = batch
feats = feats.to(self.device)
feats_lengths = feats_lengths.to(self.device)
assert (feats.size(0) == 1)
if self.mode == 'attention':
hyps, _ = self.model.recognize(
feats,
feats_lengths,
beam_size=self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
hyps = [hyp.tolist() for hyp in hyps]
elif self.mode == 'ctc_greedy_search':
hyps, _ = self.model.ctc_greedy_search(
feats,
feats_lengths,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
# ctc_prefix_beam_search and attention_rescoring only return one
# result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode
elif self.mode == 'ctc_prefix_beam_search':
assert (feats.size(0) == 1)
hyp, _ = self.model.ctc_prefix_beam_search(
feats,
feats_lengths,
self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
simulate_streaming=self.simulate_streaming)
hyps = [hyp]
elif self.mode == 'attention_rescoring':
assert (feats.size(0) == 1)
hyp, _ = self.model.attention_rescoring(
feats,
feats_lengths,
self.beam_size,
decoding_chunk_size=self.decoding_chunk_size,
num_decoding_left_chunks=self.num_decoding_left_chunks,
ctc_weight=self.ctc_weight,
simulate_streaming=self.simulate_streaming,
reverse_weight=self.reverse_weight)
hyps = [hyp]
content = ''
for w in hyps[0]:
if w == self.eos:
break
content += self.char_dict[w]
return content
if __name__ == '__main__':
# Load the model
recog = recognize()
# Receive data in real time: recognize several cached recordings with the same model
result1 = recog.get_recognize("../cache/temp.wav")
result2 = recog.get_recognize("../cache/temp1.wav")
result3 = recog.get_recognize("../cache/temp2.wav")
print(result1)
print(result2)
print(result3)
# Return the recognition results in real time
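A possible server.py on top of this modified recognizer, shown only as a sketch: it assumes each request names a wav file already written into the cache directory. The /recognize route and the name query parameter are illustrative choices, not part of the code above.

from flask import Flask, request
from recognize import recognize

app = Flask(__name__)
model = recognize()   # load the model once at startup

@app.route("/recognize")
def run_recognize():
    # e.g. GET /recognize?name=temp1  ->  decode ../cache/temp1.wav
    name = request.args.get("name", "temp")
    return model.get_recognize("../cache/%s.wav" % name)

if __name__ == "__main__":
    app.run()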
Client
- First decide whether anyone is speaking, i.e. whether there is audio data
- Store the data in the buffer and feed it to the recognition module
- When recognition returns a result, append it to the subtitles
  - Subtitle length
    - Decide whether this point is the end of a sentence
      - If it is, break the sentence there (VAD)
      - If not, break once the subtitle exceeds 20 characters (see the sketch below)
- Track the duration of silence
  - If the silence lasts too long, then ...
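A small sketch of the segmentation rule above: cut when VAD reports the end of an utterance, otherwise force a cut once the line exceeds 20 characters. MAX_LINE_LEN and the function name are illustrative.

MAX_LINE_LEN = 20

def update_subtitle(current_line, new_text, utterance_ended):
    # Append newly recognized text and decide whether to start a new subtitle line.
    # Returns (line_still_being_displayed, finished_lines).
    finished = []
    line = current_line + new_text
    if utterance_ended:                  # VAD says the sentence is over: break here
        finished.append(line)
        line = ""
    while len(line) > MAX_LINE_LEN:      # no sentence end yet: break every 20 characters
        finished.append(line[:MAX_LINE_LEN])
        line = line[MAX_LINE_LEN:]
    return line, finished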
Recording module
Capture microphone data and save the audio.
Strictly speaking the audio does not need to be saved here; it is only saved to verify that recording works.
import pyaudio
import wave

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
SAMPLE_RATE = 44100  # 44100 is the default and highest fidelity; use 16000 for recognition
RECORD_SECONDS = 4
temp_save_path = "Audio/temp.wav"

p = pyaudio.PyAudio()

# Save the microphone data to a wav file
def save_wav(frames, save_path):
    wf = wave.open(save_path, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(2)
    wf.setframerate(SAMPLE_RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
    print('\033[93m' + "written to buffer" + '\033[0m')

# Read data from the microphone
def recording(save_path):
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    print('\033[93m' + "recording" + '\033[0m')
    # Buffer
    frames = []
    max_size = 16 * 4
    while 1:
        data = stream.read(CHUNK)
        # data = int.from_bytes(data, byteorder='big', signed=False)
        frames.append(data)
        if len(frames) == max_size:
            # Save the buffer
            save_wav(frames, save_path)
            # Clear the buffer
            frames = []
            break
            # Send to the server instead and stop when it replies "退出" (exit), e.g.:
            # result = requests(frames)
            # break
            # if result == "退出":
            #     break

recording(temp_save_path)
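Since the file is only saved to verify that recording works, a quick sanity check such as the following can be run on it (a sketch; the path matches temp_save_path above).

import wave

with wave.open("Audio/temp.wav", 'rb') as wf:
    n_frames = wf.getnframes()
    rate = wf.getframerate()
    print("channels:", wf.getnchannels())
    print("sample rate:", rate)
    print("duration: %.2f s" % (n_frames / float(rate)))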
Simple VAD
Endpoint detection based on short-time energy and short-time zero-crossing-rate thresholds.
vad.py
# -*- coding: utf-8 -*-
import numpy as np
import pyaudio

SUCCESS = 0
FAIL = 1
audio2 = ""
stream2 = ""
# TODO: recording needs a mutual-exclusion switch so it can be paused while certain other features run

def ZCR(curFrame):
    # Short-time zero-crossing rate
    tmp1 = curFrame[:-1]
    tmp2 = curFrame[1:]
    sings = (tmp1 * tmp2 <= 0)
    diffs = (tmp1 - tmp2) > 0.02
    # Count sign changes whose amplitude difference is above a small threshold
    zcr = np.sum(sings * diffs)
    return zcr

def STE(curFrame):
    # Short-time energy
    amp = np.sum(np.abs(curFrame))
    return amp

class Vad(object):
    def __init__(self, CHUNK=1024):
        # Initial high threshold for short-time energy
        self.amp1 = 15
        # Initial low threshold for short-time energy
        self.amp2 = 1
        # Initial high threshold for the zero-crossing rate
        self.zcr1 = 2
        # Initial low threshold for the zero-crossing rate
        self.zcr2 = 1
        # Maximum allowed silence length (longest pause for taking a breath)
        self.maxsilence = 45
        # Minimum speech length (filters out very short, quiet sounds)
        self.minlen = 40
        # Maximum energy, used for normalization
        self.max_en = 20000
        # Initial state is silence
        self.status = 0
        self.count = 0
        self.silence = 0
        self.frame_len = CHUNK
        self.frame_inc = CHUNK / 2
        self.cur_status = 0

    def check_ontime(self, cache_frame):
        # cache_frame is one CHUNK of raw int16 microphone bytes
        wave_data = np.frombuffer(cache_frame, dtype=np.int16)
        wave_data = wave_data * 1.0 / self.max_en  # normalize by max_en (20000)
        data = wave_data[np.arange(0, self.frame_len)]  # take the first frame_len samples
        # Zero-crossing rate of the frame
        zcr = ZCR(data)
        # Short-time energy of the frame, squared to amplify it
        amp = STE(data) ** 2
        # Return the speech/silence status of the current frame
        status = self.speech_status(amp, zcr)
        return status

    def speech_status(self, amp, zcr):
        status = 0
        # 0 = silence, 1 = possible speech start, 2 = inside a speech segment, 3 = speech ended
        if self.cur_status in [0, 1]:  # currently silent or possibly starting
            # Definitely entering a speech segment
            if amp > self.amp1 or zcr > self.zcr1:  # above the high energy / ZCR thresholds
                status = 2
                self.silence = 0
                self.count += 1
            # Possibly speech: energy in the voiced range, ZCR in the unvoiced or voiced range
            elif amp > self.amp2 or zcr > self.zcr2:
                status = 2
                self.count += 1
            # Silence
            else:
                status = 0
                self.count = 0
        # 2 = inside a speech segment
        elif self.cur_status == 2:
            # Still in the speech segment: energy in the voiced range, ZCR in the unvoiced or voiced range
            if amp > self.amp2 or zcr > self.zcr2:
                self.count += 1
                status = 2
            # Speech may be ending
            else:
                # The silence is not yet long enough, so the segment has not ended
                self.silence += 1
                if self.silence < self.maxsilence:
                    self.count += 1
                    status = 2
                # The speech is too short, treat it as noise
                elif self.count < self.minlen:
                    status = 0
                    self.silence = 0
                    self.count = 0
                # Speech has ended
                else:
                    status = 3
                    self.silence = 0
                    self.count = 0
        return status
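A minimal usage sketch for the Vad class: feed it one CHUNK of microphone data at a time and print the returned status (0 = silence, 2 = speech). The loop length is arbitrary.

import pyaudio
from vad import Vad

CHUNK = 1024
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                input=True, frames_per_buffer=CHUNK)
v = Vad(CHUNK)
for _ in range(100):  # roughly the next few seconds of audio
    data = stream.read(CHUNK, exception_on_overflow=False)
    print(v.check_ontime(data), end=" ")
stream.close()
p.terminate()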
Combining the local recording module with the test module
The functionality is simplified for now to get usable recognition during a conversation; since I have not yet found a good way to control console output, the live-subtitle display is still poor.
Buffer-based recognition always misses the tail of the speaker's utterance, so the trailing audio has to be appended in order to recognize the whole utterance.
import time
import pyaudio
import wave
from decoder.recognize import recognize
import numpy as np
from vad import Vad

class RecognizeService():
    def __init__(self):
        self.CHUNK = 1024
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.SAMPLE_RATE = 16000  # 44100 is the default and highest fidelity; use 16000 for recognition
        self.temp_save_path = "../cache/temp1.wav"
        self.p = pyaudio.PyAudio()
        self.model = recognize()
        self.stream = self.p.open(format=self.FORMAT,
                                  channels=self.CHANNELS,
                                  rate=self.SAMPLE_RATE,
                                  input=True,
                                  frames_per_buffer=self.CHUNK,
                                  input_device_index=0)
        self.v = Vad(self.CHUNK)

    # Save the microphone data to a wav file
    def save_wav(self, frames, save_path):
        wf = wave.open(save_path, 'wb')
        wf.setnchannels(self.CHANNELS)
        wf.setsampwidth(self.p.get_sample_size(self.FORMAT))
        wf.setframerate(self.SAMPLE_RATE)
        wf.writeframes(b''.join(frames))
        wf.close()
        # print('\033[93m' + "written to buffer" + '\033[0m')

    # Read microphone data and recognize it
    def recording(self, save_path):
        print('\033[93m' + "recording" + '\033[0m')
        # Buffer
        frames = []
        max_size = 5
        long_frames = []
        next = ""
        num = 0
        is_speak = False
        result = ""
        while True:
            stream_data = self.stream.read(self.CHUNK, exception_on_overflow=False)
            status = self.v.check_ontime(stream_data)
            if status == 2:
                is_speak = True
                # Convert the raw bytes to int16 samples (the volume could be boosted here)
                wave_data = np.frombuffer(stream_data, dtype=np.int16)
                frames.append(wave_data)
                if len(frames) >= max_size:
                    long_frames.extend(frames)
                    if len(long_frames) > max_size * 10:
                        long_frames = long_frames[-max_size * 10:]
                    # Write the buffer to the cache and recognize it
                    self.save_wav(long_frames, self.temp_save_path)
                    result = self.model.get_recognize(self.temp_save_path)
                    # Clear the buffer
                    frames = []
                    if next == result:
                        continue
                    next = result
            if status == 0:
                num += 1
                if num == 10:  # stop recognizing
                    if is_speak:
                        if len(frames) > 0 and len(long_frames) > 0:  # append the trailing audio, if any
                            long_frames.extend(frames)
                            self.save_wav(long_frames, self.temp_save_path)
                            result = self.model.get_recognize(self.temp_save_path)
                        if result != "":
                            print(result)
                    num = 0
                    # Silence: clear the buffers and reset the state
                    long_frames = []
                    frames = []
                    is_speak = False
                    result = ""

if __name__ == '__main__':
    service = RecognizeService()
    service.recording(service.temp_save_path)
Next step: switch to online real-time recognition