{"id":"https://openalex.org/W4300862572","doi":"https://doi.org/10.48550/arxiv.2204.08686","title":"Audio-Visual Wake Word Spotting System For MISP Challenge 2021","display_name":"Audio-Visual Wake Word Spotting System For MISP Challenge 2021","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4300862572","doi":"https://doi.org/10.48550/arxiv.2204.08686"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.08686","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2204.08686","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5112870466","display_name":"Yanguang Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Yanguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100429020","display_name":"Jianwei Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Jianwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100462722","display_name":"Han Yang","orcid":"https://orcid.org/0000-0003-2782-7502"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Han, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041886705","display_name":"Shuaijiang Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Shuaijiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061005945","display_name":"Chaoyang Mei","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mei, Chaoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102944502","display_name":"Tingwei Guo","orcid":"https://orcid.org/0000-0002-9320-543X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Guo, Tingwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043550392","display_name":"Shuran Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Shuran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022202095","display_name":"Chuandong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Chuandong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090002307","display_name":"Wei Zou","orcid":"https://orcid.org/0000-0001-5307-4685"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5081173423","display_name":"Xiangang Li","orcid":"https://orcid.org/0000-0002-7810-1077"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiangang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5043550392","display_name":"Shuran Zhou","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Shuran","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022202095","display_name":"Chuandong Xie","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Chuandong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5090002307","display_name":"Wei Zou","orcid":"https://orcid.org/0000-0001-5307-4685"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zou, Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5081173423","display_name":"Xiangang Li","orcid":"https://orcid.org/0000-0002-7810-1077"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Xiangang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":59},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11309","display_name":"Music and Audio Processing","score":0.9922,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11233","display_name":"Advanced Adaptive Filtering Techniques","score":0.9889,"subfield":{"id":"https://openalex.org/subfields/2206","display_name":"Computational Mechanics"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/spotting","display_name":"Spotting","score":0.8644961},{"id":"https://openalex.org/keywords/keyword-spotting","display_name":"Keyword spotting","score":0.6139506},{"id":"https://openalex.org/keywords/leverage","display_name":"Leverage (statistics)","score":0.5682417},{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness","score":0.56499803},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.46974996}],"concepts":[{"id":"https://openalex.org/C2779506182","wikidata":"https://www.wikidata.org/wiki/Q7580141","display_name":"Spotting","level":2,"score":0.8644961},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8016287},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.6448952},{"id":"https://openalex.org/C2781213101","wikidata":"https://www.wikidata.org/wiki/Q6398558","display_name":"Keyword spotting","level":2,"score":0.6139506},{"id":"https://openalex.org/C2778806681","wikidata":"https://www.wikidata.org/wiki/Q907293","display_name":"Microphone array","level":4,"score":0.56825346},{"id":"https://openalex.org/C153083717","wikidata":"https://www.wikidata.org/wiki/Q6535263","display_name":"Leverage (statistics)","level":2,"score":0.5682417},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.56499803},{"id":"https://openalex.org/C54197355","wikidata":"https://www.wikidata.org/wiki/Q5782992","display_name":"Beamforming","level":2,"score":0.47157827},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.46974996},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.45742217},{"id":"https://openalex.org/C2778263558","wikidata":"https://www.wikidata.org/wiki/Q46384","display_name":"Microphone","level":3,"score":0.4391612},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.14165416},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.13973284},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C68115822","wikidata":"https://www.wikidata.org/wiki/Q1068172","display_name":"Sound pressure","level":2,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.08686","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2204.08686","pdf_url":"http://arxiv.org/pdf/2204.08686","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2204.08686","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2204.08686","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.43,"display_name":"Life on land","id":"https://metadata.un.org/sdg/15"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4286904253","https://openalex.org/W3206647229","https://openalex.org/W3119978414","https://openalex.org/W2918559346","https://openalex.org/W2545741539","https://openalex.org/W2516975559","https://openalex.org/W2117995638","https://openalex.org/W2114097550","https://openalex.org/W2000885660","https://openalex.org/W1969408022"],"abstract_inverted_index":{"This":[0],"paper":[1],"presents":[2],"the":[3,10,37,46,67,86,89,102,141,145],"details":[4],"of":[5,13,24,40,53,92],"our":[6,119,159],"system":[7],"designed":[8],"for":[9,127],"Task":[11,25],"1":[12,26],"Multimodal":[14],"Information":[15],"Based":[16],"Speech":[17],"Processing":[18],"(MISP)":[19],"Challenge":[20],"2021.":[21],"The":[22,134],"purpose":[23],"is":[27,95,105,137],"to":[28,35,65,78,97,107,139,157],"leverage":[29],"both":[30],"audio":[31,109],"and":[32,60,110,113,132,143],"video":[33,87],"information":[34],"improve":[36,144],"environmental":[38],"robustness":[39],"far-field":[41,83],"wake":[42],"word":[43],"spotting.":[44],"In":[45],"proposed":[47,106],"system,":[48],"firstly,":[49],"we":[50],"take":[51],"advantage":[52],"speech":[54],"enhancement":[55],"algorithms":[56],"such":[57,129],"as":[58,130],"beamforming":[59],"weighted":[61],"prediction":[62],"error":[63],"(WPE)":[64],"address":[66],"multi-microphone":[68],"conversational":[69],"audio.":[70],"Secondly,":[71],"several":[72],"data":[73],"augmentation":[74],"techniques":[75],"are":[76,116,152],"applied":[77],"simulate":[79],"a":[80],"more":[81],"realistic":[82],"scenario.":[84],"For":[85],"information,":[88],"provided":[90],"region":[91],"interest":[93],"(ROI)":[94],"used":[96,138],"obtain":[98],"visual":[99,111],"representation.":[100],"Then":[101],"multi-layer":[103],"CNN":[104],"learn":[108],"representations,":[112],"these":[114],"representations":[115],"fed":[117],"into":[118],"two-branch":[120],"attention-based":[121],"network":[122],"which":[123],"can":[124],"be":[125],"employed":[126],"fusion,":[128],"transformer":[131],"conformed.":[133],"focal":[135],"loss":[136],"fine-tune":[140],"model":[142],"performance":[146],"significantly.":[147],"Finally,":[148],"multiple":[149],"trained":[150],"models":[151],"integrated":[153],"by":[154],"casting":[155],"vote":[156],"achieve":[158],"final":[160],"0.091":[161],"score.":[162]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4300862572","counts_by_year":[],"updated_date":"2025-03-03T08:59:27.754276","created_date":"2022-10-04"}