{"id":"https://openalex.org/W2105396365","doi":"https://doi.org/10.1109/icassp.2004.1327250","title":"Improved face and feature finding for audio-visual speech recognition in visually challenging environments","display_name":"Improved face and feature finding for audio-visual speech recognition in visually challenging environments","publication_year":2004,"publication_date":"2004-09-28","ids":{"openalex":"https://openalex.org/W2105396365","doi":"https://doi.org/10.1109/icassp.2004.1327250","mag":"2105396365"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2004.1327250","pdf_url":null,"source":{"id":"https://openalex.org/S4363607879","display_name":"IEEE International Conference on Acoustics Speech and Signal Processing","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5103460664","display_name":"Jintao Jiang","orcid":null},"institutions":[{"id":"https://openalex.org/I161318765","display_name":"University of California, Los Angeles","ror":"https://ror.org/046rm7j60","country_code":"US","type":"funder","lineage":["https://openalex.org/I161318765"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"None Jintao Jiang","raw_affiliation_strings":["Department of Electrical Engineering, University of California, Los Angeles, CA, USA"],"affiliations":[{"raw_affiliation_string":"Department of Electrical Engineering, University of California, Los Angeles, CA, USA","institution_ids":["https://openalex.org/I161318765"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024184433","display_name":"Gerasimos Potamianos","orcid":"https://orcid.org/0000-0002-9833-7124"},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"G. Potamianos","raw_affiliation_strings":["IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA"],"affiliations":[{"raw_affiliation_string":"IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5108291571","display_name":"Harriet J. Nock","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"H. Nock","raw_affiliation_strings":["IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA"],"affiliations":[{"raw_affiliation_string":"IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022650481","display_name":"Giridharan Iyengar","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"G. Iyengar","raw_affiliation_strings":["IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA"],"affiliations":[{"raw_affiliation_string":"IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5111579153","display_name":"C. Neti","orcid":null},"institutions":[{"id":"https://openalex.org/I4210114115","display_name":"IBM Research - Thomas J. Watson Research Center","ror":"https://ror.org/0265w5591","country_code":"US","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210114115"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"C. Neti","raw_affiliation_strings":["IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA"],"affiliations":[{"raw_affiliation_string":"IBM Thomas J. Watson Research Center, Yorktown Heights, NY, USA","institution_ids":["https://openalex.org/I4210114115"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.624,"has_fulltext":true,"fulltext_origin":"ngrams","cited_by_count":9,"citation_normalized_percentile":{"value":0.646617,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":80,"max":81},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":1.0,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9989,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10057","display_name":"Face and Expression Recognition","score":0.9947,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/robustness","display_name":"Robustness","score":0.62247235},{"id":"https://openalex.org/keywords/audio-visual","display_name":"Audio visual","score":0.53419584},{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5330049},{"id":"https://openalex.org/keywords/word-error-rate","display_name":"Word error rate","score":0.46071276},{"id":"https://openalex.org/keywords/sensory-cue","display_name":"Sensory cue","score":0.42023104}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.82172894},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.7915511},{"id":"https://openalex.org/C63479239","wikidata":"https://www.wikidata.org/wiki/Q7353546","display_name":"Robustness (evolution)","level":3,"score":0.62247235},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.6181431},{"id":"https://openalex.org/C3017588708","wikidata":"https://www.wikidata.org/wiki/Q758901","display_name":"Audio visual","level":2,"score":0.53419584},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5330049},{"id":"https://openalex.org/C31510193","wikidata":"https://www.wikidata.org/wiki/Q1192553","display_name":"Facial recognition system","level":3,"score":0.47480336},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.46395937},{"id":"https://openalex.org/C40969351","wikidata":"https://www.wikidata.org/wiki/Q3516228","display_name":"Word error rate","level":2,"score":0.46071276},{"id":"https://openalex.org/C111370547","wikidata":"https://www.wikidata.org/wiki/Q7451120","display_name":"Sensory cue","level":2,"score":0.42023104},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.4112397},{"id":"https://openalex.org/C52622490","wikidata":"https://www.wikidata.org/wiki/Q1026626","display_name":"Feature extraction","level":2,"score":0.40702355},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.39013588},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C49774154","wikidata":"https://www.wikidata.org/wiki/Q131765","display_name":"Multimedia","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/icassp.2004.1327250","pdf_url":null,"source":{"id":"https://openalex.org/S4363607879","display_name":"IEEE International Conference on Acoustics Speech and Signal Processing","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"score":0.6,"display_name":"Peace, justice, and strong institutions","id":"https://metadata.un.org/sdg/16"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":11,"referenced_works":["https://openalex.org/W1929897159","https://openalex.org/W196476531","https://openalex.org/W2096391593","https://openalex.org/W2121647436","https://openalex.org/W2124351082","https://openalex.org/W2146555731","https://openalex.org/W2156165250","https://openalex.org/W2159686933","https://openalex.org/W2217896605","https://openalex.org/W22517275","https://openalex.org/W3214102110"],"related_works":["https://openalex.org/W578794879","https://openalex.org/W4245955731","https://openalex.org/W3137890128","https://openalex.org/W2985118265","https://openalex.org/W2625296515","https://openalex.org/W2393726419","https://openalex.org/W2380912101","https://openalex.org/W2350550760","https://openalex.org/W2271369634","https://openalex.org/W1984634519"],"abstract_inverted_index":{"Visual":[0],"information":[1],"in":[2,20,32,48,111,119],"a":[3,37,41,112],"speaker's":[4],"face":[5,67,87],"is":[6,36,80],"known":[7],"to":[8,29,82,106],"improve":[9,83],"the":[10,84,117],"robustness":[11],"of":[12,86,116],"automatic":[13],"speech":[14,56,94],"recognition":[15],"(ASR).":[16],"However,":[17],"most":[18],"studies":[19],"audio-visual":[21,46,108],"ASR":[22,31,47],"have":[23],"focused":[24],"on":[25,40,54],"\"visually":[26],"clean\"":[27],"data":[28],"benefit":[30],"noise.":[33],"This":[34,78],"paper":[35],"follow":[38],"up":[39],"previous":[42],"study":[43],"that":[44,72],"investigated":[45],"visually":[49],"challenging":[50],"environments.":[51],"It":[52],"focuses":[53],"visual":[55,93],"front":[57],"end":[58],"processing,":[59],"and":[60,68,88,91],"it":[61],"proposes":[62],"an":[63],"improved,":[64],"appearance":[65],"based":[66],"feature":[69,89],"detection":[70],"algorithm":[71],"utilizes":[73],"Gaussian":[74],"mixture":[75],"model":[76],"classifiers.":[77],"method":[79],"shown":[81],"accuracy":[85],"detection,":[90],"thus":[92],"recognition,":[95],"over":[96],"our":[97],"previously":[98],"used":[99],"baseline":[100],"system.":[101],"In":[102],"turn,":[103],"this":[104],"translates":[105],"improved":[107],"ASR,":[109],"resulting":[110],"10%":[113],"relative":[114],"reduction":[115],"word-error-rate":[118],"noisy":[120],"speech.":[121]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2105396365","counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-04-19T08:02:37.950650","created_date":"2016-06-24"}