{"id":"https://openalex.org/W4386694222","doi":"https://doi.org/10.48550/arxiv.2309.04814","title":"Speech2Lip: High-fidelity Speech to Lip Generation by Learning from a Short Video","display_name":"Speech2Lip: High-fidelity Speech to Lip Generation by Learning from a Short Video","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386694222","doi":"https://doi.org/10.48550/arxiv.2309.04814"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04814","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2309.04814","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058073849","display_name":"Xiuzhe Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Xiuzhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100703619","display_name":"Pengfei Hu","orcid":"https://orcid.org/0000-0002-7935-886X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Pengfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101071037","display_name":"Yang Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091549331","display_name":"Xiaoyang Lyu","orcid":"https://orcid.org/0009-0007-5335-6691"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lyu, Xiaoyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091525681","display_name":"Yan\u2010Pei Cao","orcid":"https://orcid.org/0000-0002-0416-4374"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Yan-Pei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102004349","display_name":"Ying Shan","orcid":"https://orcid.org/0000-0001-7673-8325"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shan, Ying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5026184280","display_name":"Wenming Yang","orcid":"https://orcid.org/0000-0002-2506-1286"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Wenming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5038814111","display_name":"Zhongqian Sun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Zhongqian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5102498323","display_name":"Xiaojuan Qi","orcid":"https://orcid.org/0000-0002-4285-1626"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Xiaojuan","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":65},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9965,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.9965,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9896,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9549,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7781512},{"id":"https://openalex.org/C104114177","wikidata":"https://www.wikidata.org/wiki/Q79782","display_name":"Motion (physics)","level":2,"score":0.55245036},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5383614},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.52805835},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.50611037},{"id":"https://openalex.org/C2780312720","wikidata":"https://www.wikidata.org/wiki/Q5689100","display_name":"Head (geology)","level":2,"score":0.48574024},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.46563},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.0},{"id":"https://openalex.org/C114793014","wikidata":"https://www.wikidata.org/wiki/Q52109","display_name":"Geomorphology","level":1,"score":0.0},{"id":"https://openalex.org/C127313418","wikidata":"https://www.wikidata.org/wiki/Q1069","display_name":"Geology","level":0,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04814","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2309.04814","pdf_url":"http://arxiv.org/pdf/2309.04814","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2309.04814","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2309.04814","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4380449851","https://openalex.org/W4318832338","https://openalex.org/W4248383205","https://openalex.org/W4234745530","https://openalex.org/W3125091513","https://openalex.org/W2916591301","https://openalex.org/W2381850946","https://openalex.org/W2231829109","https://openalex.org/W2146383839","https://openalex.org/W1919390113"],"abstract_inverted_index":{"Synthesizing":[0],"realistic":[1],"videos":[2,185],"according":[3],"to":[4,61,68,86,95,138,167,201],"a":[5,56,73,81,112,121,149,192,195,222,226],"given":[6,111],"speech":[7,60],"is":[8,33,66],"still":[9],"an":[10],"open":[11],"challenge.":[12],"Previous":[13],"works":[14],"have":[15],"been":[16],"plagued":[17],"by":[18,49,221],"issues":[19],"such":[20],"as":[21],"inaccurate":[22],"lip":[23,44,126,170],"shape":[24],"generation":[25,106,128],"and":[26,37,92,135,182,194,208,231,239],"poor":[27],"image":[28,65,127],"quality.":[29],"The":[30],"key":[31],"reason":[32],"that":[34,89,156,215],"only":[35],"motions":[36],"appearances":[38],"on":[39,131,211],"limited":[40,100],"facial":[41],"areas":[42],"(e.g.,":[43],"area)":[45],"are":[46,199],"mainly":[47],"driven":[48],"the":[50,62,105,140,173,203],"input":[51],"speech.":[52],"Therefore,":[53],"directly":[54],"learning":[55,98,132],"mapping":[57,153],"function":[58],"from":[59,99],"entire":[63],"head":[64,114,145,162,177,188],"prone":[67],"ambiguity,":[69],"particularly":[70],"when":[71],"using":[72],"short":[74],"video":[75,223],"for":[76,125],"training.":[77],"We":[78],"thus":[79],"propose":[80],"decomposition-synthesis-composition":[82],"framework":[83],"named":[84],"Speech":[85],"Lip":[87],"(Speech2Lip)":[88],"disentangles":[90],"speech-sensitive":[91,133],"speech-insensitive":[93,142],"motion/appearance":[94],"facilitate":[96],"effective":[97],"training":[101],"data,":[102],"resulting":[103],"in":[104,229,235],"of":[107,224],"natural-looking":[108],"videos.":[109],"First,":[110],"fixed":[113],"pose":[115],"(i.e.,":[116,144],"canonical":[117,174],"space),":[118],"we":[119,147],"present":[120],"speech-driven":[122],"implicit":[123],"model":[124,139,217],"which":[129],"concentrates":[130],"motion":[134,143],"appearance.":[136],"Next,":[137],"major":[141],"movement),":[146],"introduce":[148],"geometry-aware":[150],"mutual":[151],"explicit":[152],"(GAMEM)":[154],"module":[155],"establishes":[157],"geometric":[158],"mappings":[159],"between":[160],"different":[161],"poses.":[163],"This":[164],"allows":[165],"us":[166],"paste":[168],"generated":[169],"images":[171,178],"at":[172],"space":[175],"onto":[176],"with":[179,186],"arbitrary":[180],"poses":[181],"synthesize":[183],"talking":[184],"natural":[187],"movements.":[189],"In":[190],"addition,":[191],"Blend-Net":[193],"contrastive":[196],"sync":[197],"loss":[198],"introduced":[200],"enhance":[202],"overall":[204],"synthesis":[205],"performance.":[206],"Quantitative":[207],"qualitative":[209],"results":[210],"three":[212],"benchmarks":[213],"demonstrate":[214],"our":[216],"can":[218],"be":[219],"trained":[220],"just":[225],"few":[227],"minutes":[228],"length":[230],"achieve":[232],"state-of-the-art":[233],"performance":[234],"both":[236],"visual":[237],"quality":[238],"speech-visual":[240],"synchronization.":[241],"Code:":[242],"https://github.com/CVMI-Lab/Speech2Lip.":[243]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386694222","counts_by_year":[],"updated_date":"2025-04-08T22:48:32.293213","created_date":"2023-09-14"}