{"id":"https://openalex.org/W4392340304","doi":"https://doi.org/10.48550/arxiv.2402.18122","title":"G4G:A Generic Framework for High Fidelity Talking Face Generation with\n Fine-grained Intra-modal Alignment","display_name":"G4G:A Generic Framework for High Fidelity Talking Face Generation with\n Fine-grained Intra-modal Alignment","publication_year":2024,"publication_date":"2024-02-28","ids":{"openalex":"https://openalex.org/W4392340304","doi":"https://doi.org/10.48550/arxiv.2402.18122"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2402.18122","pdf_url":"https://arxiv.org/pdf/2402.18122","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2402.18122","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5100390006","display_name":"Juan Zhang","orcid":"https://orcid.org/0000-0001-5089-723X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Juan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100434478","display_name":"Jiahao Chen","orcid":"https://orcid.org/0000-0002-6110-7851"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5089338846","display_name":"Cheng Wang","orcid":"https://orcid.org/0000-0002-3440-1596"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Cheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017004342","display_name":"Zhiwang Yu","orcid":"https://orcid.org/0000-0003-3252-9067"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yu, Zhiwang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075060057","display_name":"Tangquan Qi","orcid":"https://orcid.org/0009-0007-9746-1557"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qi, Tangquan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100602151","display_name":"Di Wu","orcid":"https://orcid.org/0000-0002-7788-9202"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Di","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.999,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11448","display_name":"Face recognition and analysis","score":0.999,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9855,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/high-fidelity","display_name":"High fidelity","score":0.45599172}],"concepts":[{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.7515164},{"id":"https://openalex.org/C2779304628","wikidata":"https://www.wikidata.org/wiki/Q3503480","display_name":"Face (sociological concept)","level":2,"score":0.6626786},{"id":"https://openalex.org/C2776459999","wikidata":"https://www.wikidata.org/wiki/Q2119376","display_name":"Fidelity","level":2,"score":0.5990557},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.59430224},{"id":"https://openalex.org/C113364801","wikidata":"https://www.wikidata.org/wiki/Q26674","display_name":"High fidelity","level":2,"score":0.45599172},{"id":"https://openalex.org/C107457646","wikidata":"https://www.wikidata.org/wiki/Q207434","display_name":"Human\u2013computer interaction","level":1,"score":0.32496467},{"id":"https://openalex.org/C192562407","wikidata":"https://www.wikidata.org/wiki/Q228736","display_name":"Materials science","level":0,"score":0.15050676},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.10264245},{"id":"https://openalex.org/C24890656","wikidata":"https://www.wikidata.org/wiki/Q82811","display_name":"Acoustics","level":1,"score":0.089771},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.08922011},{"id":"https://openalex.org/C76155785","wikidata":"https://www.wikidata.org/wiki/Q418","display_name":"Telecommunications","level":1,"score":0.08021787},{"id":"https://openalex.org/C36289849","wikidata":"https://www.wikidata.org/wiki/Q34749","display_name":"Social science","level":1,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2402.18122","pdf_url":"https://arxiv.org/pdf/2402.18122","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2402.18122","pdf_url":"https://arxiv.org/pdf/2402.18122","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4385452045","https://openalex.org/W4319589573","https://openalex.org/W4313443006","https://openalex.org/W4293777179","https://openalex.org/W2945374968","https://openalex.org/W2895525995","https://openalex.org/W2319626700","https://openalex.org/W2164070813","https://openalex.org/W2135608140","https://openalex.org/W1997070615"],"abstract_inverted_index":{"Despite":[0],"numerous":[1],"completed":[2],"studies,":[3],"achieving":[4],"high":[5,44,57],"fidelity":[6,45,58,119],"talking":[7,46,170,181],"face":[8,47],"generation":[9,48],"with":[10,49],"highly":[11,64,168],"synchronized":[12,65,169],"lip":[13,66,132],"movements":[14,67,133],"corresponding":[15],"to":[16,32,77,87,114,144,185],"arbitrary":[17],"audio":[18,71],"remains":[19],"a":[20,40,84,108],"significant":[21,157],"challenge":[22],"in":[23,159],"the":[24,56,81,89,99,117,124,129,135,147,151],"field.":[25],"The":[26,75],"shortcomings":[27],"of":[28,59,69,83,92,120,131,161],"published":[29],"studies":[30],"continue":[31],"confuse":[33],"many":[34],"researchers.":[35],"This":[36],"paper":[37],"introduces":[38],"G4G,":[39],"generic":[41,176],"framework":[42,177],"for":[43],"fine-grained":[50],"intra-modal":[51,94],"alignment.":[52],"G4G":[53,172],"can":[54,179],"reenact":[55,116],"original":[60,121,162],"video":[61,122,163],"while":[62,127],"producing":[63],"regardless":[68],"given":[70],"tones":[72],"or":[73],"volumes.":[74],"key":[76],"G4G's":[78],"success":[79],"is":[80,112,141,173],"use":[82],"diagonal":[85],"matrix":[86],"enhance":[88],"ordinary":[90],"alignment":[91],"audio-image":[93],"features,":[95],"which":[96],"significantly":[97],"increases":[98],"comparative":[100],"learning":[101],"between":[102],"positive":[103],"and":[104,134,150],"negative":[105],"samples.":[106],"Additionally,":[107],"multi-scaled":[109],"supervision":[110],"module":[111],"introduced":[113],"comprehensively":[115],"perceptional":[118],"across":[123],"facial":[125,148],"region":[126,149],"emphasizing":[128],"synchronization":[130],"input":[136],"audio.":[137],"A":[138],"fusion":[139],"network":[140],"then":[142],"used":[143],"further":[145],"fuse":[146],"rest.":[152],"Our":[153],"experimental":[154],"results":[155],"demonstrate":[156],"achievements":[158],"reenactment":[160],"quality":[164],"as":[165,167],"well":[166],"lips.":[171],"an":[174],"outperforming":[175],"that":[178],"produce":[180],"videos":[182],"competitively":[183],"closer":[184],"ground":[186],"truth":[187],"level":[188],"than":[189],"current":[190],"state-of-the-art":[191],"methods.":[192]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4392340304","counts_by_year":[],"updated_date":"2025-04-20T20:20:02.442576","created_date":"2024-03-05"}