{"id":"https://openalex.org/W4391047492","doi":"https://doi.org/10.48550/arxiv.2401.10208","title":"MM-Interleaved: Interleaved Image-Text Generative Modeling via Multi-modal Feature Synchronizer","display_name":"MM-Interleaved: Interleaved Image-Text Generative Modeling via Multi-modal Feature Synchronizer","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4391047492","doi":"https://doi.org/10.48550/arxiv.2401.10208"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2401.10208","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2401.10208","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5087792832","display_name":"Changyao Tian","orcid":"https://orcid.org/0000-0002-3285-4671"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Changyao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5112877229","display_name":"Xizhou Zhu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhu, Xizhou","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100528261","display_name":"Yuwen Xiong","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Yuwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101736298","display_name":"Weiyun Wang","orcid":"https://orcid.org/0000-0002-2662-9144"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Weiyun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100457637","display_name":"Zhe Chen","orcid":"https://orcid.org/0000-0001-5004-8975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Zhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5062687402","display_name":"Wenhai Wang","orcid":"https://orcid.org/0000-0003-3707-6546"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Wenhai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102821179","display_name":"Yuntao Chen","orcid":"https://orcid.org/0000-0002-9555-1897"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Yuntao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5000474748","display_name":"Lewei Lu","orcid":"https://orcid.org/0009-0009-9809-3818"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Lewei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061696740","display_name":"Tong L\u00fc","orcid":"https://orcid.org/0000-0002-7051-5347"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lu, Tong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100620306","display_name":"Jie Zhou","orcid":"https://orcid.org/0000-0001-7701-234X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Jie","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100732450","display_name":"Hongsheng Li","orcid":"https://orcid.org/0000-0002-2664-7975"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Hongsheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100748135","display_name":"Yu Qiao","orcid":"https://orcid.org/0000-0002-1889-2567"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Yu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5026944066","display_name":"Jifeng Dai","orcid":"https://orcid.org/0000-0002-6785-0785"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Jifeng","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":4,"citation_normalized_percentile":{"value":0.915412,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":95,"max":96},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9943,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9943,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9937,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9814,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/feature","display_name":"Feature (linguistics)","score":0.5965061},{"id":"https://openalex.org/keywords/generative-model","display_name":"Generative model","score":0.5272307},{"id":"https://openalex.org/keywords/code","display_name":"Code (set theory)","score":0.42968714}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.82758415},{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.6079189},{"id":"https://openalex.org/C2776401178","wikidata":"https://www.wikidata.org/wiki/Q12050496","display_name":"Feature (linguistics)","level":2,"score":0.5965061},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.56366366},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.54286945},{"id":"https://openalex.org/C167966045","wikidata":"https://www.wikidata.org/wiki/Q5532625","display_name":"Generative model","level":3,"score":0.5272307},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.51892614},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.47055152},{"id":"https://openalex.org/C71139939","wikidata":"https://www.wikidata.org/wiki/Q910194","display_name":"Modal","level":2,"score":0.46190056},{"id":"https://openalex.org/C2776760102","wikidata":"https://www.wikidata.org/wiki/Q5139990","display_name":"Code (set theory)","level":3,"score":0.42968714},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.4185379},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34465754},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.0},{"id":"https://openalex.org/C188027245","wikidata":"https://www.wikidata.org/wiki/Q750446","display_name":"Polymer chemistry","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2401.10208","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.10208","pdf_url":"http://arxiv.org/pdf/2401.10208","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2401.10208","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2401.10208","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4387506531","https://openalex.org/W4380551139","https://openalex.org/W4365211920","https://openalex.org/W4317695495","https://openalex.org/W4299831724","https://openalex.org/W4287117424","https://openalex.org/W4238433571","https://openalex.org/W3174044702","https://openalex.org/W3014948380","https://openalex.org/W2967848559"],"abstract_inverted_index":{"Developing":[0],"generative":[1,64],"models":[2,15,150],"for":[3,66],"interleaved":[4,19,67,102],"image-text":[5,68,103],"data":[6],"has":[7],"both":[8,99,143],"research":[9],"and":[10,21,25,74,101,138,145,149],"practical":[11],"value.":[12],"It":[13,70,105],"requires":[14],"to":[16,82,120],"understand":[17],"the":[18,33,36,52,87,91,115,127],"sequences":[20],"subsequently":[22],"generate":[23],"images":[24,141],"text.":[26],"However,":[27],"existing":[28],"attempts":[29],"are":[30,151],"limited":[31],"by":[32],"issue":[34],"that":[35],"fixed":[37],"number":[38],"of":[39,129],"visual":[40,133,146],"tokens":[41],"cannot":[42],"efficiently":[43],"capture":[44],"image":[45,84],"details,":[46],"which":[47],"is":[48,95,106],"particularly":[49],"problematic":[50],"in":[51,86,131],"multi-image":[53,75],"scenarios.":[54],"To":[55],"address":[56],"this,":[57],"this":[58],"paper":[59],"presents":[60],"MM-Interleaved,":[61],"an":[62],"end-to-end":[63,96],"model":[65,116],"data.":[69],"introduces":[71],"a":[72,110],"multi-scale":[73],"feature":[76],"synchronizer":[77],"module,":[78],"allowing":[79],"direct":[80],"access":[81],"fine-grained":[83],"features":[85],"previous":[88],"context":[89],"during":[90],"generation":[92],"process.":[93],"MM-Interleaved":[94,130],"pre-trained":[97],"on":[98],"paired":[100],"corpora.":[104],"further":[107],"enhanced":[108],"through":[109],"supervised":[111],"fine-tuning":[112],"phase,":[113],"wherein":[114],"improves":[117],"its":[118],"ability":[119],"follow":[121],"complex":[122],"multi-modal":[123,136],"instructions.":[124],"Experiments":[125],"demonstrate":[126],"versatility":[128],"recognizing":[132],"details":[134],"following":[135,142],"instructions":[137],"generating":[139],"consistent":[140],"textual":[144],"conditions.":[147],"Code":[148],"available":[152],"at":[153],"\\url{https://github.com/OpenGVLab/MM-Interleaved}.":[154]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4391047492","counts_by_year":[{"year":2024,"cited_by_count":4}],"updated_date":"2025-04-22T20:39:51.817931","created_date":"2024-01-20"}