{"id":"https://openalex.org/W4403884281","doi":"https://doi.org/10.48550/arxiv.2410.02746","title":"Contrastive Localized Language-Image Pre-Training","display_name":"Contrastive Localized Language-Image Pre-Training","publication_year":2024,"publication_date":"2024-10-03","ids":{"openalex":"https://openalex.org/W4403884281","doi":"https://doi.org/10.48550/arxiv.2410.02746"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02746","pdf_url":"http://arxiv.org/pdf/2410.02746","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2410.02746","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5012764388","display_name":"Hong-You Chen","orcid":"https://orcid.org/0000-0002-8127-5588"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Hong-You","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084288655","display_name":"Zhengfeng Lai","orcid":"https://orcid.org/0000-0002-2984-7913"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lai, Zhengfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100392968","display_name":"Haotian Zhang","orcid":"https://orcid.org/0000-0003-0844-3730"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Haotian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102839147","display_name":"Xinze Wang","orcid":"https://orcid.org/0000-0002-8615-4929"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xinze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042419329","display_name":"Marcin Eichner","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Eichner, Marcin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113173908","display_name":"Keen You","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"You, Keen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101856888","display_name":"Meng Cao","orcid":"https://orcid.org/0000-0003-2657-7142"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cao, Meng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100385123","display_name":"Bowen Zhang","orcid":"https://orcid.org/0000-0001-6180-6815"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Bowen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5083263708","display_name":"Yinfei Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yinfei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5066666034","display_name":"Zhe Gan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gan, Zhe","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.945908,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":77,"max":88},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12542","display_name":"Second Language Learning and Teaching","score":0.3902,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12542","display_name":"Second Language Learning and Teaching","score":0.3902,"subfield":{"id":"https://openalex.org/subfields/1208","display_name":"Literature and Literary Theory"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10021","display_name":"EFL/ESL Teaching and Learning","score":0.387,"subfield":{"id":"https://openalex.org/subfields/1203","display_name":"Language and Linguistics"},"field":{"id":"https://openalex.org/fields/12","display_name":"Arts and Humanities"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.6869552},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5015898},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.46567553},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.37151247},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.34875172},{"id":"https://openalex.org/C205649164","wikidata":"https://www.wikidata.org/wiki/Q1071","display_name":"Geography","level":0,"score":0.111026496},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.065255135},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02746","pdf_url":"http://arxiv.org/pdf/2410.02746","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2410.02746","pdf_url":"http://arxiv.org/pdf/2410.02746","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4391913857","https://openalex.org/W4391375266","https://openalex.org/W2899084033","https://openalex.org/W2810751659","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W230091440","https://openalex.org/W2233261550"],"abstract_inverted_index":{"Contrastive":[0,104],"Language-Image":[1,106],"Pre-training":[2,107],"(CLIP)":[3],"has":[4,22],"been":[5,23],"a":[6,48,100,120,147,183],"celebrated":[7],"method":[8,102],"for":[9,40,68,83,173],"training":[10],"vision":[11,28,75],"encoders":[12],"to":[13,36,133,153,163,188],"generate":[14,155],"image/text":[15],"representations":[16,137],"facilitating":[17],"various":[18],"applications.":[19],"Recently,":[20],"CLIP":[21,46,94,111,187],"widely":[24],"adopted":[25],"as":[26,47],"the":[27,90,127],"backbone":[29],"of":[30,45,73,93,125,165,186],"multimodal":[31],"large":[32],"language":[33,41],"models":[34],"(MLLMs)":[35],"connect":[37],"image":[38,60,130,174],"inputs":[39],"interactions.":[42],"The":[43],"success":[44],"vision-language":[49],"foundation":[50],"model":[51],"relies":[52],"on":[53,192],"aligning":[54],"web-crawled":[55],"noisy":[56],"text":[57],"annotations":[58],"at":[59,158],"levels.":[61],"Nevertheless,":[62],"such":[63],"criteria":[64],"may":[65],"become":[66],"insufficient":[67],"downstream":[69],"tasks":[70],"in":[71],"need":[72],"fine-grained":[74],"representations,":[76],"especially":[77,191],"when":[78],"region-level":[79],"understanding":[80],"is":[81],"demanding":[82],"MLLMs.":[84],"In":[85],"this":[86],"paper,":[87],"we":[88,145],"improve":[89],"localization":[91],"capability":[92],"with":[95,112],"several":[96],"advances.":[97],"We":[98,118],"propose":[99],"pre-training":[101],"called":[103],"Localized":[105],"(CLOC)":[108],"by":[109],"complementing":[110],"region-text":[113,156],"contrastive":[114],"loss":[115],"and":[116,149,177,180,194],"modules.":[117],"formulate":[119],"new":[121],"concept,":[122],"promptable":[123],"embeddings,":[124],"which":[126],"encoder":[128],"produces":[129],"embeddings":[131,172],"easy":[132],"transform":[134],"into":[135],"region":[136,175],"given":[138],"spatial":[139],"hints.":[140],"To":[141],"support":[142],"large-scale":[143],"pre-training,":[144],"design":[146],"visually-enriched":[148],"spatially-localized":[150],"captioning":[151],"framework":[152],"effectively":[154],"pseudo-labels":[157],"scale.":[159],"By":[160],"scaling":[161],"up":[162],"billions":[164],"annotated":[166],"images,":[167],"CLOC":[168],"enables":[169],"high-quality":[170],"regional":[171],"recognition":[176],"retrieval":[178],"tasks,":[179],"can":[181],"be":[182],"drop-in":[184],"replacement":[185],"enhance":[189],"MLLMs,":[190],"referring":[193],"grounding":[195],"tasks.":[196]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4403884281","counts_by_year":[{"year":2025,"cited_by_count":1}],"updated_date":"2025-04-22T19:12:58.537204","created_date":"2024-10-30"}