{"id":"https://openalex.org/W4226245101","doi":"https://doi.org/10.48550/arxiv.2203.05175","title":"MVP: Multimodality-guided Visual Pre-training","display_name":"MVP: Multimodality-guided Visual Pre-training","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4226245101","doi":"https://doi.org/10.48550/arxiv.2203.05175"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2203.05175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2203.05175","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5050660610","display_name":"Longhui Wei","orcid":"https://orcid.org/0000-0001-6916-3009"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Longhui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5075290241","display_name":"Lingxi Xie","orcid":"https://orcid.org/0000-0003-4831-9451"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xie, Lingxi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5046805800","display_name":"Wengang Zhou","orcid":"https://orcid.org/0000-0003-1690-9836"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Wengang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5078141810","display_name":"Houqiang Li","orcid":"https://orcid.org/0000-0003-2188-3028"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Houqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100393506","display_name":"Qi Tian","orcid":"https://orcid.org/0000-0002-7252-5047"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tian, Qi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.633957,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":60,"max":70},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9992,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9992,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9954,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9927,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/multimodality","display_name":"Multimodality","score":0.55143124},{"id":"https://openalex.org/keywords/margin","display_name":"Margin (machine learning)","score":0.5290909}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7633322},{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.700479},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.58797455},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5522978},{"id":"https://openalex.org/C2780910867","wikidata":"https://www.wikidata.org/wiki/Q1952416","display_name":"Multimodality","level":2,"score":0.55143124},{"id":"https://openalex.org/C774472","wikidata":"https://www.wikidata.org/wiki/Q6760393","display_name":"Margin (machine learning)","level":2,"score":0.5290909},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5215471},{"id":"https://openalex.org/C153180895","wikidata":"https://www.wikidata.org/wiki/Q7148389","display_name":"Pattern recognition (psychology)","level":2,"score":0.35191137},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.32942104},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.2930756},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.080215484},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2203.05175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2203.05175","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2203.05175","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Quality education","id":"https://metadata.un.org/sdg/4","score":0.49}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4388989680","https://openalex.org/W4304700937","https://openalex.org/W4302009627","https://openalex.org/W4212781403","https://openalex.org/W4205899436","https://openalex.org/W3160379098","https://openalex.org/W3111427900","https://openalex.org/W2901011898","https://openalex.org/W2462138927","https://openalex.org/W2373442452"],"abstract_inverted_index":{"Recently,":[0],"masked":[1],"image":[2,42],"modeling":[3],"(MIM)":[4],"has":[5],"become":[6],"a":[7,30,36,40,95,125,141],"promising":[8],"direction":[9],"for":[10,71,136],"visual":[11,22,72,129],"pre-training.":[12,73],"In":[13,47,132],"the":[14,26,45,87,90,106,116],"context":[15],"of":[16,93,108,127,157],"vision":[17,91],"transformers,":[18],"MIM":[19],"learns":[20],"effective":[21],"representation":[23],"by":[24,55,110],"aligning":[25],"token-level":[27],"features":[28],"with":[29,89,153],"pre-defined":[31],"space":[32],"(e.g.,":[33],"BEIT":[34,147],"used":[35],"d-VAE":[37],"trained":[38],"on":[39,99,119,124,144],"large":[41],"corpus":[43],"as":[44],"tokenizer).":[46],"this":[48],"paper,":[49],"we":[50,85],"go":[51],"one":[52],"step":[53],"further":[54],"introducing":[56],"guidance":[57],"from":[58],"other":[59],"modalities":[60],"and":[61,121,150],"validating":[62],"that":[63],"such":[64],"additional":[65],"knowledge":[66],"leads":[67],"to":[68],"impressive":[69,155],"gains":[70],"The":[74],"proposed":[75],"approach":[76],"is":[77],"named":[78],"Multimodality-guided":[79],"Visual":[80],"Pre-training":[81],"(MVP),":[82],"in":[83],"which":[84],"replace":[86],"tokenizer":[88],"branch":[92],"CLIP,":[94],"vision-language":[96],"model":[97],"pre-trained":[98],"400":[100],"million":[101],"image-text":[102],"pairs.":[103],"We":[104],"demonstrate":[105],"effectiveness":[107],"MVP":[109,139],"performing":[111],"standard":[112],"experiments,":[113],"i.e.,":[114],"pre-training":[115,134],"ViT":[117],"models":[118],"ImageNet":[120],"fine-tuning":[122],"them":[123],"series":[126],"downstream":[128],"recognition":[130],"tasks.":[131],"particular,":[133],"ViT-Base/16":[135],"300":[137],"epochs,":[138],"reports":[140],"52.4%":[142],"mIoU":[143],"ADE20K,":[145],"surpassing":[146],"(the":[148],"baseline":[149],"previous":[151],"state-of-the-art)":[152],"an":[154],"margin":[156],"6.8%.":[158]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4226245101","counts_by_year":[{"year":2023,"cited_by_count":1}],"updated_date":"2025-02-20T19:12:18.465453","created_date":"2022-05-05"}