{"id":"https://openalex.org/W4386076227","doi":"https://doi.org/10.1109/cvpr52729.2023.00968","title":"VILA: Learning Image Aesthetics from User Comments with Vision-Language Pretraining","display_name":"VILA: Learning Image Aesthetics from User Comments with Vision-Language Pretraining","publication_year":2023,"publication_date":"2023-06-01","ids":{"openalex":"https://openalex.org/W4386076227","doi":"https://doi.org/10.1109/cvpr52729.2023.00968"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52729.2023.00968","pdf_url":null,"source":{"id":"https://openalex.org/S4363607701","display_name":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2303.14302","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5030485695","display_name":"Junjie Ke","orcid":"https://orcid.org/0000-0001-8512-3744"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Junjie Ke","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058759451","display_name":"Keren Ye","orcid":"https://orcid.org/0000-0002-7349-7762"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Keren Ye","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100745253","display_name":"Jiahui Yu","orcid":"https://orcid.org/0000-0002-1215-3851"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Jiahui Yu","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101011504","display_name":"Yonghui Wu","orcid":null},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Yonghui Wu","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002085979","display_name":"Peyman Milanfar","orcid":"https://orcid.org/0000-0003-1455-7662"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Peyman Milanfar","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100698893","display_name":"Feng Yang","orcid":"https://orcid.org/0000-0001-7190-4064"},"institutions":[{"id":"https://openalex.org/I1291425158","display_name":"Google (United States)","ror":"https://ror.org/00njsd438","country_code":"US","type":"company","lineage":["https://openalex.org/I1291425158","https://openalex.org/I4210128969"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Feng Yang","raw_affiliation_strings":["Google Research"],"affiliations":[{"raw_affiliation_string":"Google Research","institution_ids":["https://openalex.org/I1291425158"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":10.62,"has_fulltext":false,"cited_by_count":33,"citation_normalized_percentile":{"value":0.999599,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":98,"max":99},"biblio":{"volume":null,"issue":null,"first_page":"10041","last_page":"10051"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11605","display_name":"Visual Attention and Saliency Detection","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11714","display_name":"Multimodal Machine Learning Applications","score":0.9992,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10627","display_name":"Advanced Image and Video Retrieval Techniques","score":0.9962,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/closed-captioning","display_name":"Closed captioning","score":0.59771013}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.68158317},{"id":"https://openalex.org/C157657479","wikidata":"https://www.wikidata.org/wiki/Q2367247","display_name":"Closed captioning","level":3,"score":0.59771013},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.57081455},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.49345246},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.46848238},{"id":"https://openalex.org/C195324797","wikidata":"https://www.wikidata.org/wiki/Q33742","display_name":"Natural language","level":2,"score":0.4654995},{"id":"https://openalex.org/C184337299","wikidata":"https://www.wikidata.org/wiki/Q1437428","display_name":"Semantics (computer science)","level":2,"score":0.43541056},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.37767673},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/cvpr52729.2023.00968","pdf_url":null,"source":{"id":"https://openalex.org/S4363607701","display_name":"2022 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.14302","pdf_url":"https://arxiv.org/pdf/2303.14302","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.14302","pdf_url":"https://arxiv.org/pdf/2303.14302","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.82,"id":"https://metadata.un.org/sdg/4","display_name":"Quality education"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":55,"referenced_works":["https://openalex.org/W1527575280","https://openalex.org/W1956340063","https://openalex.org/W2056380823","https://openalex.org/W2060277733","https://openalex.org/W2078807908","https://openalex.org/W2080754665","https://openalex.org/W2101105183","https://openalex.org/W2123024445","https://openalex.org/W2217895792","https://openalex.org/W2417288846","https://openalex.org/W2467531333","https://openalex.org/W2524036617","https://openalex.org/W2560647685","https://openalex.org/W2623430765","https://openalex.org/W2747102465","https://openalex.org/W2756217618","https://openalex.org/W2774267535","https://openalex.org/W2779483295","https://openalex.org/W2807107013","https://openalex.org/W2931027027","https://openalex.org/W2953273646","https://openalex.org/W2962883796","https://openalex.org/W2966715458","https://openalex.org/W2968124245","https://openalex.org/W2970231061","https://openalex.org/W2975501350","https://openalex.org/W2976886057","https://openalex.org/W2988824574","https://openalex.org/W3012364787","https://openalex.org/W3035523707","https://openalex.org/W3035595647","https://openalex.org/W3035712445","https://openalex.org/W3091249416","https://openalex.org/W3093501041","https://openalex.org/W3103635814","https://openalex.org/W3126337491","https://openalex.org/W3126792443","https://openalex.org/W3166396011","https://openalex.org/W3172942063","https://openalex.org/W3193689960","https://openalex.org/W3208314443","https://openalex.org/W4214745154","https://openalex.org/W4229042118","https://openalex.org/W4231510805","https://openalex.org/W4285606417","https://openalex.org/W4293057409","https://openalex.org/W4293569541","https://openalex.org/W4306820534","https://openalex.org/W4309933612","https://openalex.org/W4312353506","https://openalex.org/W4312574244","https://openalex.org/W4312847199","https://openalex.org/W4382462760","https://openalex.org/W4385245566","https://openalex.org/W4394666117"],"related_works":["https://openalex.org/W4380190185","https://openalex.org/W4298897568","https://openalex.org/W4290852288","https://openalex.org/W4226226396","https://openalex.org/W3217388757","https://openalex.org/W3215212336","https://openalex.org/W3164229987","https://openalex.org/W3122720459","https://openalex.org/W3009270862","https://openalex.org/W1938708284"],"abstract_inverted_index":{"Assessing":[0],"the":[1,36,115,137,158,188,199],"aesthetics":[2,73],"of":[3,67],"an":[4,90,133],"image":[5,23,63,72,154],"is":[6,10,210],"challenging,":[7],"as":[8,132,171],"it":[9,162],"influenced":[11],"by":[12],"multiple":[13],"factors":[14],"including":[15],"composition,":[16],"color,":[17],"style,":[18],"and":[19,50,60,77,99,105,161,175],"high-level":[20],"semantics.":[21],"Existing":[22],"aesthetic":[24,38,85,107,138,147,155,168],"assessment":[25],"(IAA)":[26],"methods":[27,81],"primarily":[28],"rely":[29],"on":[30,153],"human-labeled":[31],"rating":[32],"scores,":[33],"which":[34],"oversimplify":[35],"visual":[37],"information":[39,49],"that":[40,129,144],"humans":[41],"perceive.":[42],"Conversely,":[43],"user":[44,75],"comments":[45],"offer":[46],"more":[47,53],"comprehensive":[48],"are":[51],"a":[52,125],"natural":[54],"way":[55],"to":[56,82,102,135],"express":[57],"human":[58,110],"opinions":[59],"preferences":[61],"regarding":[62],"aesthetics.":[64],"In":[65],"light":[66],"this,":[68],"we":[69,88,122],"propose":[70,124],"learning":[71],"from":[74],"comments,":[76],"exploring":[78],"vision-language":[79,148],"pretraining":[80],"learn":[83,103,136],"multimodal":[84],"representations.":[86],"Specifically,":[87],"pretrain":[89],"image-text":[91],"encoder-decoder":[92],"model":[93,117,149,193,209],"with":[94],"image-comment":[95],"pairs,":[96],"using":[97,187],"contrastive":[98],"generative":[100],"objectives":[101],"rich":[104],"generic":[106],"semantics":[108],"without":[109],"labels.":[111],"To":[112],"efficiently":[113],"adapt":[114],"pretrained":[116,146],"for":[118,167],"downstream":[119],"IAA":[120,196],"tasks,":[121],"further":[123],"lightweight":[126],"rank-based":[127],"adapter":[128,190],"employs":[130],"text":[131],"anchor":[134],"ranking":[139],"concept.":[140],"Our":[141,208],"results":[142],"show":[143],"our":[145,192],"outperforms":[150],"prior":[151],"works":[152],"captioning":[156],"over":[157,198],"AVA-Captions":[159],"dataset,":[160],"has":[163],"powerful":[164],"zero-shot":[165,172,176],"capability":[166],"tasks":[169],"such":[170],"style":[173],"classification":[174],"IAA,":[177],"surpassing":[178],"many":[179],"supervised":[180],"baselines.":[181],"With":[182],"only":[183],"minimal":[184],"finetuning":[185],"parameters":[186],"proposed":[189],"module,":[191],"achieves":[194],"state-of-the-art":[195],"performance":[197],"AVA":[200],"dataset.":[201],"1":[204,207],"available":[211],"at":[212],"https://github.com/google-research/google-research/tree/master/VILA":[213]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386076227","counts_by_year":[{"year":2024,"cited_by_count":28},{"year":2023,"cited_by_count":5}],"updated_date":"2025-01-03T21:48:12.623274","created_date":"2023-08-23"}