{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,12]],"date-time":"2025-04-12T22:43:16Z","timestamp":1744497796672,"version":"3.37.3"},"reference-count":83,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100004052","name":"KAUST","doi-asserted-by":"publisher","award":["BAS\/1\/1685-01-0,KAUST-FCC\/1\/2533-17-01"],"id":[{"id":"10.13039\/501100004052","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1109\/cvpr52688.2022.01750","type":"proceedings-article","created":{"date-parts":[[2022,9,27]],"date-time":"2022-09-27T19:56:41Z","timestamp":1664308601000},"page":"18009-18019","source":"Crossref","is-referenced-by-count":96,"title":["VisualGPT: Data-efficient Adaptation of Pretrained Language Models for Image Captioning"],"prefix":"10.1109","author":[{"given":"Jun","family":"Chen","sequence":"first","affiliation":[{"name":"King Abdullah University of Science and Technology (KAUST)"}]},{"given":"Han","family":"Guo","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Kai","family":"Yi","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology (KAUST)"}]},{"given":"Boyang","family":"Li","sequence":"additional","affiliation":[{"name":"Nanyang Technological University"}]},{"given":"Mohamed","family":"Elhoseiny","sequence":"additional","affiliation":[{"name":"King Abdullah University of Science and Technology (KAUST)"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.780"},{"journal-title":"CNN+CNN Convolutional Decoders for Image Captioning","year":"2018","author":"wang","key":"ref72"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.130"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"ref77","article-title":"Xlnet: Generalized autoregressive pretraining for language understanding","author":"yang","year":"0","journal-title":"NeurIPS"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00070"},{"key":"ref39","article-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"li","year":"0","journal-title":"ECCV"},{"key":"ref75","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"ICML"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896516"},{"key":"ref78","article-title":"Review networks for caption generation","volume":"29","author":"yang","year":"0","journal-title":"NeurIPS"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"ref33","article-title":"Stacked cross attention for image-text matching","author":"lee","year":"0","journal-title":"ECCV"},{"key":"ref32","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"lan","year":"0","journal-title":"ICLRE"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.162"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1208"},{"key":"ref37","article-title":"Composing simple image descriptions using web-scale n-grams","author":"li","year":"0","journal-title":"CoNLL"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"ref35","article-title":"Hybrid retrieval-generation reinforced agent for medical image report generation","author":"li","year":"0","journal-title":"NeurIPS"},{"journal-title":"BART Denoising Sequence-to-Sequence Pre-training for Natural Language Generation Translation and Comprehension","year":"2019","author":"lewis","key":"ref34"},{"key":"ref60","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"ref66","article-title":"Vl-bert: Pre-training of generic visual-linguistic representations","author":"su","year":"0","journal-title":"ICLRE"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1208"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref68","article-title":"Attention is all you need","author":"vaswani","year":"0","journal-title":"NeurIPS"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_31"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1608"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.8"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1240"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1657"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00754"},{"journal-title":"Pointer sentinel mixture models","year":"2017","author":"merity","key":"ref51"},{"key":"ref59","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"radford","year":"2019","journal-title":"OpenAIRE blog"},{"journal-title":"Improving language understanding by generative pre-training","year":"2018","author":"radford","key":"ref58"},{"key":"ref57","article-title":"Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data","author":"qi","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-1202"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1080\/14786440009463897"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2011.5947611"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.112"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.681"},{"key":"ref40","article-title":"Hybrid retrieval-generation reinforced agent for medical image report generation","author":"li","year":"0","journal-title":"NeurIPS"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00850"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1093\/jamia\/ocv080"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref16","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"NAACL"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00271"},{"key":"ref17","article-title":"Unified language model pre-training for natural language understanding and generation","author":"dong","year":"0","journal-title":"NeurIPS"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.5220\/0008881202330241"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01246"},{"key":"ref19","article-title":"Every picture tells a story: Generating sentences from images","author":"farhadi","year":"0","journal-title":"ECCV"},{"key":"ref80","article-title":"Exploring visual relationship for image captioning","author":"yao","year":"0","journal-title":"ECCV"},{"key":"ref4","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","author":"banerjee","year":"0","journal-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization"},{"journal-title":"Layer normalization","year":"2016","author":"ba","key":"ref3"},{"key":"ref6","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref5","article-title":"A neural probabilistic language model","volume":"3","author":"bengio","year":"2003","journal-title":"Journal of Machine Learning Research"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5602"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"ref46","article-title":"RoBERTa: A robustly optimized BERT pretraining approach","volume":"arxiv1907 11692","author":"liu","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref48","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"0","journal-title":"NeurIPS"},{"journal-title":"RoBERTa A Robustly optimized BERT Pretraining Approach","year":"2019","author":"liu","key":"ref47"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.3115\/1118162.1118168"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01278"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1112"},{"key":"ref43","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"0","journal-title":"ECCV"}],"event":{"name":"2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","start":{"date-parts":[[2022,6,18]]},"location":"New Orleans, LA, USA","end":{"date-parts":[[2022,6,24]]}},"container-title":["2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9878378\/9878366\/09879874.pdf?arnumber=9879874","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T22:57:14Z","timestamp":1667516234000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9879874\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":83,"URL":"https:\/\/doi.org\/10.1109\/cvpr52688.2022.01750","relation":{},"subject":[],"published":{"date-parts":[[2022,6]]}}}