{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T18:27:08Z","timestamp":1732040828936},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,2,1]],"date-time":"2022-02-01T00:00:00Z","timestamp":1643673600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2022,2]]},"DOI":"10.1016\/j.patcog.2021.108358","type":"journal-article","created":{"date-parts":[[2021,10,1]],"date-time":"2021-10-01T05:24:46Z","timestamp":1633065886000},"page":"108358","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":10,"special_numbering":"C","title":["Revisiting image captioning via maximum discrepancy competition"],"prefix":"10.1016","volume":"122","author":[{"given":"Boyang","family":"Wan","sequence":"first","affiliation":[]},{"given":"Wenhui","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Yu-Ming","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Minwei","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Qin","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Liu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.patcog.2021.108358_bib0001","doi-asserted-by":"crossref","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","article-title":"Show and tell: lessons learned from the 2015 MSCOCO image captioning challenge","volume":"39","author":"Vinyals","year":"2016","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2021.108358_bib0002","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions","volume":"2","author":"Young","year":"2014","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10.1016\/j.patcog.2021.108358_bib0003","series-title":"European Conference on Computer Vision","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.patcog.2021.108358_bib0004","unstructured":"X. Chen, H. Fang, T.-Y. Lin, R. Vedantam, S. Gupta, P. Doll\u00e1r, C.L. 
Zitnick, Microsoft COCO captions: data collection and evaluation server, arXiv preprint arXiv:1504.00325(2015)."},{"key":"10.1016\/j.patcog.2021.108358_bib0005","doi-asserted-by":"crossref","first-page":"285","DOI":"10.1016\/j.patcog.2019.01.028","article-title":"Dense semantic embedding network for image captioning","volume":"90","author":"Xiao","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.108358_bib0006","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0007","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"10578","article-title":"Meshed-memory transformer for image captioning","author":"Cornia","year":"2020"},{"key":"10.1016\/j.patcog.2021.108358_bib0008","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"10971","article-title":"X-linear attention networks for image captioning","author":"Pan","year":"2020"},{"key":"10.1016\/j.patcog.2021.108358_bib0009","series-title":"Advances in Neural Information Processing Systems","first-page":"11137","article-title":"Image captioning: Transforming objects into words","author":"Herdade","year":"2019"},{"key":"10.1016\/j.patcog.2021.108358_bib0010","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7008","article-title":"Self-critical sequence training for image captioning","author":"Rennie","year":"2017"},{"key":"10.1016\/j.patcog.2021.108358_bib0011","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6964","article-title":"Discriminability objective for training descriptive captions","author":"Luo","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0012","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"10890","article-title":"Better captioning with sequence-level exploration","author":"Chen","year":"2020"},{"key":"10.1016\/j.patcog.2021.108358_bib0013","series-title":"Proceedings of the Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.patcog.2021.108358_bib0014","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4566","article-title":"CIDEr: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0015","series-title":"Proceedings of the Workshop on Statistical Machine Translation","first-page":"376","article-title":"Meteor universal: language specific translation evaluation for any target language","author":"Denkowski","year":"2014"},{"key":"10.1016\/j.patcog.2021.108358_bib0016","series-title":"Text Summarization Branches Out","first-page":"74","article-title":"ROUGE: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.patcog.2021.108358_bib0017","series-title":"European Conference on Computer Vision","first-page":"382","article-title":"SPICE: semantic propositional image caption 
evaluation","author":"Anderson","year":"2016"},{"key":"10.1016\/j.patcog.2021.108358_bib0018","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"10327","article-title":"Normalized and geometry-aware self-attention network for image captioning","author":"Guo","year":"2020"},{"key":"10.1016\/j.patcog.2021.108358_bib0019","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"3128","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"Karpathy","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0020","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Stack-captioning: Coarse-to-fine learning for image captioning","author":"Gu","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0021","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"10685","article-title":"Auto-encoding scene graphs for image captioning","author":"Yang","year":"2019"},{"key":"10.1016\/j.patcog.2021.108358_bib0022","series-title":"Proceedings of the Conference on Computational Natural Language Learning","first-page":"220","article-title":"Composing simple image descriptions using web-scale n-grams","author":"Li","year":"2011"},{"key":"10.1016\/j.patcog.2021.108358_bib0023","series-title":"European Conference on Computer Vision","first-page":"15","article-title":"Every picture tells a story: generating sentences from images","author":"Farhadi","year":"2010"},{"issue":"12","key":"10.1016\/j.patcog.2021.108358_bib0024","doi-asserted-by":"crossref","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","article-title":"Baby talk: understanding and generating simple image descriptions","volume":"35","author":"Kulkarni","year":"2013","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2021.108358_bib0025","first-page":"1143","article-title":"Im2Text: describing images using 1 million captioned photographs","volume":"24","author":"Ordonez","year":"2011","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2021.108358_bib0026","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Choosing linguistics over vision to describe images","author":"Gupta","year":"2012"},{"key":"10.1016\/j.patcog.2021.108358_bib0027","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1162\/tacl_a_00188","article-title":"TreeTalk: composition and compression of trees for image descriptions","volume":"2","author":"Kuznetsova","year":"2014","journal-title":"Trans. Assoc. 
Comput.Linguist."},{"key":"10.1016\/j.patcog.2021.108358_bib0028","series-title":"International Conference on Learning Representations","article-title":"Deep captioning with multimodal recurrent neural networks (m-RNN)","author":"Mao","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0029","series-title":"International Conference on Learning Representations","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0030","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"580","article-title":"Rich feature hierarchies for accurate object detection and semantic segmentation","author":"Girshick","year":"2014"},{"key":"10.1016\/j.patcog.2021.108358_bib0031","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1","article-title":"Going deeper with convolutions","author":"Szegedy","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0032","series-title":"International Conference on Machine Learning","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","author":"Xu","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0033","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"4133","article-title":"Image caption with global-local attention","author":"Li","year":"2017"},{"key":"10.1016\/j.patcog.2021.108358_bib0034","doi-asserted-by":"crossref","first-page":"107075","DOI":"10.1016\/j.patcog.2019.107075","article-title":"Learning visual relationship and context-aware attention for image captioning","volume":"98","author":"Wang","year":"2020","journal-title":"Pattern Recognit."},{"issue":"1","key":"10.1016\/j.patcog.2021.108358_bib0035","doi-asserted-by":"crossref","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","article-title":"Visual genome: connecting language and vision using crowdsourced dense image annotations","volume":"123","author":"Krishna","year":"2017","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2021.108358_bib0036","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"4634","article-title":"Attention on attention for image captioning","author":"Huang","year":"2019"},{"key":"10.1016\/j.patcog.2021.108358_bib0037","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"8948","article-title":"nocaps: novel object captioning at scale","author":"Agrawal","year":"2019"},{"key":"10.1016\/j.patcog.2021.108358_bib0038","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"9962","article-title":"Say as you wish: fine-grained control of image caption generation with abstract scene graphs","author":"Chen","year":"2020"},{"issue":"11","key":"10.1016\/j.patcog.2021.108358_bib0039","doi-asserted-by":"crossref","first-page":"1254","DOI":"10.1109\/34.730558","article-title":"A model of saliency-based visual attention for rapid scene analysis","volume":"20","author":"Itti","year":"1998","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"10.1016\/j.patcog.2021.108358_bib0040","doi-asserted-by":"crossref","first-page":"106987","DOI":"10.1016\/j.patcog.2019.106987","article-title":"Video saliency detection by gestalt theory","volume":"96","author":"Fang","year":"2019","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.108358_bib0041","doi-asserted-by":"crossref","first-page":"107294","DOI":"10.1016\/j.patcog.2020.107294","article-title":"DevsNet: deep video saliency network using short-term and long-term cues","volume":"103","author":"Fang","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2021.108358_bib0042","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7132","article-title":"Squeeze-and-excitation networks","author":"Hu","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0043","series-title":"European Conference on Computer Vision","first-page":"3","article-title":"CBAM: convolutional block attention module","author":"Woo","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0044","series-title":"International Conference on Medical Image Computing and Computer-Assisted Intervention","first-page":"421","article-title":"Concurrent spatial and channel squeeze & excitation in fully convolutional networks","author":"Roy","year":"2018"},{"key":"10.1016\/j.patcog.2021.108358_bib0045","series-title":"International Conference on Learning Representations","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2015"},{"key":"10.1016\/j.patcog.2021.108358_bib0046","series-title":"Advances in Neural Information Processing Systems","first-page":"5998","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.patcog.2021.108358_bib0047","series-title":"Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"issue":"12","key":"10.1016\/j.patcog.2021.108358_bib0048","doi-asserted-by":"crossref","first-page":"8","DOI":"10.1167\/8.12.8","article-title":"Maximum differentiation (mad) competition: amethodology for comparing computational models of perceptual quantities","volume":"8","author":"Wang","year":"2008","journal-title":"J. Vis."},{"issue":"4","key":"10.1016\/j.patcog.2021.108358_bib0049","doi-asserted-by":"crossref","first-page":"851","DOI":"10.1109\/TPAMI.2018.2889948","article-title":"Group maximum differentiation competition: model comparison with few samples","volume":"42","author":"Ma","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"10.1016\/j.patcog.2021.108358_bib0050","series-title":"International Conference on Learning Representations","article-title":"I am going MAD: maximum discrepancy competition for comparing classifiers adaptively","author":"Wang","year":"2020"},{"key":"10.1016\/j.patcog.2021.108358_bib0051","series-title":"Proceedings of the European Chapter of the Association for Computational Linguistics","first-page":"199","article-title":"Re-evaluating automatic metrics for image captioning","author":"Kilickaya","year":"2017"},{"issue":"2","key":"10.1016\/j.patcog.2021.108358_bib0052","doi-asserted-by":"crossref","first-page":"205","DOI":"10.1016\/0022-2496(84)90027-0","article-title":"Inconsistency and rank preservation","volume":"28","author":"Saaty","year":"1984","journal-title":"J. Math. Psychol."},{"issue":"3","key":"10.1016\/j.patcog.2021.108358_bib0053","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"ImageNet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2021.108358_bib0054","series-title":"International Conference on Learning Representations","first-page":"1254","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"issue":"1","key":"10.1016\/j.patcog.2021.108358_bib0055","first-page":"1997","article-title":"Neural architecture search: a survey","volume":"20","author":"Elsken","year":"2019","journal-title":"J. Mach. Learn. Res."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321005380?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320321005380?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2022,12,30]],"date-time":"2022-12-30T10:45:59Z","timestamp":1672397159000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320321005380"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,2]]},"references-count":55,"alternative-id":["S0031320321005380"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2021.108358","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2022,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Revisiting image captioning via maximum discrepancy competition","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2021.108358","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2021 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"108358"}}