{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:53:27Z","timestamp":1740102807576,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,27]]},"DOI":"10.1145\/3634814.3634842","type":"proceedings-article","created":{"date-parts":[[2024,3,26]],"date-time":"2024-03-26T22:47:42Z","timestamp":1711493262000},"page":"210-215","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Performance and Cost Balancing in Vision Transformer-Based Image Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7329-2225","authenticated-orcid":false,"given":"Yan","family":"Lyu","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, University of Aizu, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4663-6739","authenticated-orcid":false,"given":"Yong","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Aizu, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3101-749X","authenticated-orcid":false,"given":"Qiangfu","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, University of Aizu, Japan"}]}],"member":"320","published-online":{"date-parts":[[2024,3,26]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Cornia M Stefanini M Baraldi L Meshed-memory transformer for image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2020: 10578-10587.","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"crossref","unstructured":"Fang Z Wang J Hu X Injecting semantic concepts into end-to-end image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022: 18009-18019.","DOI":"10.1109\/CVPR52688.2022.01748"},{"key":"e_1_3_2_1_3_1","volume-title":"Image captioning: Transforming objects into words[J]. Advances in neural information processing systems","author":"Herdade S","year":"2019","unstructured":"Herdade S, Kappeler A, Boakye K, Image captioning: Transforming objects into words[J]. Advances in neural information processing systems, 2019, 32."},{"key":"e_1_3_2_1_4_1","volume-title":"Scheduled sampling in vision-language pretraining with decoupled encoder-decoder network[C]\/\/Proceedings of the AAAI Conference on Artificial Intelligence","author":"Li Y","year":"2021","unstructured":"Li Y, Pan Y, Yao T, Scheduled sampling in vision-language pretraining with decoupled encoder-decoder network[C]\/\/Proceedings of the AAAI Conference on Artificial Intelligence. 2021, 35(10): 8518-8526."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Li Y Pan Y Yao T Comprehending and ordering semantics for image captioning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2022: 17990-17999.","DOI":"10.1109\/CVPR52688.2022.01746"},{"volume-title":"CoCo-BERT: Improving video-language pre-training with contrastive cross-modal matching and denoising[C]\/\/Proceedings of the 29th ACM International Conference on Multimedia. 
2021: 5600-5608","author":"Luo J","key":"e_1_3_2_1_6_1","unstructured":"Luo J, Li Y, Pan Y, CoCo-BERT: Improving video-language pre-training with contrastive cross-modal matching and denoising[C]\/\/Proceedings of the 29th ACM International Conference on Multimedia. 2021: 5600-5608."},{"volume-title":"Auto-captions on GIF: A large-scale video-sentence dataset for vision-language pre-training[C]\/\/Proceedings of the 30th ACM International Conference on Multimedia. 2022: 7070-7074","author":"Pan Y","key":"e_1_3_2_1_7_1","unstructured":"Pan Y, Li Y, Luo J, Auto-captions on GIF: A large-scale video-sentence dataset for vision-language pre-training[C]\/\/Proceedings of the 30th ACM International Conference on Multimedia. 2022: 7070-7074."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Sharma P Ding N Goodman S Conceptual captions: A cleaned hypernymed image alt-text dataset for automatic image captioning[C]\/\/Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). 2018: 2556-2565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3036860"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Yao T Pan Y Li Y Hierarchy parsing for image captioning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 2019: 2621-2629.","DOI":"10.1109\/ICCV.2019.00271"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Vinyals O Toshev A Bengio S Show and tell: A neural image caption generator[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2015: 3156-3164.","DOI":"10.1109\/CVPR.2015.7298935"},{"volume-title":"PMLR","author":"Xu K","key":"e_1_3_2_1_12_1","unstructured":"Xu K, Ba J, Kiros R, Show, attend and tell: Neural image caption generation with visual attention[C]\/\/International conference on machine learning. PMLR, 2015: 2048-2057."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Anderson P He X Buehler C Bottom-up and top-down attention for image captioning and visual question answering[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2018: 6077-6086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Jiang W Ma L Jiang Y G Recurrent fusion network for image captioning[C]\/\/Proceedings of the European conference on computer vision (ECCV). 2018: 499-515.","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Yao T Pan Y Li Y Exploring visual relationship for image captioning[C]\/\/Proceedings of the European conference on computer vision (ECCV). 2018: 684-699.","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"e_1_3_2_1_16_1","volume-title":"Lee K","author":"Devlin J","year":"1810","unstructured":"Devlin J, Chang M W, Lee K, Bert: Pre-training of deep bidirectional transformers for language understanding[J]. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_17_1","volume-title":"Improving language understanding by generative pre-training[J]","author":"Radford A","year":"2018","unstructured":"Radford A, Narasimhan K, Salimans T, Improving language understanding by generative pre-training[J]. 2018."},{"key":"e_1_3_2_1_18_1","volume-title":"Yang Y","author":"Yang Z","year":"2019","unstructured":"Yang Z, Dai Z, Yang Y, Xlnet: Generalized autoregressive pretraining for language understanding[J]. 
Advances in neural information processing systems, 2019, 32."},{"key":"e_1_3_2_1_19_1","volume-title":"Li Y","author":"Sun Y","year":"1904","unstructured":"Sun Y, Wang S, Li Y, Ernie: Enhanced representation through knowledge integration[J]. arXiv preprint arXiv:1904.09223, 2019."},{"key":"e_1_3_2_1_20_1","volume-title":"A continual pre-training framework for language understanding[C]\/\/Proceedings of the AAAI conference on artificial intelligence","author":"Sun Y","year":"2020","unstructured":"Sun Y, Wang S, Li Y, Ernie 2.0: A continual pre-training framework for language understanding[C]\/\/Proceedings of the AAAI conference on artificial intelligence. 2020, 34(05): 8968-8975."},{"key":"e_1_3_2_1_21_1","volume-title":"Roberta: A robustly optimized bert pretraining approach[J]. arXiv preprint arXiv:1907.11692","author":"Liu Y","year":"2019","unstructured":"Liu Y, Ott M, Goyal N, Roberta: A robustly optimized bert pretraining approach[J]. arXiv preprint arXiv:1907.11692, 2019."},{"key":"e_1_3_2_1_22_1","volume-title":"Unified language model pre-training for natural language understanding and generation[J]. Advances in neural information processing systems","author":"Dong L","year":"2019","unstructured":"Dong L, Yang N, Wang W, Unified language model pre-training for natural language understanding and generation[J]. Advances in neural information processing systems, 2019, 32."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_24_1","volume-title":"Goyal N","author":"Lewis M","year":"1910","unstructured":"Lewis M, Liu Y, Goyal N, Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension[J]. arXiv preprint arXiv:1910.13461, 2019."},{"volume-title":"PMLR","author":"Chen T","key":"e_1_3_2_1_25_1","unstructured":"Chen T, Kornblith S, Norouzi M, A simple framework for contrastive learning of visual representations[C]\/\/International conference on machine learning. PMLR, 2020: 1597-1607."},{"key":"e_1_3_2_1_26_1","volume-title":"Improved baselines with momentum contrastive learning[J]. arXiv preprint arXiv:2003.04297","author":"Chen X","year":"2020","unstructured":"Chen X, Fan H, Girshick R, Improved baselines with momentum contrastive learning[J]. arXiv preprint arXiv:2003.04297, 2020."},{"key":"e_1_3_2_1_27_1","volume-title":"Bootstrap your own latent-a new approach to self-supervised learning[J]. Advances in neural information processing systems","author":"Grill J B","year":"2020","unstructured":"Grill J B, Strub F, Altch\u00e9 F, Bootstrap your own latent-a new approach to self-supervised learning[J]. Advances in neural information processing systems, 2020, 33: 21271-21284."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Chen X He K. Exploring simple siamese representation learning[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 2021: 15750-15758.","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"e_1_3_2_1_29_1","volume-title":"Piao S","author":"Bao H","year":"2021","unstructured":"Bao H, Dong L, Piao S, Beit: Bert pre-training of image transformers[J]. arXiv preprint arXiv:2106.08254, 2021."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"He K Chen X Xie S Masked autoencoders are scalable vision learners[C]\/\/Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. 
2022: 16000-16009.","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"e_1_3_2_1_31_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale[J]. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy A","year":"2020","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, An image is worth 16x16 words: Transformers for image recognition at scale[J]. arXiv preprint arXiv:2010.11929, 2020."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Girshick R. Fast r-cnn[C]\/\/Proceedings of the IEEE international conference on computer vision. 2015: 1440-1448.","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"He K Zhang X Ren S Deep residual learning for image recognition[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2016: 770-778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Lu J Xiong C Parikh D Knowing when to look: Adaptive attention via a visual sentinel for image captioning[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 375-383.","DOI":"10.1109\/CVPR.2017.345"},{"key":"e_1_3_2_1_35_1","volume-title":"Vision-to-language tasks based on attributes and attention mechanism[J]","author":"Li X","year":"2019","unstructured":"Li X, Yuan A, Lu X. Vision-to-language tasks based on attributes and attention mechanism[J]. IEEE transactions on cybernetics, 2019, 51(2): 913-926."},{"key":"e_1_3_2_1_36_1","volume-title":"Dual-level collaborative transformer for image captioning[C]\/\/Proceedings of the AAAI conference on artificial intelligence","author":"Luo Y","year":"2021","unstructured":"Luo Y, Ji J, Sun X, Dual-level collaborative transformer for image captioning[C]\/\/Proceedings of the AAAI conference on artificial intelligence. 2021, 35(3): 2286-2293."},{"key":"e_1_3_2_1_37_1","volume-title":"Men R","author":"Wang P","year":"2022","unstructured":"Wang P, Yang A, Men R, Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework[C]\/\/International Conference on Machine Learning. PMLR, 2022: 23318-23340."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1561\/0600000105"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Wang W Bao H Dong L Image as a Foreign Language: BEiT Pretraining for Vision and Vision-Language Tasks[C]\/\/Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 2023: 19175-19186.","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_40_1","volume-title":"Attention is all you need[J]. Advances in neural information processing systems","author":"Vaswani A","year":"2017","unstructured":"Vaswani A, Shazeer N, Parmar N, Attention is all you need[J]. Advances in neural information processing systems, 2017, 30."},{"key":"e_1_3_2_1_41_1","volume-title":"Long short-term memory[J]. Neural computation","author":"Hochreiter S","year":"1997","unstructured":"Hochreiter S, Schmidhuber J. Long short-term memory[J]. Neural computation, 1997, 9(8): 1735-1780."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Karpathy A Fei-Fei L. Deep visual-semantic alignments for generating image descriptions[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 
2015: 3128-3137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.48084\/etasr.4772"},{"key":"e_1_3_2_1_44_1","volume-title":"Image Captioning based on Deep Convolutional Neural Networks and LSTM[C]\/\/2022 2nd International Conference on Power Electronics & IoT Applications in Renewable Energy and its Control (PARC)","author":"Srivastava S","year":"2022","unstructured":"Srivastava S, Sharma H, Dixit P. Image Captioning based on Deep Convolutional Neural Networks and LSTM[C]\/\/2022 2nd International Conference on Power Electronics & IoT Applications in Renewable Energy and its Control (PARC). IEEE, 2022: 1-4."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Lu J Yang J Batra D Neural baby talk[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2018: 7219-7228.","DOI":"10.1109\/CVPR.2018.00754"},{"key":"e_1_3_2_1_46_1","volume-title":"Generating sequences with recurrent neural networks[J]. arXiv preprint arXiv:1308.0850","author":"Graves A.","year":"2013","unstructured":"Graves A. Generating sequences with recurrent neural networks[J]. arXiv preprint arXiv:1308.0850, 2013."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Rennie S J Marcheret E Mroueh Y Self-critical sequence training for image captioning[C]\/\/Proceedings of the IEEE conference on computer vision and pattern recognition. 2017: 7008-7024.","DOI":"10.1109\/CVPR.2017.131"},{"key":"e_1_3_2_1_48_1","volume-title":"Image Caption Generator in Hindi Using Attention[M]\/\/Advanced Production and Industrial Engineering","author":"Sethi A","year":"2022","unstructured":"Sethi A, Jain A, Dhiman C. Image Caption Generator in Hindi Using Attention[M]\/\/Advanced Production and Industrial Engineering. IOS Press, 2022: 101-107."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Huang L Wang W Chen J Attention on attention for image captioning[C]\/\/Proceedings of the IEEE\/CVF international conference on computer vision. 2019: 4634-4643.","DOI":"10.1109\/ICCV.2019.00473"},{"key":"e_1_3_2_1_50_1","first-page":"2023","article-title":"Amharic Language Image Captions Generation Using Hybridized Attention-Based Deep Neural Networks[J]","author":"Solomon R","year":"2023","unstructured":"Solomon R, Abebe M. Amharic Language Image Captions Generation Using Hybridized Attention-Based Deep Neural Networks[J]. Applied Computational Intelligence and Soft Computing, 2023, 2023.","journal-title":"Applied Computational Intelligence and Soft Computing"},{"key":"e_1_3_2_1_51_1","volume-title":"Zhuo Y","author":"Zhang T","year":"2023","unstructured":"Zhang T, Zhang T, Zhuo Y, CATANIC: Automatic generation model of image captions based on multiple attention mechanism[J]. 
2023."}],"event":{"name":"ASSE 2023: 2023 4th Asia Service Sciences and Software Engineering Conference","acronym":"ASSE 2023","location":"Aizu-Wakamatsu City Japan"},"container-title":["Proceedings of the 2023 4th Asia Service Sciences and Software Engineering Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3634814.3634842","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,10]],"date-time":"2024-04-10T15:32:00Z","timestamp":1712763120000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3634814.3634842"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,27]]},"references-count":51,"alternative-id":["10.1145\/3634814.3634842","10.1145\/3634814"],"URL":"https:\/\/doi.org\/10.1145\/3634814.3634842","relation":{},"subject":[],"published":{"date-parts":[[2023,10,27]]},"assertion":[{"value":"2024-03-26","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}