{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T12:28:50Z","timestamp":1743769730706,"version":"3.37.3"},"reference-count":60,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,5,1]],"date-time":"2020-05-01T00:00:00Z","timestamp":1588291200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772359","61525206","61872267"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013290","name":"National Key Research and Development Program of China Stem Cell and Translational Research","doi-asserted-by":"publisher","award":["2017YFC0820600"],"id":[{"id":"10.13039\/501100013290","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Tianjin New Generation Artificial Intelligence Major Program","award":["18ZXZNGX00150"]},{"DOI":"10.13039\/501100004835","name":"Zhejiang University","doi-asserted-by":"publisher","award":["A1907"],"id":[{"id":"10.13039\/501100004835","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004517","name":"Tianjin University","doi-asserted-by":"publisher","award":["2019XRX-0035"],"id":[{"id":"10.13039\/501100004517","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2020,5]]},"DOI":"10.1109\/tmm.2019.2941820","type":"journal-article","created":{"date-parts":[[2019,9,18]],"date-time":"2019-09-18T19:55:41Z","timestamp":1568836541000},"page":"1372-1383","source":"Crossref","is-referenced-by-count":82,"title":["Multi-Level Policy and Reward-Based Deep Reinforcement Learning Framework for Image Captioning"],"prefix":"10.1109","volume":"22","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7526-4356","authenticated-orcid":false,"given":"Ning","family":"Xu","sequence":"first","affiliation":[]},{"given":"Hanwang","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5755-9145","authenticated-orcid":false,"given":"An-An","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0578-8138","authenticated-orcid":false,"given":"Weizhi","family":"Nie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5165-204X","authenticated-orcid":false,"given":"Yuting","family":"Su","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Nie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1151-1792","authenticated-orcid":false,"given":"Yongdong","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"ref33","article-title":"Deep captioning with multimodal recurrent neural networks (M-RNN)","author":"mao","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/563"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729400"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2010.2041101"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"529","DOI":"10.1038\/nature14236","article-title":"Human-level control through deep reinforcement learning","volume":"518","author":"mnih","year":"2015","journal-title":"Nature"},{"key":"ref34","first-page":"747","article-title":"Midge: Generating image descriptions from computer vision detections","author":"mitchell","year":"0","journal-title":"Proc 13th Conf Eur Chapter Assoc Comput Linguistics"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2017.7989381"},{"key":"ref28","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3115\/1073445.1073465"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/114"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref1","first-page":"1250","article-title":"Generating image descriptions using dependency relational patterns","author":"aker","year":"0","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref20","first-page":"3294","article-title":"Skip-thought vectors","author":"kiros","year":"2015","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.275"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.748"},{"key":"ref24","first-page":"359","article-title":"Collective generation of natural image descriptions","author":"kuznetsova","year":"0","journal-title":"Proc Assoc Comput Linguistics"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.162"},{"key":"ref26","first-page":"220","article-title":"Composing simple image descriptions using web-scale n-grams","author":"li","year":"0","journal-title":"Proc 15th Conf Computational Natural Language Learning"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00188"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.29"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2722687"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.148"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref54","article-title":"Encode, review, and decode: Reviewer module for caption generation","author":"yang","year":"0","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref53","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998364"},{"key":"ref10","first-page":"1292","article-title":"Image description using visual dependency representations","author":"elliott","year":"0","journal-title":"Proc Conf Empirical Methods Natural Lang Process"},{"key":"ref11","first-page":"15","article-title":"Every picture tells a story: Generating sentences from images","author":"farhadi","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref40","article-title":"Automatic differentiation in pytorch","author":"paszke","year":"0","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.277"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2638622"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"ref17","first-page":"1889","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","author":"karpathy","year":"0","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref18","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"article-title":"Unifying visual-semantic embeddings with multimodal neural language models","year":"0","author":"kiros","key":"ref19"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.286"},{"key":"ref3","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"banerjee","year":"0","journal-title":"Proc ACL Workshop Intrinsic Extrinsic Eval Measures Mach Transl"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00146"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1038\/nrn755"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00834"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1038\/nature16961"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref48","first-page":"1057","article-title":"Policy gradient methods for reinforcement learning with function approximation","author":"sutton","year":"0","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref47","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"0","journal-title":"Proc Conf Neural Inf Process Syst"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2967212"},{"key":"ref41","article-title":"Sequence level training with recurrent neural networks","author":"ranzato","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.128"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/9076751\/08844130.pdf?arnumber=8844130","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T15:55:20Z","timestamp":1651074920000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8844130\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,5]]},"references-count":60,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tmm.2019.2941820","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"type":"print","value":"1520-9210"},{"type":"electronic","value":"1941-0077"}],"subject":[],"published":{"date-parts":[[2020,5]]}}}