{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,3]],"date-time":"2024-07-03T23:20:50Z","timestamp":1720048850616},"reference-count":30,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2020,4,1]],"date-time":"2020-04-01T00:00:00Z","timestamp":1585699200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672133","61632007"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition Letters"],"published-print":{"date-parts":[[2020,4]]},"DOI":"10.1016\/j.patrec.2018.06.030","type":"journal-article","created":{"date-parts":[[2018,6,30]],"date-time":"2018-06-30T09:16:47Z","timestamp":1530350207000},"page":"62-68","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":8,"special_numbering":"C","title":["Movie fill in the blank by joint learning from video and text with adaptive temporal attention"],"prefix":"10.1016","volume":"132","author":[{"given":"Jie","family":"Chen","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0003-2615-1555","authenticated-orcid":false,"given":"Jie","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Chengkun","family":"He","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patrec.2018.06.030_bib0001","series-title":"2015\u202fIEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7\u201313, 2015","first-page":"2425","article-title":"VQA: visual question answering","author":"Antol","year":"2015"},{"key":"10.1016\/j.patrec.2018.06.030_bib0002","series-title":"Proceedings of the 2017\u202fACM on Conference on Information and Knowledge Management, CIKM 2017, Singapore, November 06 - 10, 2017","first-page":"1039","article-title":"Movie fill in the blank with adaptive temporal attention and description update","author":"Chen","year":"2017"},{"key":"10.1016\/j.patrec.2018.06.030_bib0003","article-title":"Recurrent batch normalization","author":"Cooijmans","year":"2016","journal-title":"CoRR"},{"issue":"9","key":"10.1016\/j.patrec.2018.06.030_bib0004","doi-asserted-by":"crossref","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","article-title":"Video captioning with attention-based LSTM and semantic consistency","volume":"19","author":"Gao","year":"2017","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patrec.2018.06.030_bib0005","series-title":"IEEE International Conference on Computer Vision, ICCV 2013, Sydney, Australia, December 1\u20138, 2013","first-page":"2712","article-title":"Youtube2text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition","author":"Guadarrama","year":"2013"},{"key":"10.1016\/j.patrec.2018.06.030_bib0006","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1016\/j.neucom.2016.09.129","article-title":"Exploiting score distribution for heterogenous feature fusion in image classification","volume":"253","author":"He","year":"2017","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patrec.2018.06.030_bib0007","series-title":"2016\u202fIEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"issue":"1","key":"10.1016\/j.patrec.2018.06.030_bib0008","doi-asserted-by":"crossref","first-page":"114","DOI":"10.1109\/TKDE.2016.2617326","article-title":"If-matching: towards accurate map-matching with information fusion","volume":"29","author":"Hu","year":"2017","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"10.1016\/j.patrec.2018.06.030_bib0009","series-title":"Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems 2016, December 5\u201310, 2016, Barcelona, Spain","first-page":"361","article-title":"Multimodal residual learning for visual QA","author":"Kim","year":"2016"},{"issue":"6","key":"10.1016\/j.patrec.2018.06.030_bib0010","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1145\/3065386","article-title":"ImageNet classification with deep convolutional neural networks","volume":"60","author":"Krizhevsky","year":"2017","journal-title":"Commun. ACM"},{"key":"10.1016\/j.patrec.2018.06.030_bib0011","series-title":"Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems 2016, December 5\u201310, 2016, Barcelona, Spain","first-page":"4655","article-title":"Visual question answering with question representation update (QRU)","author":"Li","year":"2016"},{"key":"10.1016\/j.patrec.2018.06.030_bib0012","series-title":"Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems 2016, December 5\u201310, 2016, Barcelona, Spain","first-page":"289","article-title":"Hierarchical question-image co-attention for visual question answering","author":"Lu","year":"2016"},{"key":"10.1016\/j.patrec.2018.06.030_bib0013","series-title":"2017\u202fIEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21\u201326, 2017","first-page":"7359","article-title":"A dataset and exploration of models for understanding video data through fill-in-the-blank question-answering","author":"Maharaj","year":"2017"},{"key":"10.1016\/j.patrec.2018.06.030_bib0014","series-title":"Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8\u201313 2014, Montreal, Quebec, Canada","first-page":"1682","article-title":"A multi-world approach to question answering about real-world scenes based on uncertain input","author":"Malinowski","year":"2014"},{"key":"10.1016\/j.patrec.2018.06.030_bib0015","article-title":"Video fill in the blank with merging LSTMs","author":"Mazaheri","year":"2016","journal-title":"CoRR"},{"key":"10.1016\/j.patrec.2018.06.030_bib0016","article-title":"Efficient estimation of word representations in vector space","author":"Mikolov","year":"2013","journal-title":"CoRR"},{"key":"10.1016\/j.patrec.2018.06.030_bib0017","series-title":"Pattern Recognition - 36th German Conference, GCPR 2014, M\u00fcnster, Germany, September 2\u20135, 2014, Proceedings","first-page":"184","article-title":"Coherent multi-sentence video description with variable level of detail","author":"Rohrbach","year":"2014"},{"key":"10.1016\/j.patrec.2018.06.030_bib0018","series-title":"IEEE International Conference on Computer Vision, ICCV 2013, Sydney, Australia, December 1\u20138, 2013","first-page":"433","article-title":"Translating video content to natural language descriptions","author":"Rohrbach","year":"2013"},{"key":"10.1016\/j.patrec.2018.06.030_bib0019","series-title":"2016\u202fIEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016","first-page":"4613","article-title":"Where to look: focus regions for visual question answering","author":"Shih","year":"2016"},{"key":"10.1016\/j.patrec.2018.06.030_bib0020","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014","journal-title":"CoRR"},{"key":"10.1016\/j.patrec.2018.06.030_bib0021","series-title":"2015\u202fIEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7\u201313, 2015","first-page":"4534","article-title":"Sequence to sequence - video to text","author":"Venugopalan","year":"2015"},{"key":"10.1016\/j.patrec.2018.06.030_bib0022","series-title":"NAACL HLT 2015, The 2015 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Denver, Colorado, USA, May 31 - June 5, 2015","first-page":"1494","article-title":"Translating videos to natural language using deep recurrent neural networks","author":"Venugopalan","year":"2015"},{"key":"10.1016\/j.patrec.2018.06.030_bib0023","first-page":"707","article-title":"A long short-term memory model for answer sentence selection in question answering","author":"Wang","year":"2015"},{"key":"10.1016\/j.patrec.2018.06.030_bib0024","series-title":"Proceedings of the 33nd International Conference on Machine Learning, ICML 2016, New York City, NY, USA, June 19\u201324, 2016","first-page":"2397","article-title":"Dynamic memory networks for visual and textual question answering","author":"Xiong","year":"2016"},{"key":"10.1016\/j.patrec.2018.06.030_bib0025","series-title":"2016\u202fIEEE Conference on Computer Vision and Pattern Recognition, CVPR 2016, Las Vegas, NV, USA, June 27\u201330, 2016","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"10.1016\/j.patrec.2018.06.030_bib0026","series-title":"2015\u202fIEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7\u201313, 2015","first-page":"4507","article-title":"Describing videos by exploiting temporal structure","author":"Yao","year":"2015"},{"key":"10.1016\/j.patrec.2018.06.030_bib0027","article-title":"Video captioning and retrieval models with semantic attention","author":"Yu","year":"2016","journal-title":"CoRR"},{"key":"10.1016\/j.patrec.2018.06.030_bib0028","series-title":"Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, February 4\u20139, 2017, San Francisco, California, USA.","first-page":"4334","article-title":"Leveraging video descriptions to learn video question answering","author":"Zeng","year":"2017"},{"issue":"3","key":"10.1016\/j.patrec.2018.06.030_bib0029","doi-asserted-by":"crossref","first-page":"409","DOI":"10.1007\/s11263-017-1033-7","article-title":"Uncovering the temporal context for video question answering","volume":"124","author":"Zhu","year":"2017","journal-title":"Int. J. Comput. Vis."},{"issue":"3","key":"10.1016\/j.patrec.2018.06.030_bib0030","doi-asserted-by":"crossref","first-page":"517","DOI":"10.1109\/TKDE.2017.2763618","article-title":"Local and global structure preservation for robust unsupervised spectral feature selection","volume":"30","author":"Zhu","year":"2018","journal-title":"IEEE Trans. Knowl. Data Eng."}],"container-title":["Pattern Recognition Letters"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167865518302794?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167865518302794?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2020,4,15]],"date-time":"2020-04-15T17:04:38Z","timestamp":1586970278000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167865518302794"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,4]]},"references-count":30,"alternative-id":["S0167865518302794"],"URL":"https:\/\/doi.org\/10.1016\/j.patrec.2018.06.030","relation":{},"ISSN":["0167-8655"],"issn-type":[{"value":"0167-8655","type":"print"}],"subject":[],"published":{"date-parts":[[2020,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Movie fill in the blank by joint learning from video and text with adaptive temporal attention","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition Letters","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patrec.2018.06.030","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2018 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}