{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T00:49:48Z","timestamp":1745455788724},"reference-count":43,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,12]]},"DOI":"10.1109\/iccv.2015.512","type":"proceedings-article","created":{"date-parts":[[2016,2,19]],"date-time":"2016-02-19T23:23:49Z","timestamp":1455924229000},"source":"Crossref","is-referenced-by-count":682,"title":["Describing Videos by Exploiting Temporal Structure"],"prefix":"10.1109","author":[{"given":"Li","family":"Yao","sequence":"first","affiliation":[]},{"given":"Atousa","family":"Torabi","sequence":"additional","affiliation":[]},{"given":"Kyunghyun","family":"Cho","sequence":"additional","affiliation":[]},{"given":"Nicolas","family":"Ballas","sequence":"additional","affiliation":[]},{"given":"Christopher","family":"Pal","sequence":"additional","affiliation":[]},{"given":"Hugo","family":"Larochelle","sequence":"additional","affiliation":[]},{"given":"Aaron","family":"Courville","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1173"},{"key":"ref33","doi-asserted-by":"crossref","first-page":"140","DOI":"10.1007\/978-3-642-15567-3_11","article-title":"Convolutional learning of spatio-temporal features","author":"taylor","year":"2010","journal-title":"Computer Vision-ECCV 2010"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247808"},{"key":"ref31","article-title":"Going deeper with convolutions","author":"szegedy","year":"2015","journal-title":"CVPR"},{"key":"ref30","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"NIPS"},{"key":"ref37","article-title":"CIDEr: Consensus-based image description evaluation","author":"vedantam","year":"2015","journal-title":"CVPR"},{"key":"ref36","article-title":"C3D: Generic features for video analysis","author":"tran","year":"2014","journal-title":"arXiv 1412 0767"},{"key":"ref35","article-title":"Using descriptive video services to create a large data source for video annotation research","author":"torabi","year":"2015","journal-title":"arXiv 1503 01070v1"},{"key":"ref34","article-title":"Integrating language and vision to generate natural language descriptions of videos in the wild","author":"thomason","year":"2014","journal-title":"COLING"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"ref11","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2015","journal-title":"CVPR"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.5244\/C.23.124"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.65"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.1613\/jair.3994","article-title":"Framing image description as a ranking task: Data, models and evaluation metrics","author":"hodosh","year":"2013","journal-title":"Journal of Artificial Intelligence 
Research"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.59"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654889"},{"key":"ref17","article-title":"Deep visual-semantic alignments for generating image descriptions","author":"karpathy","year":"2014","journal-title":"CVPR"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref19","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"ACL"},{"key":"ref28","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"2014","journal-title":"NIPS"},{"key":"ref4","doi-asserted-by":"crossref","DOI":"10.25080\/Majora-92bf1922-003","article-title":"Theano: a CPU and GPU math expression compiler","author":"bergstra","year":"2010","journal-title":"Proceedings of the Python for Scientific Computing Conference (SciPy)"},{"key":"ref27","article-title":"Overfeat: Integrated recognition, localization and detection using convolutional networks","author":"sermanet","year":"2014","journal-title":"ICLRE"},{"key":"ref3","article-title":"Theano: new features and speed improvements","author":"bastien","year":"2012","journal-title":"NIPS 2012 Workshop on Deep Learning and Unsupervised Feature Learning"},{"key":"ref6","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"chen","year":"2011","journal-title":"ACL"},{"key":"ref29","article-title":"Unsupervised learning of video representations using lstms","author":"srivastava","year":"2015","journal-title":"arXiv 1502 04681"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_41"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref7","article-title":"microsoft coco captions: Data collection and evaluation server","author":"chen","year":"2015","journal-title":"arXiv 1504 00325"},{"key":"ref2","article-title":"Video in sentences out","author":"barbu","year":"2012","journal-title":"UAI"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/11744047_33"},{"key":"ref1","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"ICLRE"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1023\/A:1020346032608"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-12900-1_14"},{"key":"ref21","article-title":"Imagenet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"NIPS"},{"key":"ref24","article-title":"Video (language) modeling: a baseline for generative models of natural videos","author":"ranzato","year":"2014","journal-title":"arXiv 1412 6604"},{"key":"ref42","article-title":"Recurrent neural network regularization","author":"zaremba","year":"2014","journal-title":"arXiv 1409 2329"},{"key":"ref23","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"ACL"},{"key":"ref41","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"ICML"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.61"},{"key":"ref25","article-title":"A dataset for movie description","author":"rohrbach","year":"2015","journal-title":"CVPR"},{"key":"ref43","article-title":"ADADELTA: an adaptive 
learning rate method","author":"zeiler","year":"2012","journal-title":"Technical Report"}],"event":{"name":"2015 IEEE International Conference on Computer Vision (ICCV)","location":"Santiago, Chile","start":{"date-parts":[[2015,12,7]]},"end":{"date-parts":[[2015,12,13]]}},"container-title":["2015 IEEE International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7407725\/7410356\/07410869.pdf?arnumber=7410869","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,9,15]],"date-time":"2020-09-15T21:07:29Z","timestamp":1600204049000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7410869\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,12]]},"references-count":43,"URL":"https:\/\/doi.org\/10.1109\/iccv.2015.512","relation":{},"subject":[],"published":{"date-parts":[[2015,12]]}}}