{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,19]],"date-time":"2025-04-19T13:27:47Z","timestamp":1745069267519},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2019,9,1]],"date-time":"2019-09-01T00:00:00Z","timestamp":1567296000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2019,9]]},"DOI":"10.1016\/j.neucom.2019.05.027","type":"journal-article","created":{"date-parts":[[2019,5,14]],"date-time":"2019-05-14T16:10:26Z","timestamp":1557850226000},"page":"24-35","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":20,"special_numbering":"C","title":["Semantic-filtered Soft-Split-Aware video captioning with audio-augmented feature"],"prefix":"10.1016","volume":"357","author":[{"given":"Yuecong","family":"Xu","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-8075-0439","authenticated-orcid":false,"given":"Jianfei","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Kezhi","family":"Mao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2019.05.027_bib0001","series-title":"NAACL HLT","article-title":"Translating videos to natural language using deep recurrent neural networks","author":"Venugopalan","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_bib0002","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1473","article-title":"From captions to visual concepts and back","author":"Fang","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0003","series-title":"Proceedings of the 32nd International Conference on Machine Learning","first-page":"2048","article-title":"Show, attend and tell: neural image caption generation with visual attention","volume":"37","author":"Xu","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_bib0004","series-title":"2015 IEEE International Conference on Computer Vision (ICCV)","first-page":"2407","article-title":"Guiding the long-short term memory model for image caption generation","author":"Jia","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_bib0005","article-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)","author":"Mao","year":"2015","journal-title":"ICLR 2015"},{"key":"10.1016\/j.neucom.2019.05.027_bib0006","doi-asserted-by":"crossref","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","article-title":"A survey on automatic image caption generation","volume":"311","author":"Bai","year":"2018","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2019.05.027_bib0007","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2015","journal-title":"ICLR 2015"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0008","series-title":"Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)","first-page":"1724","article-title":"Learning phrase representations using rnn encoder\u2013decoder for statistical machine translation","author":"Cho","year":"2014"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0009","series-title":"Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"issue":"8","key":"10.1016\/j.neucom.2019.05.027_bib0010","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural Comput."},{"key":"10.1016\/j.neucom.2019.05.027_bib0011","doi-asserted-by":"crossref","first-page":"150","DOI":"10.1016\/j.neucom.2014.02.049","article-title":"Encoder combined video moving object detection","volume":"139","author":"Tong","year":"2014","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2019.05.027_bib0012","doi-asserted-by":"crossref","first-page":"69","DOI":"10.1016\/j.neucom.2017.02.004","article-title":"Compressive perceptual hashing tracking","volume":"239","author":"Chen","year":"2017","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2019.05.027_bib0013","series-title":"2015 IEEE International Conference on Computer Vision (ICCV)","first-page":"4534","article-title":"Sequence to sequence \u2013 video to text","author":"Venugopalan","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_bib0014","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"203","article-title":"What value do explicit high level concepts have in vision to language problems?","author":"Wu","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0015","series-title":"Proceedings of the 2017 ACM on Multimedia Conference","first-page":"1014","article-title":"Video description with spatial-temporal attention","author":"Tu","year":"2017"},{"key":"10.1016\/j.neucom.2019.05.027_bib0016","series-title":"2015 IEEE International Conference on Computer Vision (ICCV)","first-page":"4507","article-title":"Describing videos by exploiting temporal structure","author":"Yao","year":"2015"},{"issue":"9","key":"10.1016\/j.neucom.2019.05.027_bib0017","doi-asserted-by":"crossref","first-page":"2045","DOI":"10.1109\/TMM.2017.2729019","article-title":"Video captioning with attention-based lstm and semantic consistency","volume":"19","author":"Gao","year":"2017","journal-title":"IEEE Trans. Multim."},{"key":"10.1016\/j.neucom.2019.05.027_bib0018","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1016\/j.neucom.2017.04.065","article-title":"Detecting shot boundary with sparse coding for video summarization","volume":"266","author":"Li","year":"2017","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2019.05.027_bib0019","series-title":"2013 IEEE International Conference on Computer Vision","first-page":"2712","article-title":"Youtube2text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition","author":"Guadarrama","year":"2013"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0020","series-title":"Proceedings of the Workshop on Vision and Natural Language Processing","first-page":"10","article-title":"Generating natural-language video descriptions using text-mined knowledge","author":"Krishnamoorthy","year":"2013"},{"issue":"4","key":"10.1016\/j.neucom.2019.05.027_bib0021","doi-asserted-by":"crossref","first-page":"664","DOI":"10.1109\/TPAMI.2016.2598339","article-title":"Deep visual-semantic alignments for generating image descriptions","volume":"39","author":"Karpathy","year":"2017","journal-title":"IEEE Trans. Pattern Anal. Mach.Intell."},{"key":"10.1016\/j.neucom.2019.05.027_bib0022","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3156","article-title":"Show and tell: a neural image caption generator","author":"Vinyals","year":"2015"},{"issue":"3","key":"10.1016\/j.neucom.2019.05.027_bib0023","doi-asserted-by":"crossref","first-page":"634","DOI":"10.1109\/TMM.2017.2749159","article-title":"Two-stream 3-d convnet fusion for action recognition in videos with arbitrary size and length","volume":"20","author":"Wang","year":"2018","journal-title":"IEEE Trans. Multim."},{"issue":"4","key":"10.1016\/j.neucom.2019.05.027_bib0024","doi-asserted-by":"crossref","first-page":"510","DOI":"10.1109\/LSP.2016.2611485","article-title":"Beyond frame-level cnn: saliency-aware 3-d cnn with lstm for video action recognition","volume":"24","author":"Wang","year":"2017","journal-title":"IEEE Signal Process. Lett."},{"issue":"7","key":"10.1016\/j.neucom.2019.05.027_bib0025","doi-asserted-by":"crossref","first-page":"3210","DOI":"10.1109\/TIP.2018.2814344","article-title":"Self-supervised video hashing with hierarchical binary auto-encoder","volume":"27","author":"Song","year":"2018","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.neucom.2019.05.027_bib0026","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4584","article-title":"Video paragraph captioning using hierarchical recurrent neural networks","author":"Yu","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0027","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"1029","article-title":"Hierarchical recurrent neural encoder for video representation with application to captioning","author":"Pan","year":"2016"},{"issue":"C","key":"10.1016\/j.neucom.2019.05.027_bib0028","doi-asserted-by":"crossref","first-page":"160","DOI":"10.1016\/j.ins.2016.06.029","article-title":"Boosted random contextual semantic space based representation for visual recognition","volume":"369","author":"Zhang","year":"2016","journal-title":"Inf. Sci."},{"key":"10.1016\/j.neucom.2019.05.027_bib0029","doi-asserted-by":"crossref","first-page":"125","DOI":"10.1016\/j.ins.2016.10.019","article-title":"Image classification by search with explicitly and implicitly semantic representations","volume":"376","author":"Zhang","year":"2017","journal-title":"Inf. Sci."},{"key":"10.1016\/j.neucom.2019.05.027_sbref0030","doi-asserted-by":"crossref","first-page":"88","DOI":"10.1016\/j.neucom.2016.11.065","article-title":"Hierarchical deep semantic representation for visual categorization","volume":"257","author":"Zhang","year":"2017","journal-title":"Neurocomputing"},{"issue":"8","key":"10.1016\/j.neucom.2019.05.027_bib0031","doi-asserted-by":"crossref","first-page":"3442","DOI":"10.1109\/TNNLS.2017.2728060","article-title":"Structured weak semantic space construction for visual categorization","volume":"29","author":"Zhang","year":"2018","journal-title":"IEEE Trans. Neural Netw. Learn.Syst."},{"key":"10.1016\/j.neucom.2019.05.027_bib0032","doi-asserted-by":"crossref","first-page":"271","DOI":"10.1016\/j.ins.2017.09.024","article-title":"Image-level classification by hierarchical structure learning with visual and semantic similarities","volume":"422","author":"Zhang","year":"2018","journal-title":"Inf. Sci."},{"key":"10.1016\/j.neucom.2019.05.027_bib0033","series-title":"The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","article-title":"Can spatiotemporal 3d CNNS retrace the history of 2d CNNS and imagenet?","author":"Hara","year":"2018"},{"key":"10.1016\/j.neucom.2019.05.027_bib0034","doi-asserted-by":"crossref","first-page":"770","DOI":"10.1109\/CVPR.2016.90","article-title":"Deep residual learning for image recognition","author":"He","year":"2016","journal-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"10.1016\/j.neucom.2019.05.027_bib0035","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"248","article-title":"Imagenet: a large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.neucom.2019.05.027_bib0036","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"5987","article-title":"Aggregated residual transformations for deep neural networks","author":"Xie","year":"2017"},{"key":"10.1016\/j.neucom.2019.05.027_bib0037","series-title":"2018 IEEE International Conference on Communications (ICC)","first-page":"1","article-title":"Deepsense: device-free human activity recognition via autoencoder long-term recurrent convolutional network","author":"Zou","year":"2018"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0038","series-title":"Proceedings of the 2011 International Conference on Unsupervised and Transfer Learning Workshop - Volume 27","first-page":"37","article-title":"Autoencoders, unsupervised learning and deep architectures","author":"Baldi","year":"2011"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0039","article-title":"Voice recognition algorithms using mel frequency cepstral coefficient (mfcc) and dynamic time warping (dtw) techniques","volume":"abs\/1003.4083","author":"Muda","year":"2010","journal-title":"CoRR"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0040","series-title":"Parallel Distributed Processing: Explorations in the Microstructure of Cognition, Vol. 1","first-page":"318","article-title":"Parallel distributed processing: explorations in the microstructure of cognition, vol. 1","author":"Rumelhart","year":"1986"},{"key":"10.1016\/j.neucom.2019.05.027_bib0041","doi-asserted-by":"crossref","first-page":"2451","DOI":"10.1162\/089976600300015015","article-title":"Learning to forget: continual prediction with lstm","volume":"12","author":"Gers","year":"1999","journal-title":"Neural Computation"},{"key":"10.1016\/j.neucom.2019.05.027_bib0042","series-title":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"5288","article-title":"Msr-vtt: a large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0043","series-title":"Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies-Volume 1","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","author":"Chen","year":"2011"},{"key":"10.1016\/j.neucom.2019.05.027_bib0044","article-title":"Using descriptive video services to create a large data source for video annotation research","volume":"abs\/1503.01070","author":"Torabi","year":"2015","journal-title":"CoRR"},{"key":"10.1016\/j.neucom.2019.05.027_bib0045","series-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0046","series-title":"Proceedings of the Second Workshop on Statistical Machine Translation","first-page":"228","article-title":"Meteor: an automatic metric for mt evaluation with high levels of correlation with human judgments","author":"Lavie","year":"2007"},{"key":"10.1016\/j.neucom.2019.05.027_bib0047","series-title":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4566","article-title":"Cider: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0048","series-title":"Proc. ACL workshop on Text Summarization Branches Out","first-page":"10","article-title":"Rouge: a package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.neucom.2019.05.027_bib0049","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"4724","article-title":"Quo vadis, action recognition? a new model and the kinetics dataset","author":"Carreira","year":"2017"},{"key":"10.1016\/j.neucom.2019.05.027_bib0050","series-title":"2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3185","article-title":"Hierarchical boundary-aware neural encoder for video captioning","author":"Baraldi","year":"2017"},{"key":"10.1016\/j.neucom.2019.05.027_bib0051","article-title":"Adam: a method for stochastic optimization","author":"Kingma","year":"2015","journal-title":"ICLR 2015"},{"key":"10.1016\/j.neucom.2019.05.027_sbref0052","series-title":"Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics","first-page":"249","article-title":"Understanding the difficulty of training deep feedforward neural networks","volume":"9","author":"Glorot","year":"2010"},{"key":"10.1016\/j.neucom.2019.05.027_bib0053","series-title":"Proceedings of the 2016 ACM on Multimedia Conference","first-page":"1087","article-title":"Describing videos using multi-modal fusion","author":"Jin","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0054","series-title":"Proceedings of the 2016 ACM on Multimedia Conference","first-page":"1073","article-title":"Frame- and segment-level features and candidate pool evaluation for video caption generation","author":"Shetty","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0055","series-title":"Proceedings of the 2016 ACM on Multimedia Conference","first-page":"1092","article-title":"Multimodal video description","author":"Ramanishka","year":"2016"},{"key":"10.1016\/j.neucom.2019.05.027_bib0056","series-title":"Proceedings of the 2016 ACM on Multimedia Conference","first-page":"1082","article-title":"Early embedding and late reranking for video captioning","author":"Dong","year":"2016"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231219306897?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231219306897?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2019,5,29]],"date-time":"2019-05-29T20:59:59Z","timestamp":1559163599000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231219306897"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,9]]},"references-count":56,"alternative-id":["S0925231219306897"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2019.05.027","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2019,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Semantic-filtered Soft-Split-Aware video captioning with audio-augmented feature","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2019.05.027","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}