{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,7]],"date-time":"2024-07-07T23:59:45Z","timestamp":1720396785925},"reference-count":54,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,6,1]],"date-time":"2022-06-01T00:00:00Z","timestamp":1654041600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2022,6]]},"DOI":"10.1016\/j.neucom.2022.02.062","type":"journal-article","created":{"date-parts":[[2022,3,2]],"date-time":"2022-03-02T16:37:55Z","timestamp":1646239075000},"page":"88-96","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":6,"special_numbering":"C","title":["Multimodal graph neural network for video procedural captioning"],"prefix":"10.1016","volume":"488","author":[{"given":"Lei","family":"Ji","sequence":"first","affiliation":[]},{"given":"Rongcheng","family":"Tu","sequence":"additional","affiliation":[]},{"given":"Kevin","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Lijuan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Nan","family":"Duan","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2022.02.062_b0005","series-title":"Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)","article-title":"A case study on combining asr and visual features for generating instructional video captions","author":"Hessel","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0010","unstructured":"X. Wang, Y.-F. Wang, W.Y. Wang, Watch, listen, and describe: Globally and locally aligned cross-modal attentions for video captioning, arXiv preprint arXiv:1804.05448."},{"key":"10.1016\/j.neucom.2022.02.062_b0015","unstructured":"C. Sun, A. Myers, C. Vondrick, K. Murphy, C. 
Schmid, Videobert: A joint model for video and language representation learning, Proceedings of the IEEE international conference on computer vision."},{"key":"10.1016\/j.neucom.2022.02.062_b0020","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops","first-page":"958","article-title":"Multi-modal dense video captioning","author":"Iashin","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0025","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"2603","article-title":"Mart: Memory-augmented recurrent transformer for coherent video paragraph captioning","author":"Lei","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0030","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"4355","article-title":"Learning semantic concepts and temporal alignment for narrated video procedural captioning","author":"Shi","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0035","unstructured":"F.F. Xu, L. Ji, B. Shi, J. Du, G. Neubig, Y. Bisk, N. Duan, A benchmark for structured procedural knowledge extraction from cooking videos, arXiv preprint arXiv:2005.00706."},{"key":"10.1016\/j.neucom.2022.02.062_b0040","series-title":"International Conference on Learning Representations (ICLR)","article-title":"Semi-supervised classification with graph convolutional networks","author":"Kipf","year":"2017"},{"key":"10.1016\/j.neucom.2022.02.062_b0045","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"5288","article-title":"Msr-vtt: A large video description dataset for bridging video and language","author":"Xu","year":"2016"},{"key":"10.1016\/j.neucom.2022.02.062_b0050","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10870","article-title":"Spatio-temporal graph for video captioning with knowledge distillation","author":"Pan","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0055","series-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","first-page":"13278","article-title":"Object relational graph with teacher-recommended learning for video captioning","author":"Zhang","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0060","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4281","article-title":"G3raphground: Graph-based language grounding","author":"Bajaj","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0065","doi-asserted-by":"crossref","unstructured":"P. Jiang, Y. Han, Reasoning with heterogeneous graph alignment for video question answering, in: Proceedings of the AAAI Conference on Artificial Intelligence, Vol. 34, 2020, pp. 11109\u201311116.","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"10.1016\/j.neucom.2022.02.062_b0070","unstructured":"R. Saqur, K. 
Narasimhan, Multimodal graph networks for compositional generalization in visual question answering, Advances in Neural Information Processing Systems."},{"key":"10.1016\/j.neucom.2022.02.062_b0075","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12746","article-title":"Multi-modal graph neural network for joint reasoning on vision and scene text","author":"Gao","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0080","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"706","article-title":"Dense-captioning events in videos","author":"Krishna","year":"2017"},{"key":"10.1016\/j.neucom.2022.02.062_b0085","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7190","article-title":"Bidirectional attentive fusion with context gating for dense video captioning","author":"Wang","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0090","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6588","article-title":"Streamlined dense video captioning","author":"Mun","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0095","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"8739","article-title":"End-to-end dense video captioning with masked transformer","author":"Zhou","year":"2018"},{"issue":"8","key":"10.1016\/j.neucom.2022.02.062_b0100","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","article-title":"Long short-term memory","volume":"9","author":"Hochreiter","year":"1997","journal-title":"Neural computation"},{"key":"10.1016\/j.neucom.2022.02.062_b0105","unstructured":"K. Cho, B. Van Merri\u00ebnboer, C. Gulcehre, D. Bahdanau, F. Bougares, H. Schwenk, Y. Bengio, Learning phrase representations using rnn encoder-decoder for statistical machine translation, arXiv preprint arXiv:1406.1078."},{"key":"10.1016\/j.neucom.2022.02.062_b0110","series-title":"Proceedings of the IEEE international conference on computer vision","first-page":"4193","article-title":"Attention-based multimodal fusion for video description","author":"Hori","year":"2017"},{"key":"10.1016\/j.neucom.2022.02.062_b0115","series-title":"Proceedings of the 57th Conference of the Association for Computational Linguistics","first-page":"6382","article-title":"Dense procedure captioning in narrated instructional videos","author":"Shi","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0120","doi-asserted-by":"crossref","unstructured":"L. Zhou, C. Xu, J.J. Corso, Towards automatic learning of procedures from web instructional videos, in: Thirty-Second AAAI Conference on Artificial Intelligence, 2018.","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"10.1016\/j.neucom.2022.02.062_b0125","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"468","article-title":"Move forward and tell: A progressive generator of video descriptions","author":"Xiong","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0130","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6598","article-title":"Adversarial inference for multi-sentence video description","author":"Park","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0135","unstructured":"S. Ging, M. Zolfaghari, H. Pirsiavash, T. 
Brox, Coot: Cooperative hierarchical transformer for video-text representation learning, arXiv preprint arXiv:2011.00597."},{"key":"10.1016\/j.neucom.2022.02.062_b0140","unstructured":"S. Chen, Y. Song, Y. Zhao, Q. Jin, Z. Zeng, B. Liu, J. Fu, A. Hauptmann, Activitynet 2019 task 3: Exploring contexts for dense captioning events in videos, arXiv preprint arXiv:1907.05092."},{"key":"10.1016\/j.neucom.2022.02.062_b0145","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7492","article-title":"Jointly localizing and describing events for dense video captioning","author":"Li","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0150","unstructured":"A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, \u0141. Kaiser, I. Polosukhin, Attention is all you need, in: Advances in neural information processing systems, 2017, pp. 5998\u20136008."},{"key":"10.1016\/j.neucom.2022.02.062_b0155","unstructured":"C. Sun, F. Baradel, K. Murphy, C. Schmid, Contrastive bidirectional transformer for temporal representation learning, arXiv preprint arXiv:1906.05743."},{"key":"10.1016\/j.neucom.2022.02.062_b0160","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"374","article-title":"Cross-modal and hierarchical modeling of video and text","author":"Zhang","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0165","unstructured":"G. Huang, B. Pang, Z. Zhu, C. Rivera, R. Soricut, Multimodal pretraining for dense video captioning, arXiv preprint arXiv:2011.11760."},{"key":"10.1016\/j.neucom.2022.02.062_b0170","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"8908","article-title":"Watch, listen and tell: Multi-modal weakly supervised dense event captioning","author":"Rahman","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0175","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10638","article-title":"Fine-grained video-text retrieval with hierarchical graph reasoning","author":"Chen","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0180","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"4041","article-title":"Visual-semantic graph matching for visual grounding","author":"Jing","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0185","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"399","article-title":"Videos as space-time region graphs","author":"Wang","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0190","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10156","article-title":"G-tad: Sub-graph localization for temporal action detection","author":"Xu","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0195","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"14024","article-title":"Improving action segmentation via graph-based temporal reasoning","author":"Huang","year":"2020"},{"key":"10.1016\/j.neucom.2022.02.062_b0200","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"7094","article-title":"Graph convolutional networks for temporal action localization","author":"Zeng","year":"2019"},{"key":"10.1016\/j.neucom.2022.02.062_b0205","doi-asserted-by":"crossref","unstructured":"J. Park, J. 
Lee, I.-J. Kim, K. Sohn, Sumgraph: Video summarization via recursive graph modeling, in: 16th European Conference on Computer Vision, ECCV 2020, Springer, 2020, pp. 647\u2013663.","DOI":"10.1007\/978-3-030-58595-2_39"},{"key":"10.1016\/j.neucom.2022.02.062_b0210","unstructured":"W.L. Hamilton, R. Ying, J. Leskovec, Inductive representation learning on large graphs, arXiv preprint arXiv:1706.02216."},{"key":"10.1016\/j.neucom.2022.02.062_b0215","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"305","article-title":"Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification","author":"Xie","year":"2018"},{"key":"10.1016\/j.neucom.2022.02.062_b0220","article-title":"End-to-End Learning of Visual Representations from Uncurated Instructional Videos","author":"Miech","year":"2020","journal-title":"CVPR"},{"key":"10.1016\/j.neucom.2022.02.062_b0225","first-page":"2818","article-title":"Rethinking the inception architecture for computer vision","volume":"2016","author":"Szegedy","year":"2016","journal-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"10.1016\/j.neucom.2022.02.062_b0230","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1049","article-title":"Temporal action localization in untrimmed videos via multi-stage cnns","author":"Shou","year":"2016"},{"key":"10.1016\/j.neucom.2022.02.062_b0235","unstructured":"V. Iashin, E. Rahtu, A better use of audio-visual cues: Dense video captioning with bi-modal transformer, arXiv preprint arXiv:2005.08271."},{"key":"10.1016\/j.neucom.2022.02.062_b0240","series-title":"Proceedings of the 40th annual meeting on association for computational linguistics","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.neucom.2022.02.062_b0245","unstructured":"S. Banerjee, A. Lavie, Meteor: An automatic metric for mt evaluation with improved correlation with human judgments, in: Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization, 2005, pp. 65\u201372."},{"key":"10.1016\/j.neucom.2022.02.062_b0250","series-title":"Proceedings of the 42nd Annual Meeting on Association for Computational Linguistics","first-page":"605","article-title":"Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics","author":"Lin","year":"2004"},{"key":"10.1016\/j.neucom.2022.02.062_b0255","series-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","first-page":"4566","article-title":"Cider: Consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.neucom.2022.02.062_b0260","unstructured":"D.P. Kingma, J. Ba, Adam: A method for stochastic optimization, International Conference on Learning Representations."},{"key":"10.1016\/j.neucom.2022.02.062_b0265","unstructured":"A. Zisserman, J. Carreira, K. Simonyan, W. Kay, B. Zhang, C. Hillier, S. Vijayanarasimhan, F. Viola, T. Green, T. Back, et al., The kinetics human action video dataset."},{"key":"10.1016\/j.neucom.2022.02.062_b0270","unstructured":"A. Miech, D. Zhukov, J.-B. Alayrac, M. Tapaswi, I. Laptev, J. 
Sivic, Howto100m: Learning a text-video embedding by watching hundred million narrated video clips, ICCV."}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231222002247?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231222002247?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,3,6]],"date-time":"2023-03-06T05:15:54Z","timestamp":1678079754000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231222002247"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6]]},"references-count":54,"alternative-id":["S0925231222002247"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2022.02.062","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2022,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multimodal graph neural network for video procedural captioning","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2022.02.062","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}
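
The record above has the shape of a Crossref "work" message. As a minimal sketch, assuming it was served by the public Crossref REST API at https://api.crossref.org/works/{doi} (the endpoint is an assumption; only the payload itself is given above), the same record could be re-fetched and its key fields read like this:

import requests

# Hedged sketch: fetch a Crossref work record by DOI. The api.crossref.org
# endpoint is an assumption; the field names below come from the record above.
DOI = "10.1016/j.neucom.2022.02.062"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
payload = resp.json()
assert payload["status"] == "ok" and payload["message-type"] == "work"

work = payload["message"]
print(work["title"][0])               # Multimodal graph neural network for video procedural captioning
print(work["container-title"][0])     # Neurocomputing
print(work["volume"], work["page"])   # 488 88-96
authors = ", ".join(f'{a.get("given", "")} {a.get("family", "")}'.strip()
                    for a in work.get("author", []))
print(authors)                        # Lei Ji, Rongcheng Tu, Kevin Lin, Lijuan Wang, Nan Duan
print(len(work.get("reference", [])))  # 54, matching "references-count"

Note that "title", "container-title", and "ISSN" are lists even when they hold a single value, and "author" is a list of objects with "given"/"family" keys, so the accessors above index or iterate accordingly.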