{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,7]],"date-time":"2024-07-07T23:56:42Z","timestamp":1720396602832},"reference-count":66,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,9,1]],"date-time":"2023-09-01T00:00:00Z","timestamp":1693526400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2023,9]]},"DOI":"10.1016\/j.neucom.2023.126372","type":"journal-article","created":{"date-parts":[[2023,6,8]],"date-time":"2023-06-08T06:22:59Z","timestamp":1686205379000},"page":"126372","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Dual attentional transformer for video visual relation prediction"],"prefix":"10.1016","volume":"550","author":[{"given":"Mingcheng","family":"Qu","sequence":"first","affiliation":[]},{"given":"Ganlin","family":"Deng","sequence":"additional","affiliation":[]},{"given":"Donglin","family":"Di","sequence":"additional","affiliation":[]},{"given":"Jianxun","family":"Cui","sequence":"additional","affiliation":[]},{"given":"Tonghua","family":"Su","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2023.126372_b0005","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C., 2021. Vivit: A video vision transformer, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846.","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"10.1016\/j.neucom.2023.126372_b0010","unstructured":"Bashivan, P., Rish, I., Yeasin, M., Codella, N., 2015. Learning representations from eeg with deep recurrent-convolutional neural networks. arXiv preprint arXiv:1511.06448."},{"key":"10.1016\/j.neucom.2023.126372_b0015","unstructured":"Bertasius, G., Wang, H., Torresani, L., 2021. Is space-time attention all you need for video understanding. arXiv preprint arXiv:2102.05095 2, 4."},{"key":"10.1016\/j.neucom.2023.126372_b0020","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.neucom.2023.126372_b0025","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1006\/jvci.1997.0404","article-title":"A survey on the automatic indexing of video data","volume":"10","author":"Brunelli","year":"1999","journal-title":"J. Visual Commun. Image Represent."},{"key":"10.1016\/j.neucom.2023.126372_b0030","doi-asserted-by":"crossref","first-page":"91","DOI":"10.1016\/j.neucom.2020.12.029","article-title":"3-d relation network for visual relation recognition in videos","volume":"432","author":"Cao","year":"2021","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2023.126372_b0035","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A., 2017. Quo vadis, action recognition? a new model and the kinetics dataset, in: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"10.1016\/j.neucom.2023.126372_b0040","doi-asserted-by":"crossref","unstructured":"Chen, C.F.R., Panda, R., Ramakrishnan, K., Feris, R., Cohn, J., Oliva, A., Fan, Q., 2021. Deep analysis of cnn-based spatio-temporal representations for action recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6165\u20136175.","DOI":"10.1109\/CVPR46437.2021.00610"},{"key":"10.1016\/j.neucom.2023.126372_b0045","doi-asserted-by":"crossref","unstructured":"Chen, M.X., Firat, O., Bapna, A., Johnson, M., Macherey, W., Foster, G., Jones, L., Parmar, N., Schuster, M., Chen, Z., et al., 2018. The best of both worlds: Combining recent advances in neural machine translation. arXiv preprint arXiv:1804.09849.","DOI":"10.18653\/v1\/P18-1008"},{"key":"10.1016\/j.neucom.2023.126372_b0050","doi-asserted-by":"crossref","unstructured":"Chen, Y., Cao, Y., Hu, H., Wang, L., 2020. Memory enhanced global-local aggregation for video object detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10337\u201310346.","DOI":"10.1109\/CVPR42600.2020.01035"},{"key":"10.1016\/j.neucom.2023.126372_b0055","doi-asserted-by":"crossref","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q.V., Salakhutdinov, R., 2019. Transformer-xl: Attentive language models beyond a fixed-length context. arXiv preprint arXiv:1901.02860.","DOI":"10.18653\/v1\/P19-1285"},{"key":"10.1016\/j.neucom.2023.126372_b0060","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K., 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805."},{"key":"10.1016\/j.neucom.2023.126372_b0065","doi-asserted-by":"crossref","unstructured":"Di, D., Shang, X., Zhang, W., Yang, X., Chua, T.S., 2019. Multiple hypothesis video relation detection, in: 2019 IEEE Fifth International Conference on Multimedia Big Data (BigMM), IEEE. pp. 287\u2013291.","DOI":"10.1109\/BigMM.2019.000-9"},{"key":"10.1016\/j.neucom.2023.126372_b0070","doi-asserted-by":"crossref","unstructured":"Ding, M., Xiao, B., Codella, N., Luo, P., Wang, J., Yuan, L., 2022. Davit: Dual attention vision transformers, in: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXIV, Springer. pp. 74\u201392.","DOI":"10.1007\/978-3-031-20053-3_5"},{"key":"10.1016\/j.neucom.2023.126372_b0075","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al., 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929."},{"key":"10.1016\/j.neucom.2023.126372_b0080","doi-asserted-by":"crossref","unstructured":"Dzabraev, M., Kalashnikov, M., Komkov, S., Petiushko, A., 2021. Mdmmt: Multidomain multimodal transformer for video retrieval, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3354\u20133363.","DOI":"10.1109\/CVPRW53098.2021.00374"},{"key":"10.1016\/j.neucom.2023.126372_b0085","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., Feichtenhofer, C., 2021. Multiscale vision transformers, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6824\u20136835.","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"10.1016\/j.neucom.2023.126372_b0090","doi-asserted-by":"crossref","unstructured":"Fan, Y., Lu, X., Li, D., Liu, Y., 2016. Video-based emotion recognition using cnn-rnn and c3d hybrid networks, in: Proceedings of the 18th ACM international conference on multimodal interaction, pp. 445\u2013450.","DOI":"10.1145\/2993148.2997632"},{"key":"10.1016\/j.neucom.2023.126372_b0095","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., 2020. X3d: Expanding architectures for efficient video recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213.","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"10.1016\/j.neucom.2023.126372_b0100","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K., 2019. Slowfast networks for video recognition, in: Proceedings of the IEEE\/CVF international conference on computer vision, pp. 6202\u20136211.","DOI":"10.1109\/ICCV.2019.00630"},{"key":"10.1016\/j.neucom.2023.126372_b0105","doi-asserted-by":"crossref","unstructured":"Fu, J., Liu, J., Tian, H., Li, Y., Bao, Y., Fang, Z., Lu, H., 2019. Dual attention network for scene segmentation, in: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 3146\u20133154.","DOI":"10.1109\/CVPR.2019.00326"},{"key":"10.1016\/j.neucom.2023.126372_b0110","first-page":"214","article-title":"Multi-modal transformer for video retrieval","author":"Gabeur","year":"2020","journal-title":"European Conference on Computer Vision, Springer"},{"key":"10.1016\/j.neucom.2023.126372_b0115","doi-asserted-by":"crossref","unstructured":"Gao, K., Chen, L., Huang, Y., Xiao, J., 2021. Video relation detection via tracklet based visual transformer, in: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4833\u20134837.","DOI":"10.1145\/3474085.3479231"},{"key":"10.1016\/j.neucom.2023.126372_b0120","doi-asserted-by":"crossref","unstructured":"Goyal, R., Ebrahimi Kahou, S., Michalski, V., Materzynska, J., Westphal, S., Kim, H., Haenel, V., Fruend, I., Yianilos, P., Mueller-Freitag, M., et al., 2017. The something something video database for learning and evaluating visual common sense, in: Proceedings of the IEEE international conference on computer vision, pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"10.1016\/j.neucom.2023.126372_b0125","first-page":"202","article-title":"Spark: Spatial-aware online incremental attack against visual tracking","author":"Guo","year":"2020","journal-title":"European Conference on Computer Vision, Springer"},{"key":"10.1016\/j.neucom.2023.126372_b0130","unstructured":"Hsieh, J.T., Liu, B., Huang, D.A., Fei-Fei, L.F., Niebles, J.C., 2018. Learning to decompose and disentangle representations for video prediction. Advances in neural information processing systems 31."},{"key":"10.1016\/j.neucom.2023.126372_b0135","doi-asserted-by":"crossref","unstructured":"Kondratyuk, D., Yuan, L., Li, Y., Zhang, L., Tan, M., Brown, M., Gong, B., 2021. Movinets: Mobile video networks for efficient video recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16020\u201316030.","DOI":"10.1109\/CVPR46437.2021.01576"},{"key":"10.1016\/j.neucom.2023.126372_b0140","doi-asserted-by":"crossref","first-page":"582","DOI":"10.3390\/rs12030582","article-title":"Classification of hyperspectral image based on double-branch dual-attention mechanism network","volume":"12","author":"Li","year":"2020","journal-title":"Remote Sens."},{"key":"10.1016\/j.neucom.2023.126372_b0145","doi-asserted-by":"crossref","unstructured":"Li, Y., Yang, X., Shang, X., Chua, T.S., 2021. Interventional video relation detection, in: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4091\u20134099.","DOI":"10.1145\/3474085.3475540"},{"key":"10.1016\/j.neucom.2023.126372_b0150","doi-asserted-by":"crossref","unstructured":"Liu, C., Jin, Y., Xu, K., Gong, G., Mu, Y., 2020. Beyond short-term snippet: Video relation detection with spatio-temporal global context, in: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10840\u201310849.","DOI":"10.1109\/CVPR42600.2020.01085"},{"key":"10.1016\/j.neucom.2023.126372_b0155","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B., 2021. Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.neucom.2023.126372_b0160","doi-asserted-by":"crossref","unstructured":"Monfort, M., Jin, S., Liu, A., Harwath, D., Feris, R., Glass, J., Oliva, A., 2021. Spoken moments: Learning joint audio-visual representations from video descriptions, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14871\u201314881.","DOI":"10.1109\/CVPR46437.2021.01463"},{"key":"10.1016\/j.neucom.2023.126372_b0165","doi-asserted-by":"crossref","first-page":"186","DOI":"10.1177\/1940161220964767","article-title":"Right-wing youtube: a supply and demand perspective","volume":"27","author":"Munger","year":"2022","journal-title":"Int. J. Press\/Politics"},{"key":"10.1016\/j.neucom.2023.126372_b0170","doi-asserted-by":"crossref","unstructured":"Neimark, D., Bar, O., Zohar, M., Asselmann, D., 2021. Video transformer network, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3163\u20133172.","DOI":"10.1109\/ICCVW54120.2021.00355"},{"key":"10.1016\/j.neucom.2023.126372_b0175","doi-asserted-by":"crossref","unstructured":"Park, J., Lee, J., Sohn, K., 2021. Bridge to answer: Structure-aware graph interaction network for video question answering, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15526\u201315535.","DOI":"10.1109\/CVPR46437.2021.01527"},{"key":"10.1016\/j.neucom.2023.126372_b0180","doi-asserted-by":"crossref","unstructured":"Peyre, J., Laptev, I., Schmid, C., Sivic, J., 2019. Detecting unseen visual relations using analogies, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1981\u20131990.","DOI":"10.1109\/ICCV.2019.00207"},{"key":"10.1016\/j.neucom.2023.126372_b0185","doi-asserted-by":"crossref","DOI":"10.1016\/j.artint.2020.103434","article-title":"Spatial relation learning for explainable image classification and annotation in critical applications","volume":"292","author":"Pierrard","year":"2021","journal-title":"Artif. Intell."},{"key":"10.1016\/j.neucom.2023.126372_b0190","doi-asserted-by":"crossref","unstructured":"Qi, X., Liu, C., Schuckers, S., 2018. Cnn based key frame extraction for face in video recognition, in: 2018 IEEE 4th international conference on identity, security, and behavior analysis (ISBA), IEEE. pp. 1\u20138.","DOI":"10.1109\/ISBA.2018.8311477"},{"key":"10.1016\/j.neucom.2023.126372_b0195","doi-asserted-by":"crossref","unstructured":"Qian, X., Zhuang, Y., Li, Y., Xiao, S., Pu, S., Xiao, J., 2019. Video relation detection with spatio-temporal graph, in: Proceedings of the 27th ACM International Conference on Multimedia, pp. 84\u201393.","DOI":"10.1145\/3343031.3351058"},{"key":"10.1016\/j.neucom.2023.126372_b0200","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"10.1016\/j.neucom.2023.126372_b0205","unstructured":"Ren, S., He, K., Girshick, R., Sun, J., 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28."},{"key":"10.1016\/j.neucom.2023.126372_b0210","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vision"},{"key":"10.1016\/j.neucom.2023.126372_b0215","doi-asserted-by":"crossref","unstructured":"Shang, X., Di, D., Xiao, J., Cao, Y., Yang, X., Chua, T.S., 2019. Annotating objects and relations in user-generated videos, in: Proceedings of the 2019 on International Conference on Multimedia Retrieval, pp. 279\u2013287.","DOI":"10.1145\/3323873.3325056"},{"key":"10.1016\/j.neucom.2023.126372_b0220","doi-asserted-by":"crossref","unstructured":"Shang, X., Li, Y., Xiao, J., Ji, W., Chua, T.S., 2021. Video visual relation detection via iterative inference, in: Proceedings of the 29th ACM International Conference on Multimedia, pp. 3654\u20133663.","DOI":"10.1145\/3474085.3475263"},{"key":"10.1016\/j.neucom.2023.126372_b0225","doi-asserted-by":"crossref","unstructured":"Shang, X., Ren, T., Guo, J., Zhang, H., Chua, T.S., 2017. Video visual relation detection, in: Proceedings of the 25th ACM international conference on Multimedia, pp. 1300\u20131308.","DOI":"10.1145\/3123266.3123380"},{"key":"10.1016\/j.neucom.2023.126372_b0230","doi-asserted-by":"crossref","unstructured":"Su, Z., Shang, X., Chen, J., Jiang, Y.G., Qiu, Z., Chua, T.S., 2020. Video relation detection via multiple hypothesis association, in: Proceedings of the 28th ACM International Conference on Multimedia, pp. 3127\u20133135.","DOI":"10.1145\/3394171.3413764"},{"key":"10.1016\/j.neucom.2023.126372_b0235","doi-asserted-by":"crossref","unstructured":"Sun, M., Xiao, J., Lim, E.G., Zhang, B., Zhao, Y., 2020. Fast template matching and update for video object tracking and segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10791\u201310799.","DOI":"10.1109\/CVPR42600.2020.01080"},{"key":"10.1016\/j.neucom.2023.126372_b0240","doi-asserted-by":"crossref","unstructured":"Sun, X., Ren, T., Zi, Y., Wu, G., 2019. Video visual relation detection via multi-modal feature fusion, in: Proceedings of the 27th ACM International Conference on Multimedia, pp. 2657\u20132661.","DOI":"10.1145\/3343031.3356076"},{"key":"10.1016\/j.neucom.2023.126372_b0245","doi-asserted-by":"crossref","unstructured":"Tang, Y., Ding, D., Rao, Y., Zheng, Y., Zhang, D., Zhao, L., Lu, J., Zhou, J., 2019. Coin: A large-scale dataset for comprehensive instructional video analysis, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1207\u20131216.","DOI":"10.1109\/CVPR.2019.00130"},{"key":"10.1016\/j.neucom.2023.126372_b0250","doi-asserted-by":"crossref","first-page":"64","DOI":"10.1145\/2812802","article-title":"Yfcc100m: The new data in multimedia research","volume":"59","author":"Thomee","year":"2016","journal-title":"Commun. ACM"},{"key":"10.1016\/j.neucom.2023.126372_b0255","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"Touvron","year":"2021","journal-title":"International Conference on Machine Learning, PMLR"},{"key":"10.1016\/j.neucom.2023.126372_b0260","doi-asserted-by":"crossref","unstructured":"Tsai, Y.H.H., Divvala, S., Morency, L.P., Salakhutdinov, R., Farhadi, A., 2019. Video relationship reasoning using gated spatio-temporal energy graph, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10424\u201310433.","DOI":"10.1109\/CVPR.2019.01067"},{"key":"10.1016\/j.neucom.2023.126372_b0265","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"10.1016\/j.neucom.2023.126372_b0270","doi-asserted-by":"crossref","first-page":"2740","DOI":"10.1109\/TPAMI.2018.2868668","article-title":"Temporal segment networks for action recognition in videos","volume":"41","author":"Wang","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2023.126372_b0275","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K., 2018b. Non-local neural networks, in: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 7794\u20137803.","DOI":"10.1109\/CVPR.2018.00813"},{"key":"10.1016\/j.neucom.2023.126372_b0280","unstructured":"Weissenborn, D., T\u00e4ckstr\u00f6m, O., Uszkoreit, J., 2019. Scaling autoregressive video models. arXiv preprint arXiv:1906.02634."},{"key":"10.1016\/j.neucom.2023.126372_b0285","doi-asserted-by":"crossref","unstructured":"Wojke, N., Bewley, A., Paulus, D., 2017. Simple online and realtime tracking with a deep association metric, in: 2017 IEEE international conference on image processing (ICIP), IEEE. pp. 3645\u20133649.","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"10.1016\/j.neucom.2023.126372_b0290","unstructured":"Woo, S., Noh, J., Kim, K., 2021. What and when to look?: Temporal span proposal network for video visual relation detection. arXiv preprint arXiv:2107.07154."},{"key":"10.1016\/j.neucom.2023.126372_b0295","doi-asserted-by":"crossref","first-page":"1143","DOI":"10.1109\/TIP.2020.3040521","article-title":"Learning to anticipate egocentric actions by imagination","volume":"30","author":"Wu","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.neucom.2023.126372_b0300","first-page":"447","article-title":"Visual relation grounding in videos","author":"Xiao","year":"2020","journal-title":"European conference on computer vision, Springer"},{"key":"10.1016\/j.neucom.2023.126372_b0305","doi-asserted-by":"crossref","unstructured":"Xie, W., Ren, G., Liu, S., 2020. Video relation detection with trajectory-aware multi-modal features, in: Proceedings of the 28th ACM International Conference on Multimedia, pp. 4590\u20134594.","DOI":"10.1145\/3394171.3416284"},{"key":"10.1016\/j.neucom.2023.126372_b0310","doi-asserted-by":"crossref","unstructured":"Yang, C., Xu, Y., Shi, J., Dai, B., Zhou, B., 2020. Temporal pyramid network for action recognition, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 591\u2013600.","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"10.1016\/j.neucom.2023.126372_b0315","doi-asserted-by":"crossref","unstructured":"Yao, Y., Zhang, A., Han, X., Li, M., Weber, C., Liu, Z., Wermter, S., Sun, M., 2021. Visual distant supervision for scene graph generation, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15816\u201315826.","DOI":"10.1109\/ICCV48922.2021.01552"},{"key":"10.1016\/j.neucom.2023.126372_b0320","doi-asserted-by":"crossref","unstructured":"Ying, X., Li, X., Chuah, M.C., 2021. Srnet: Spatial relation network for efficient single-stage instance segmentation in videos, in: Proceedings of the 29th ACM International Conference on Multimedia, pp. 347\u2013356.","DOI":"10.1145\/3474085.3475626"},{"key":"10.1016\/j.neucom.2023.126372_b0325","doi-asserted-by":"crossref","unstructured":"Zhang, H., Kyaw, Z., Yu, J., Chang, S.F., 2017. Ppr-fcn: Weakly supervised visual relation detection via parallel pairwise r-fcn, in: Proceedings of the IEEE international conference on computer vision, pp. 4233\u20134241.","DOI":"10.1109\/ICCV.2017.454"},{"key":"10.1016\/j.neucom.2023.126372_b0330","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Li, X., Liu, C., Shuai, B., Zhu, Y., Brattoli, B., Chen, H., Marsic, I., Tighe, J., 2021. Vidtr: Video transformer without convolutions, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13577\u201313587.","DOI":"10.1109\/ICCV48922.2021.01332"}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231223004952?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0925231223004952?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,1,29]],"date-time":"2024-01-29T13:58:31Z","timestamp":1706536711000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0925231223004952"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,9]]},"references-count":66,"alternative-id":["S0925231223004952"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2023.126372","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2023,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Dual attentional transformer for video visual relation prediction","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2023.126372","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"126372"}}