{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T21:19:18Z","timestamp":1730323158324,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100004739","name":"Youth Innovation Promotion Association of the Chinese Academy of Sciences","doi-asserted-by":"publisher","award":["2018497"],"id":[{"id":"10.13039\/501100004739","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China","award":["61836011, 61822208, 61632019"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3413931","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T08:26:18Z","timestamp":1602491178000},"page":"1497-1505","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":57,"title":["Boosting Continuous Sign Language Recognition via Cross Modality Augmentation"],"prefix":"10.1145","author":[{"given":"Junfu","family":"Pu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China & Hefei Comprehensive National Science Center, Hefei, China"}]},{"given":"Hezhen","family":"Hu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}]},{"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China & Hefei Comprehensive National Science Center, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Patrick Buehler Andrew Zisserman and Mark Everingham. 2009. Learning sign language by watching TV (using weakly aligned subtitles). In CVPR. 2961--2968. Patrick Buehler Andrew Zisserman and Mark Everingham. 2009. Learning sign language by watching TV (using weakly aligned subtitles). In CVPR. 2961--2968.","DOI":"10.1109\/CVPR.2009.5206523"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Necati Cihan Camgoz Simon Hadfield Oscar Koller and Richard Bowden. 2017. SubUNets: End-to-end hand shape and continuous sign language recognition. In ICCV. 3075--3084. Necati Cihan Camgoz Simon Hadfield Oscar Koller and Richard Bowden. 2017. SubUNets: End-to-end hand shape and continuous sign language recognition. In ICCV. 3075--3084.","DOI":"10.1109\/ICCV.2017.332"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? A new model and the kinetics dataset. In CVPR. 6299--6308. Joao Carreira and Andrew Zisserman. 2017. Quo vadis action recognition? A new model and the kinetics dataset. In CVPR. 6299--6308.","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Chien-Yi Chang De-An Huang Yanan Sui Li Fei-Fei and Juan Carlos Niebles. 2019. Dtextsuperscript3TW: Discriminative differentiable dynamic time warping for weakly supervised action alignment and segmentation. In CVPR. 3546--3555. Chien-Yi Chang De-An Huang Yanan Sui Li Fei-Fei and Juan Carlos Niebles. 2019. 
","DOI":"10.1109\/CVPR.2019.00366"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Xi Chen and Markus Koskela. 2014. Using appearance-based hand features for dynamic RGB-D gesture recognition. In ICPR. 411--416.","DOI":"10.1109\/ICPR.2014.79"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"crossref","unstructured":"Changmao Cheng, Chi Zhang, Yichen Wei, and Yu-Gang Jiang. 2019. Sparse temporal causal convolution for efficient action modeling. In ACM MM. 592--600.","DOI":"10.1145\/3343031.3351054"},{"key":"e_1_3_2_2_7_1","unstructured":"Kyunghyun Cho, Bart Van Merri\u00ebnboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua Bengio. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Necati Cihan Camgoz, Simon Hadfield, Oscar Koller, Hermann Ney, and Richard Bowden. 2018. Neural sign language translation. In CVPR. 7784--7793.","DOI":"10.1109\/CVPR.2018.00812"},{"key":"e_1_3_2_2_9_1","unstructured":"Runpeng Cui, Hu Liu, and Changshui Zhang. 2017. Recurrent convolutional neural networks for continuous sign language recognition by staged optimization. In CVPR. 7361--7369."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2889563"},{"key":"e_1_3_2_2_11_1","unstructured":"Marco Cuturi and Mathieu Blondel. 2017. Soft-DTW: A differentiable loss function for time-series. In ICML. 894--903."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Georgios D Evangelidis, Gurkirt Singh, and Radu Horaud. 2014. Continuous gesture recognition from articulated poses. In ECCV. 595--607.","DOI":"10.1007\/978-3-319-16178-5_42"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Alex Graves, Santiago Fern\u00e1ndez, Faustino Gomez, and J\u00fcrgen Schmidhuber. 2006. Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In ICML. 369--376.
","DOI":"10.1145\/1143844.1143891"},{"key":"e_1_3_2_2_14_1","unstructured":"Alex Graves and Navdeep Jaitly. 2014. Towards end-to-end speech recognition with recurrent neural networks. In ICML. 1764--1772."},{"key":"e_1_3_2_2_15_1","first-page":"1575","article-title":"Hierarchical recurrent deep fusion using adaptive clip summarization for sign language translation","volume":"29","author":"Guo Dan","year":"2019","journal-title":"TIP"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Dan Guo, Wengang Zhou, Houqiang Li, and Meng Wang. 2018. Hierarchical LSTM for sign language translation. In AAAI.","DOI":"10.1609\/aaai.v32i1.12235"},{"key":"e_1_3_2_2_17_1","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24261-3_7"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"crossref","unstructured":"Jie Huang, Wengang Zhou, Qilin Zhang, Houqiang Li, and Weiping Li. 2018. Video-based sign language recognition without temporal segmentation. In AAAI.","DOI":"10.1609\/aaai.v32i1.11903"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Oscar Koller, Cihan Camgoz, Hermann Ney, and Richard Bowden. 2019. Weakly supervised learning with multi-stream CNN-LSTM-HMMs to discover sequential parallelism in sign language videos. TPAMI (2019).","DOI":"10.1109\/TPAMI.2019.2911077"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2015.09.013"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Oscar Koller, Hermann Ney, and Richard Bowden. 2016a. Deep hand: How to train a CNN on 1 million hand images when your data is continuous and weakly labelled. In CVPR. 3793--3802.","DOI":"10.1109\/CVPR.2016.412"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Oscar Koller, O. Zargaran, Hermann Ney, and Richard Bowden. 2016b. Deep sign: Hybrid CNN-HMM for continuous sign language recognition. In BMVC.","DOI":"10.5244\/C.30.136"},{"volume-title":"Re-sign: Re-aligned end-to-end sequence modelling with deep recurrent CNN-HMMs. In CVPR.
4297--4305.","year":"2017","author":"Koller Oscar","key":"e_1_3_2_2_25_1"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1121-3"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Tomas Pfister, James Charles, and Andrew Zisserman. 2013. Large-scale learning of sign language by watching TV (Using Co-occurrences). In BMVC.","DOI":"10.5244\/C.27.20"},{"key":"e_1_3_2_2_28_1","unstructured":"Junfu Pu, Wengang Zhou, and Houqiang Li. 2018. Dilated convolutional network with iterative optimization for continuous sign language recognition. In IJCAI."},{"key":"e_1_3_2_2_29_1","unstructured":"Junfu Pu, Wengang Zhou, and Houqiang Li. 2019. Iterative alignment network for continuous sign language recognition. In CVPR. 4165--4174."},{"key":"e_1_3_2_2_30_1","unstructured":"Zhaofan Qiu, Ting Yao, and Tao Mei. 2017. Learning spatio-temporal representation with pseudo-3D residual networks. In ICCV. 5533--5541."},{"key":"e_1_3_2_2_31_1","unstructured":"Zhaofan Qiu, Ting Yao, Chong-Wah Ngo, Xinmei Tian, and Tao Mei. 2019. Learning spatio-temporal representation with local and global diffusion. In CVPR. 12056--12065."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Olga Russakovsky, Jia Deng, Hao Su, Jonathan Krause, Sanjeev Satheesh, Sean Ma, Zhiheng Huang, Andrej Karpathy, Aditya Khosla, Michael Bernstein, et al. 2015. ImageNet large scale visual recognition challenge. IJCV (2015), 211--252.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_33_1","unstructured":"Xiangxi Shi, Jianfei Cai, Shafiq Joty, and Jiuxiang Gu. 2019. Watch it twice: Video captioning with a refocused video encoder. In ACM MM. 818--826."},{"key":"e_1_3_2_2_34_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. In NeurIPS. 568--576."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.735811"},{"key":"e_1_3_2_2_36_1","unstructured":"Ilya Sutskever, Oriol Vinyals, and Quoc V Le. 2014. Sequence to sequence learning with neural networks. In NeurIPS. 3104--3112.
"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich. 2015. Going deeper with convolutions. In CVPR. 1--9.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Du Tran, Lubomir Bourdev, Rob Fergus, Lorenzo Torresani, and Manohar Paluri. 2015. Learning spatio-temporal features with 3D convolutional networks. In ICCV. 4489--4497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Subhashini Venugopalan, Marcus Rohrbach, Jeffrey Donahue, Raymond Mooney, Trevor Darrell, and Kate Saenko. 2015. Sequence to sequence -- video to text. In ICCV.","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"crossref","unstructured":"Limin Wang, Yuanjun Xiong, Zhe Wang, Yu Qiao, Dahua Lin, Xiaoou Tang, and Luc Van Gool. 2016. Temporal segment networks: Towards good practices for deep action recognition. In ECCV. 20--36.","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Shuo Wang, Dan Guo, Wen-gang Zhou, Zheng-Jun Zha, and Meng Wang. 2018. Connectionist temporal fusion for sign language translation. In ACM MM. 1483--1491.","DOI":"10.1145\/3240508.3240671"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2537340"},{"key":"e_1_3_2_2_43_1","unstructured":"Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, and Kevin Murphy. 2018. Rethinking spatio-temporal feature learning: Speed-accuracy trade-offs in video classification. In ECCV. 305--321."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Shijie Yang, Liang Li, Shuhui Wang, Dechao Meng, Qingming Huang, and Qi Tian. 2019. Structured stochastic recurrent network for linguistic video prediction. In ACM MM. 21--29.","DOI":"10.1145\/3343031.3350859"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Jihai Zhang, Wengang Zhou, Chao Xie, Junfu Pu, and Houqiang Li. 2016. Chinese sign language recognition with adaptive HMM. In ICME. 1--6.
","DOI":"10.1109\/ICME.2016.7552950"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Hao Zhou, Wengang Zhou, and Houqiang Li. 2019. Dynamic pseudo label decoding for continuous sign language recognition. In ICME. 1282--1287.","DOI":"10.1109\/ICME.2019.00223"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Hao Zhou, Wengang Zhou, Yun Zhou, and Houqiang Li. 2020. Spatial-temporal multi-cue network for continuous sign language recognition. In AAAI.","DOI":"10.1609\/aaai.v34i07.7001"},{"key":"e_1_3_2_2_48_1","unstructured":"Yongqing Zhu and Shuqiang Jiang. 2019. Attention-based densely connected LSTM for video captioning. In ACM MM. 802--810."},{"volume-title":"ECO: Efficient convolutional network for online video understanding. In ECCV. 695--712.","year":"2018","author":"Zolfaghari Mohammadreza","key":"e_1_3_2_2_49_1"}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Seattle WA USA","acronym":"MM '20"},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3413931","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,5]],"date-time":"2023-01-05T15:33:54Z","timestamp":1672932834000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413931"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":49,"alternative-id":["10.1145\/3394171.3413931","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3413931","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
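The record above is a Crossref REST API work response for DOI 10.1145/3394171.3413931. As a minimal sketch (not part of the deposited record), the Python snippet below shows how one might re-fetch this record from the public api.crossref.org endpoint and summarize the bibliographic fields that appear above (title, author, issued, references-count, is-referenced-by-count); variable names are illustrative only.

# Minimal sketch: fetch and summarize the Crossref work record shown above.
# Assumes network access; uses only the standard library and fields present in the record.
import json
import urllib.request

DOI = "10.1145/3394171.3413931"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    # The response wraps the record in {"status": "ok", ..., "message": {...}}.
    work = json.load(resp)["message"]

# "title" is a list with a single entry in Crossref's schema.
title = work["title"][0]

# Authors carry "given"/"family" name parts.
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", []))

# "issued" holds the publication date as nested date-parts, e.g. [[2020, 10, 12]].
year = work["issued"]["date-parts"][0][0]

print(f"{authors}. {year}. {title}. DOI: {work['DOI']}")
print(f"References: {work['references-count']}, cited by: {work['is-referenced-by-count']}")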