{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,5]],"date-time":"2024-11-05T05:03:42Z","timestamp":1730783022899,"version":"3.28.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"National Science Foundation of China","award":["62322209,62206245"]},{"name":"National Key Research and Development Program of China","award":["2022YFF0902302"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680847","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"6774-6783","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Enabling Synergistic Full-Body Control in Prompt-Based Co-Speech Motion Generation"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0009-0007-1036-7737","authenticated-orcid":false,"given":"Bohong","family":"Chen","sequence":"first","affiliation":[{"name":"State Key Lab for CAD&CG, Zhejiang University, Hangzhou, Zhengjiang, China"}]},{"ORCID":"http:\/\/orcid.org\/0009-0007-6558-4165","authenticated-orcid":false,"given":"Yumeng","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Lab for CAD&CG, Zhejiang University, Hangzhou, Zhengjiang, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-8580-1103","authenticated-orcid":false,"given":"Yao-Xiang","family":"Ding","sequence":"additional","affiliation":[{"name":"State Key Lab for CAD&CG, Zhejiang University, Hangzhou, Zhengjiang, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5485-3752","authenticated-orcid":false,"given":"Tianjia","family":"Shao","sequence":"additional","affiliation":[{"name":"State Key Lab for CAD&CG, Zhejiang University, Hangzhou, Zhengjiang, China"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-4243-6112","authenticated-orcid":false,"given":"Kun","family":"Zhou","sequence":"additional","affiliation":[{"name":"State Key Lab for CAD&CG, Zhejiang University, Hangzhou, Zhengjiang, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3550454.3555435"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"Tenglong Ao Zeyi Zhang and Libin Liu. 2023. GestureDiffuCLIP: Gesture Diffusion Model with CLIP Latents. ACM Trans. Graph. (2023) 18 pages. https:\/\/doi.org\/10.1145\/3592097","DOI":"10.1145\/3592097"},{"volume-title":"SINC: Spatial Composition of 3D Human Motions for Simultaneous Action Generation. ICCV","year":"2023","author":"Athanasiou Nikos","key":"e_1_3_2_1_3_1","unstructured":"Nikos Athanasiou, Mathis Petrovich, Michael J. Black, and G\u00fcl Varol. 2023. SINC: Spatial Composition of 3D Human Motions for Simultaneous Action Generation. ICCV (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475223"},{"volume-title":"Jorma Laaksonen, Mubarak Shah, and Fahad Shahbaz Khan.","year":"2023","author":"Bhunia Ankan Kumar","key":"e_1_3_2_1_5_1","unstructured":"Ankan Kumar Bhunia, Salman Khan, Hisham Cholakkal, Rao Muhammad Anwer, Jorma Laaksonen, Mubarak Shah, and Fahad Shahbaz Khan. 2023. Person Image Synthesis via Denoising Diffusion Model. CVPR (2023)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/192161.192272"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383315"},{"volume-title":"https:\/\/github.com\/LinghaoChan\/OpenTMA","year":"2024","author":"Chen Ling-Hao","key":"e_1_3_2_1_8_1","unstructured":"Ling-Hao Chen and Contributors OpenTMA. 2024. OpenTMA. https:\/\/github.com\/LinghaoChan\/OpenTMA (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00875"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01726"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Kiran Chhatre Radek Dan\u011b\u010dek Nikos Athanasiou Giorgio Becherini Christopher Peters Michael J. Black and Timo Bolkart. 2023. Emotional Speech-driven 3D Body Animation via Disentangled Latent Diffusion. arXiv:2312.04466 [cs.CV]","DOI":"10.1109\/CVPR52733.2024.00190"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","unstructured":"Radek Dan\u011b\u010dek Kiran Chhatre Shashank Tripathi YandongWen Michael Black and Timo Bolkart. 2023. Emotional Speech-Driven Animation with Content-Emotion Disentanglement. ACM. https:\/\/doi.org\/10.1145\/3610548.3618183","DOI":"10.1145\/3610548.3618183"},{"key":"e_1_3_2_1_13_1","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. arXiv:2105.05233 [cs.LG]"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14734"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00912"},{"volume-title":"Sen Wang, and Li Cheng.","year":"2023","author":"Guo Chuan","key":"e_1_3_2_1_17_1","unstructured":"Chuan Guo, Yuxuan Mu, Muhammad Gohar Javed, Sen Wang, and Li Cheng. 2023. MoMask: Generative Masked Modeling of 3D Human Motions. (2023). arXiv:2312.00063 [cs.CV]"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00509"},{"volume-title":"Learning Speech-driven 3D Conversational Gestures from Video. arXiv preprint arXiv:2102.06837","year":"2021","author":"Habibie Ikhsanul","key":"e_1_3_2_1_19_1","unstructured":"Ikhsanul Habibie, Weipeng Xu, Dushyant Mehta, Lingjie Liu, Hans-Peter Seidel, Gerard Pons-Moll, Mohamed Elgharib, and Christian Theobalt. 2021. Learning Speech-driven 3D Conversational Gestures from Video. arXiv preprint arXiv:2102.06837 (2021)."},{"volume-title":"Denoising Diffusion Probabilistic Models. arXiv preprint arxiv:2006.11239","year":"2020","author":"Ho Jonathan","key":"e_1_3_2_1_20_1","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. arXiv preprint arxiv:2006.11239 (2020)."},{"key":"e_1_3_2_1_21_1","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-Free Diffusion Guidance. arXiv:2207.12598 [cs.LG]"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00205"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/11821830_17"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472306.3478333"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01110"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548400"},{"volume-title":"Black","year":"2024","author":"Liu Haiyang","key":"e_1_3_2_1_28_1","unstructured":"Haiyang Liu, Zihao Zhu, Giorgio Becherini, Yichen Peng, Mingyang Su, You Zhou, Naoya Iwamoto, Bo Zheng, and Michael J. Black. 2024. EMAGE: Towards Unified Holistic Co-Speech Gesture Generation via Masked Audio Gesture Modeling. arXiv:2401.00374 [cs.CV]"},{"volume-title":"BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297","year":"2022","author":"Liu Haiyang","key":"e_1_3_2_1_29_1","unstructured":"Haiyang Liu, Zihao Zhu, Naoya Iwamoto, Yichen Peng, Zhengqing Li, You Zhou, Elif Bozkurt, and Bo Zheng. 2022. BEAT: A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis. arXiv preprint arXiv:2203.05297 (2022)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01021"},{"volume-title":"HumanTOMATO: Text-aligned Whole-body Motion Generation. arxiv:2310.12978","year":"2023","author":"Lu Shunlin","key":"e_1_3_2_1_31_1","unstructured":"Shunlin Lu, Ling-Hao Chen, Ailing Zeng, Jing Lin, Ruimao Zhang, Lei Zhang, and Heung-Yeung Shum. 2023. HumanTOMATO: Text-aligned Whole-body Motion Generation. arxiv:2310.12978 (2023)."},{"volume-title":"International Conference on Computer Vision. 5442--5451","author":"Mahmood Naureen","key":"e_1_3_2_1_32_1","unstructured":"Naureen Mahmood, Nima Ghorbani, Nikolaus F. Troje, Gerard Pons-Moll, and Michael J. Black. 2019. AMASS: Archive of Motion Capture as Surface Shapes. In International Conference on Computer Vision. 5442--5451."},{"key":"e_1_3_2_1_33_1","unstructured":"Evonne Ng Javier Romero Timur Bagautdinov Shaojie Bai Trevor Darrell Angjoo Kanazawa and Alexander Richard. 2024. From Audio to Photoreal Embodiment: Synthesizing Humans in Conversations. In ArXiv."},{"volume-title":"Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR).","author":"Pavlakos Georgios","key":"e_1_3_2_1_34_1","unstructured":"Georgios Pavlakos, Vasileios Choutas, Nima Ghorbani, Timo Bolkart, Ahmed A. A. Osman, Dimitrios Tzionas, and Michael J. Black. 2019. Expressive Body Capture: 3D Hands, Face, and Body from a Single Image. In Proceedings IEEE Conf. on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_28"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00870"},{"volume-title":"Proceedings IEEE\/CVF Conf. on Computer Vision and Pattern Recognition (CVPR). 722--731","author":"Punnakkal Abhinanda R.","key":"e_1_3_2_1_37_1","unstructured":"Abhinanda R. Punnakkal, Arjun Chandrasekaran, Nikos Athanasiou, Alejandra Quiros-Ramirez, and Michael J. Black. 2021. BABEL: Bodies, Action and Behavior with English Labels. In Proceedings IEEE\/CVF Conf. on Computer Vision and Pattern Recognition (CVPR). 722--731."},{"volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","year":"2021","author":"Radford Alec","key":"e_1_3_2_1_38_1","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020 [cs.CV]"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2021. High-Resolution Image Synthesis with Latent Diffusion Models. arXiv:2112.10752 [cs.CV]","DOI":"10.1109\/CVPR52688.2022.01042"},{"volume-title":"Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418","year":"2023","author":"Shafir Yonatan","key":"e_1_3_2_1_40_1","unstructured":"Yonatan Shafir, Guy Tevet, Roy Kapon, and Amit H Bermano. 2023. Human motion diffusion as a generative prior. arXiv preprint arXiv:2303.01418 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Hao Tang Wei Wang Dan Xu Yan Yan and Nicu Sebe. 2018. GestureGAN for Hand Gesture-to-Gesture Translation in the Wild. In ACM MM.","DOI":"10.1145\/3240508.3240704"},{"volume-title":"Tel Aviv","year":"2022","author":"Tevet Guy","key":"e_1_3_2_1_42_1","unstructured":"Guy Tevet, Brian Gordon, Amir Hertz, Amit H Bermano, and Daniel Cohen-Or. 2022. Motionclip: Exposing human motion generation to clip space. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXII. Springer, 358--374."},{"volume-title":"Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/ forum?id=SJ1kSyO2jwu","year":"2023","author":"Tevet Guy","key":"e_1_3_2_1_43_1","unstructured":"Guy Tevet, Sigal Raab, Brian Gordon, Yoni Shafir, Daniel Cohen-or, and Amit Haim Bermano. 2023. Human Motion Diffusion Model. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/ forum?id=SJ1kSyO2jwu"},{"key":"e_1_3_2_1_44_1","unstructured":"Aaron Van Den Oord Oriol Vinyals et al. 2017. Neural discrete representation learning. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_45_1","unstructured":"Aaron van den Oord Oriol Vinyals and Koray Kavukcuoglu. 2018. Neural Discrete Representation Learning. arXiv:1711.00937 [cs.LG]"},{"volume-title":"TLControl: Trajectory and Language Control for Human Motion Synthesis. arXiv preprint arXiv:2311.17135","year":"2023","author":"Wan Weilin","key":"e_1_3_2_1_46_1","unstructured":"Weilin Wan, Zhiyang Dou, Taku Komura, Wenping Wang, Dinesh Jayaraman, and Lingjie Liu. 2023. TLControl: Trajectory and Language Control for Human Motion Synthesis. arXiv preprint arXiv:2311.17135 (2023)."},{"key":"e_1_3_2_1_47_1","unstructured":"Yiming Xie Varun Jampani Lei Zhong Deqing Sun and Huaizu Jiang. 2023. OmniControl: Control Any Joint at Any Time for Human Motion Generation. arXiv:2310.08580"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612503"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/650"},{"volume-title":"QPGesture: Quantization-Based and Phase-Guided Motion Matching for Natural Speech-Driven Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 2321--2330","year":"2023","author":"Yang Sicheng","key":"e_1_3_2_1_50_1","unstructured":"Sicheng Yang, Zhiyong Wu, Minglei Li, Zhensong Zhang, Lei Hao, Weihong Bao, and Haolin Zhuang. 2023. QPGesture: Quantization-Based and Phase-Guided Motion Matching for Natural Speech-Driven Gesture Generation. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR. IEEE, 2321--2330."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Sicheng Yang Zunnan Xu Haiwei Xue Yongkang Cheng Shaoli Huang Mingming Gong and Zhiyong Wu. 2024. Freetalker: Controllable Speech and Text-Driven Gesture Generation Based on Diffusion Models for Enhanced Speaker Naturalness. In ICASSP 2024 - 2024 IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP).","DOI":"10.1109\/ICASSP48485.2024.10447978"},{"key":"e_1_3_2_1_52_1","unstructured":"Hongwei Yi Hualin Liang Yifei Liu Qiong Cao Yandong Wen Timo Bolkart Dacheng Tao and Michael J Black. 2023. Generating Holistic 3D Human Motion from Speech. In CVPR."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3414685.3417838"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Neil Zeghidour Alejandro Luebs Ahmed Omran Jan Skoglund and Marco Tagliasacchi. 2021. SoundStream: An End-to-End Neural Audio Codec. arXiv:2107.03312 [cs.SD]","DOI":"10.1109\/TASLP.2021.3129994"},{"volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR).","year":"2023","author":"Zhang Jianrong","key":"e_1_3_2_1_55_1","unstructured":"Jianrong Zhang, Yangsong Zhang, Xiaodong Cun, Shaoli Huang, Yong Zhang, Hongwei Zhao, Hongtao Lu, and Xi Shen. 2023. T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)."},{"volume-title":"MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model. arXiv preprint arXiv:2208.15001","year":"2022","author":"Zhang Mingyuan","key":"e_1_3_2_1_56_1","unstructured":"Mingyuan Zhang, Zhongang Cai, Liang Pan, Fangzhou Hong, Xinying Guo, Lei Yang, and Ziwei Liu. 2022. MotionDiffuse: Text-Driven Human Motion Generation with Diffusion Model. arXiv preprint arXiv:2208.15001 (2022)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01016"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680847","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T04:41:17Z","timestamp":1730695277000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680847"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":57,"alternative-id":["10.1145\/3664647.3680847","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680847","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}