{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T01:56:36Z","timestamp":1740102996373,"version":"3.37.3"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","funder":[{"name":"NSF China","award":["62272292"]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,3]]},"DOI":"10.1145\/3663408.3663411","type":"proceedings-article","created":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T16:23:29Z","timestamp":1719937409000},"page":"16-22","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LubeRDMA: A Fail-safe Mechanism of RDMA"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-2615-0525","authenticated-orcid":false,"given":"Shengkai","family":"Lin","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0337-1812","authenticated-orcid":false,"given":"Qinwei","family":"Yang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6307-7310","authenticated-orcid":false,"given":"Zengyin","family":"Yang","sequence":"additional","affiliation":[{"name":"Huawei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1564-6732","authenticated-orcid":false,"given":"Yuchuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8395-5109","authenticated-orcid":false,"given":"Shizhen","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Empowering Azure Storage with RDMA. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Bai Wei","year":"2023","unstructured":"Wei Bai, Shanim\u00a0Sainul Abdeen, Ankit Agrawal, Krishan\u00a0Kumar Attre, Paramvir Bahl, Ameya Bhagat, Gowri Bhaskara, Tanya Brokhman, Lei Cao, Ahmad Cheema, Rebecca Chow, Jeff Cohen, Mahmoud Elhaddad, Vivek Ette, Igal Figlin, Daniel Firestone, Mathew George, Ilya German, Lakhmeet Ghai, Eric Green, Albert Greenberg, Manish Gupta, Randy Haagens, Matthew Hendel, Ridwan Howlader, Neetha John, Julia Johnstone, Tom Jolly, Greg Kramer, David Kruse, Ankit Kumar, Erica Lan, Ivan Lee, Avi Levy, Marina Lipshteyn, Xin Liu, Chen Liu, Guohan Lu, Yuemin Lu, Xiakun Lu, Vadim Makhervaks, Ulad Malashanka, David\u00a0A. Maltz, Ilias Marinos, Rohan Mehta, Sharda Murthi, Anup Namdhari, Aaron Ogus, Jitendra Padhye, Madhav Pandya, Douglas Phillips, Adrian Power, Suraj Puri, Shachar Raindel, Jordan Rhee, Anthony Russo, Maneesh Sah, Ali Sheriff, Chris Sparacino, Ashutosh Srivastava, Weixiang Sun, Nick Swanson, Fuhou Tian, Lukasz Tomczyk, Vamsi Vadlamuri, Alec Wolman, Ying Xie, Joyce Yom, Lihua Yuan, Yanzhao Zhang, and Brian Zill. 2023. Empowering Azure Storage with RDMA. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 49\u201367. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/bai"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.14778\/3137765.3137777"},{"key":"e_1_3_2_1_3_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran\u00a0Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, 929\u2013943. https:\/\/www.usenix.org\/conference\/nsdi22\/presentation\/eisenman"},{"key":"e_1_3_2_1_4_1","unstructured":"Elastic Horovod 2024. Elastic Horovod. https:\/\/horovod.readthedocs.io\/en\/latest\/elastic_include.html."},{"key":"e_1_3_2_1_5_1","unstructured":"etcd 2024. etcd. https:\/\/etcd.io\/."},{"key":"e_1_3_2_1_6_1","unstructured":"etcdscale 2024. What is maximum cluster size?https:\/\/etcd.io\/docs\/v3.5\/faq\/#what-is-maximum-cluster-size."},{"key":"e_1_3_2_1_7_1","unstructured":"gdr 2024. GPUDirect RDMA. https:\/\/docs.nvidia.com\/cuda\/gpudirect-rdma\/index.html."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2043164.2018477"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_10_1","volume-title":"Elastic Resource Sharing for Distributed Deep Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Hwang Changho","year":"2021","unstructured":"Changho Hwang, Taehyun Kim, Sunghyun Kim, Jinwoo Shin, and KyoungSoo Park. 2021. Elastic Resource Sharing for Distributed Deep Learning. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21). USENIX Association, 721\u2013739. https:\/\/www.usenix.org\/conference\/nsdi21\/presentation\/hwang"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613152"},{"key":"e_1_3_2_1_12_1","volume-title":"Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Mohan Jayashree","year":"2021","unstructured":"Jayashree Mohan, Amar Phanishayee, and Vijay Chidambaram. 2021. CheckFreq: Frequent, Fine-Grained DNN Checkpointing. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 203\u2013216. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/mohan"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522738"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"key":"e_1_3_2_1_15_1","unstructured":"perftest 2024. linux-rdma\/perftest. https:\/\/github.com\/linux-rdma\/perftest."},{"key":"e_1_3_2_1_16_1","unstructured":"rdma-core 2024. rdma-core. https:\/\/github.com\/linux-rdma\/rdma-core."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2005.34"},{"key":"e_1_3_2_1_18_1","volume-title":"Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Thorpe John","year":"2023","unstructured":"John Thorpe, Pengzhan Zhao, Jonathan Eyolfson, Yifan Qiao, Zhihao Jia, Minjia Zhang, Ravi Netravali, and Guoqing\u00a0Harry Xu. 2023. Bamboo: Making Preemptible Instances Resilient for Affordable Training of Large DNNs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 497\u2013513. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/thorpe"},{"key":"e_1_3_2_1_19_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian\u00a0Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit\u00a0Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric\u00a0Michael Smith Ranjan Subramanian Xiaoqing\u00a0Ellen Tan Binh Tang Ross Taylor Adina Williams Jian\u00a0Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arxiv:2307.09288\u00a0[cs.CL]"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359653"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"e_1_3_2_1_22_1","volume-title":"SRNIC: A Scalable Architecture for RDMA NICs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Zilong","year":"2023","unstructured":"Zilong Wang, Layong Luo, Qingsong Ning, Chaoliang Zeng, Wenxue Li, Xinchen Wan, Peng Xie, Tao Feng, Ke Cheng, Xiongfei Geng, Tianhao Wang, Weicheng Ling, Kejia Huo, Pingbo An, Kui Ji, Shideng Zhang, Bin Xu, Ruiqing Feng, Tao Ding, Kai Chen, and Chuanxiong Guo. 2023. SRNIC: A Scalable Architecture for RDMA NICs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 1\u201314. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/wang-zilong"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS47774.2020.00018"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522737"},{"key":"e_1_3_2_1_25_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit\u00a0Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3539597.3573037"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3452296.3472897"}],"event":{"name":"APNet 2024: The 8th Asia-Pacific Workshop on Networking","acronym":"APNet 2024","location":"Sydney Australia"},"container-title":["Proceedings of the 8th Asia-Pacific Workshop on Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3663408.3663411","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,5]],"date-time":"2024-08-05T22:30:22Z","timestamp":1722897022000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3663408.3663411"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,3]]},"references-count":27,"alternative-id":["10.1145\/3663408.3663411","10.1145\/3663408"],"URL":"https:\/\/doi.org\/10.1145\/3663408.3663411","relation":{},"subject":[],"published":{"date-parts":[[2024,8,3]]},"assertion":[{"value":"2024-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}