{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,5]],"date-time":"2024-09-05T00:29:40Z","timestamp":1725496180238},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272292"],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,4]]},"DOI":"10.1145\/3672198.3673794","type":"proceedings-article","created":{"date-parts":[[2024,7,16]],"date-time":"2024-07-16T16:24:10Z","timestamp":1721147050000},"page":"18-25","update-policy":"http:\/\/dx.doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Network Load Balancing with Parallel Flowlets for AI Training Clusters"],"prefix":"10.1145","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-0222-4943","authenticated-orcid":false,"given":"Peirui","family":"Cao","sequence":"first","affiliation":[{"name":"Microsoft Research Asia and Shanghai Jiao Tong university"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-6984-8913","authenticated-orcid":false,"given":"Wenxue","family":"Cheng","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-8395-5109","authenticated-orcid":false,"given":"Shizhen","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong university"}]},{"ORCID":"http:\/\/orcid.org\/0000-0003-4175-0097","authenticated-orcid":false,"given":"Yongqiang","family":"Xiong","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]}],"member":"320","published-online":{"date-parts":[[2024,8,4]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Introduction to infiniband for end users. White paper","author":"Grun Paul","year":"2010","unstructured":"Paul Grun. Introduction to infiniband for end users. White paper, InfiniBand Trade Association, 55, 2010."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342085"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934908"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.17487\/RFC2992"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2592798.2592803"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2013.6567015"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICNP.2018.00017"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098839"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1232919.1232925"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544216.3544226"},{"key":"e_1_3_2_1_12_1","first-page":"407","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Vanini Erico","year":"2017","unstructured":"Erico Vanini, Rong Pan, Mohammad Alizadeh, Parvin Taheri, and Tom Edsall. Let it flow: Resilient asymmetric load balancing with flowlet switching. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), pages 407--420, 2017."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2619239.2626316"},{"key":"e_1_3_2_1_14_1","first-page":"11","article-title":"Nvidia spectrum-x network platform architecture - the first ethernet network designed to accelerate ai workloads. Technical report","author":"NVIDIA","year":"2023","unstructured":"NVIDIA. Nvidia spectrum-x network platform architecture - the first ethernet network designed to accelerate ai workloads. Technical report, NVIDIA, 11 2023.","journal-title":"NVIDIA"},{"key":"e_1_3_2_1_15_1","volume-title":"A scalable, commodity data center network architecture. ACM SIGCOMM computer communication review, 38(4):63--74","author":"Al-Fares Mohammad","year":"2008","unstructured":"Mohammad Al-Fares, Alexander Loukissas, and Amin Vahdat. A scalable, commodity data center network architecture. ACM SIGCOMM computer communication review, 38(4):63--74, 2008."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787472"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICNP52444.2021.9651977"},{"key":"e_1_3_2_1_18_1","volume-title":"Yunzhuo Liu, Xinbing Wang, and Chenghu Zhou. Threshold-based routing-topology co-design for optical data center","author":"Cao Peirui","year":"2023","unstructured":"Peirui Cao, Shizhen Zhao, Dai Zhang, Zhuotao Liu, Mingwei Xu, Min Yee Teh, Yunzhuo Liu, Xinbing Wang, and Chenghu Zhou. Threshold-based routing-topology co-design for optical data center. IEEE\/ACM Transactions on Networking, 2023."},{"key":"e_1_3_2_1_19_1","first-page":"739","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Weiyang","year":"2023","unstructured":"Weiyang Wang, Moein Khazraee, Zhizhen Zhong, Manya Ghobadi, Zhihao Jia, Dheevatsa Mudigere, Ying Zhang, and Anthony Kewitsch. {TopoOpt}: Co-optimizing network topology and parallelization strategy for distributed training jobs. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 739--767, 2023."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQOS52092.2021.9521326"},{"key":"e_1_3_2_1_21_1","first-page":"231","volume-title":"irdma: Efficient use of rdma in distributed deep learning systems. In 2017 IEEE 19th International Conference on High Performance Computing and Communications","author":"Ren Yufei","year":"2017","unstructured":"Yufei Ren, Xingbo Wu, Li Zhang, Yandong Wang, Wei Zhang, Zijun Wang, Michel Hack, and Song Jiang. irdma: Efficient use of rdma in distributed deep learning systems. In 2017 IEEE 19th International Conference on High Performance Computing and Communications; IEEE 15th International Conference on Smart City; IEEE 3rd International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS), pages 231--238. IEEE, 2017."},{"key":"e_1_3_2_1_23_1","volume-title":"Tacos: Topology-aware collective algorithm synthesizer for distributed training. arXiv preprint arXiv:2304.05301","author":"Won William","year":"2023","unstructured":"William Won, Midhilesh Elavazhagan, Sudarshan Srinivasan, Ajaya Durg, Swati Gupta, and Tushar Krishna. Tacos: Topology-aware collective algorithm synthesizer for distributed training. arXiv preprint arXiv:2304.05301, 2023."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11390-023-2894-6"},{"key":"e_1_3_2_1_26_1","volume-title":"Nccl: Accelerated multi-gpu collective communications","author":"Woolley Cliff","year":"2015","unstructured":"Cliff Woolley. Nccl: Accelerated multi-gpu collective communications, 2015."},{"key":"e_1_3_2_1_27_1","volume-title":"Msccl: Microsoft collective communication library. arXiv preprint arXiv:2201.11840","author":"Cowan Meghan","year":"2022","unstructured":"Meghan Cowan, Saeed Maleki, Madanlal Musuvathi, Olli Saarikivi, and Yifan Xiong. Msccl: Microsoft collective communication library. arXiv preprint arXiv:2201.11840, 2022."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3091475"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343180.3343186"},{"key":"e_1_3_2_1_30_1","first-page":"785","volume-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)","author":"Sapio Amedeo","year":"2021","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan Ports, and Peter Richt\u00e1rik. Scaling distributed machine learning with {InNetwork} aggregation. In 18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21), pages 785--808, 2021."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3604849"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICNP59255.2023.10355615"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2890955.2890968"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.3016891"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2021.3056601"},{"key":"e_1_3_2_1_36_1","first-page":"357","volume-title":"15th USENIX symposium on networked systems design and implementation (NSDI 18)","author":"Lu Yuanwei","year":"2018","unstructured":"Yuanwei Lu, Guo Chen, Bojie Li, Kun Tan, Yongqiang Xiong, Peng Cheng, Jiansong Zhang, Enhong Chen, and Thomas Moscibroda. {Multi-Path} transport for {RDMA} in datacenters. In 15th USENIX symposium on networked systems design and implementation (NSDI 18), pages 357--371, 2018."},{"key":"e_1_3_2_1_37_1","first-page":"1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang Zilong","year":"2023","unstructured":"Zilong Wang, Layong Luo, Qingsong Ning, Chaoliang Zeng, Wenxue Li, Xinchen Wan, Peng Xie, Tao Feng, Ke Cheng, Xiongfei Geng, et al. {SRNIC}: A scalable architecture for {RDMA}{NICs}. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23), pages 1--14, 2023."}],"event":{"name":"ACM SIGCOMM '24: ACM SIGCOMM 2024 Conference","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"],"location":"Sydney NSW Australia","acronym":"ACM SIGCOMM '24"},"container-title":["Proceedings of the 2024 SIGCOMM Workshop on Networks for AI Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3672198.3673794","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,4]],"date-time":"2024-09-04T10:25:29Z","timestamp":1725445529000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3672198.3673794"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,4]]},"references-count":36,"alternative-id":["10.1145\/3672198.3673794","10.1145\/3672198"],"URL":"https:\/\/doi.org\/10.1145\/3672198.3673794","relation":{},"subject":[],"published":{"date-parts":[[2024,8,4]]},"assertion":[{"value":"2024-08-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}