{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T05:34:32Z","timestamp":1733376872701,"version":"3.30.1"},"reference-count":79,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2023,5,1]],"date-time":"2023-05-01T00:00:00Z","timestamp":1682899200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2023,5]]},"DOI":"10.1109\/tpds.2023.3247883","type":"journal-article","created":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T18:02:19Z","timestamp":1679335339000},"page":"1432-1449","source":"Crossref","is-referenced-by-count":2,"title":["Fold3D: Rethinking and Parallelizing Computational and Communicational Tasks in the Training of Large DNN Models"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2268-3036","authenticated-orcid":false,"given":"Fanxin","family":"Li","sequence":"first","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]},{"given":"Yuhao","family":"Qing","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2807-9780","authenticated-orcid":false,"given":"Xusheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6133-8388","authenticated-orcid":false,"given":"Xiuxian","family":"Guan","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]},{"given":"Sen","family":"Wang","sequence":"additional","affiliation":[{"name":"Theory Lab, 2012 Labs, Huawei Technoloies, Co. Ltd, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0283-7050","authenticated-orcid":false,"given":"Gong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Theory Lab, 2012 Labs, Huawei Technoloies, Co. 
Ltd, Hong Kong, SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7746-440X","authenticated-orcid":false,"given":"Heming","family":"Cui","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, SAR, China"}]}],"member":"263","reference":[{"article-title":"Language models are few-shot learners","year":"2020","author":"Brown","key":"ref1"},{"key":"ref2","article-title":"XLNet: Generalized autoregressive pretraining for language understanding","volume-title":"Advances in Neural Information Processing Systems","author":"Yang","year":"2019"},{"key":"ref3","article-title":"Unified language model pre-training for natural language understanding and generation","volume-title":"Advances in Neural Information Processing Systems","author":"Dong","year":"2019"},{"key":"ref4","first-page":"8342","article-title":"Dont stop pretraining: Adapt language models to domains and tasks","volume-title":"Proc. 58th Annu. Meeting Assoc. Comput. Linguistics","author":"Gururangan"},{"key":"ref5","first-page":"1","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"year":"2020","key":"ref9","article-title":"microsoft\/deepspeed"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref11","first-page":"269","article-title":"PipeMare: Asynchronous pipeline parallel DNN training","volume-title":"Proc. Mach. Learn. Syst. Conf.","author":"Yang"},{"article-title":"Parallelized stochastic gradient descent","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zinkevich","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"year":"2014","key":"ref14","article-title":"NVLink"},{"year":"2021","key":"ref15","article-title":"Infiniband and remote DMA (RDMA) interfaces"},{"key":"ref16","first-page":"24829","article-title":"Piper: Multidimensional planner for DNN parallelization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tarnawski"},{"article-title":"Alpa: Automating inter-and intra-operator parallelism for distributed deep learning","year":"2022","author":"Zheng","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref20","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Narayanan"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519563"},{"article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","year":"2018","author":"Huang","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2018.2791442"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","year":"2019","author":"Shoeybi","key":"ref25"},{"year":"2022","key":"ref26","article-title":"NVIDIA\/Megatron-LM"},{"key":"ref27","first-page":"551","article-title":"ZeRO-Offload: Democratizing billion-scale model training","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Ren"},{"article-title":"Reducing activation recomputation in large transformer models","year":"2022","author":"Korthikanti","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"article-title":"Attention is all you need","year":"2017","author":"Vaswani","key":"ref30"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.07.001"},{"key":"ref33","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref34","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2021","journal-title":"J. Mach. Learn. Res."},{"article-title":"FastMoE: A fast mixture-of-expert training system","year":"2021","author":"He","key":"ref35"},{"article-title":"PaLM: Scaling language modeling with pathways","year":"2022","author":"Chowdhery","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"ref38","doi-asserted-by":"crossref","DOI":"10.1145\/2640087.2644155","article-title":"Parameter server for distributed machine learning","volume-title":"Proc. Big Learn. NIPS Workshop","author":"Li"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref40","first-page":"418","article-title":"TicTac: Accelerating distributed deep learning with communication scheduling","volume-title":"Proc. 2nd SysML Conf.","author":"Hashemi"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3094364"},{"article-title":"An overview of gradient descent optimization algorithms","year":"2016","author":"Ruder","key":"ref44"},{"key":"ref45","first-page":"307","article-title":"HetPipe: Enabling large $\\lbrace${DNN $\\rbrace$} training on (whimpy) heterogeneous $\\lbrace${ GPU$\\rbrace$} clusters through integration of pipelined model parallelism and data parallelism","volume-title":"Proc. USENIX Annu. Tech. 
Conf.","author":"Park"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/79173.79181"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/0165-1684(84)90013-6"},{"key":"ref48","first-page":"21","article-title":"A theoretical framework for back-propagation","volume-title":"Proceedings of Connectionist Models Summer School","author":"LeCun","year":"1988"},{"article-title":"Training deep nets with sublinear memory cost","year":"2016","author":"Chen","key":"ref49"},{"year":"2021","key":"ref50","article-title":"NCCL hangs during ncclSend and ncclRecv"},{"year":"2019","key":"ref51","article-title":"jcpeterson\/openwebtext"},{"year":"2022","key":"ref52","article-title":"huggingface\/wikipedia"},{"year":"2021","key":"ref53","article-title":"Wudaocorpora 2.0"},{"year":"2021","key":"ref54","article-title":"allenai\/c4"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"ref56","first-page":"1","article-title":"Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning","volume-title":"Proc. 15th USENIX Symp. Operating Syst. Des. Implementation","author":"Qiao"},{"article-title":"An empirical model of large-batch training","year":"2018","author":"McCandlish","key":"ref57"},{"article-title":"Curriculum learning: A regularization method for efficient and stable billion-scale GPT model pre-training","year":"2021","author":"Li","key":"ref58"},{"year":"2020","key":"ref59","article-title":"NVIDIA selene: Leadership-class supercomputing infrastructure"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref61"},{"article-title":"Automatic cross-replica sharding of weight update in data-parallel training","year":"2020","author":"Xu","key":"ref62"},{"article-title":"XPipe: Efficient pipeline model parallelism for multi-GPU DNN training","year":"2019","author":"Guan","key":"ref63"},{"key":"ref64","first-page":"6543","article-title":"TeraPipe: Token-level pipeline parallelism for training large-scale language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"article-title":"GSPMD: General and scalable parallelization for ML computation graphs","year":"2021","author":"Xu","key":"ref65"},{"key":"ref66","first-page":"10435","article-title":"Mesh-TensorFlow: Deep learning for supercomputers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Shazeer"},{"article-title":"GShard: Scaling giant models with conditional computation and automatic sharding","year":"2020","author":"Lepikhin","key":"ref67"},{"key":"ref68","first-page":"336","article-title":"MLPerf training benchmark","volume-title":"Proc. Mach. Learn. Syst. Conf.","author":"Mattson"},{"key":"ref69","first-page":"132","article-title":"Priority-based parameter propagation for distributed DNN training","volume-title":"Proc. Mach. Learn. Syst. Conf.","author":"Jayarajan"},{"key":"ref70","first-page":"463","article-title":"A unified architecture for accelerating distributed $\\lbrace${DNN$\\rbrace$} training in heterogeneous $\\lbrace${GPU\/CPU$\\rbrace$} clusters","volume-title":"Proc. 14th USENIX Symp. Operating Syst. Des. 
Implementation","author":"Jiang","year":"2020"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"article-title":"PanGu-: Large-scale autoregressive pretrained chinese language models with auto-parallel computation","year":"2021","author":"Zeng","key":"ref72"},{"article-title":"PipeTransformer: Automated elastic pipelining for distributed training of transformers","year":"2021","author":"He","key":"ref73"},{"key":"ref74","first-page":"15 451","article-title":"Efficient algorithms for device placement of DNN graph operators","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tarnawski"},{"key":"ref75","first-page":"1","article-title":"Beyond data and model parallelism for deep neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref76","first-page":"3981","article-title":"Learning generalizable device placement algorithms for distributed machine learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Bojja Venkatakrishnan"},{"article-title":"Reinforced genetic algorithm learning for optimizing computation graphs","year":"2019","author":"Paliwal","key":"ref77"},{"key":"ref78","first-page":"430","article-title":"Pathways: Asynchronous distributed dataflow for ML","volume-title":"Proc. Mach. Learn. Syst. Conf","author":"Barham"},{"article-title":"Tutel: Adaptive mixture-of-experts at scale","year":"2022","author":"Hwang","key":"ref79"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/10075651\/10050126.pdf?arnumber=10050126","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T19:19:38Z","timestamp":1733339978000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10050126\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5]]},"references-count":79,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2023.3247883","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"type":"print","value":"1045-9219"},{"type":"electronic","value":"1558-2183"},{"type":"electronic","value":"2161-9883"}],"subject":[],"published":{"date-parts":[[2023,5]]}}}