{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T05:33:59Z","timestamp":1733376839246,"version":"3.30.1"},"reference-count":62,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"3","license":[{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"Huawei Innovation Research Program","award":["HK RGC ECS 27200916","HK RGC GRF 17207117","17202318","27208720"]},{"name":"Croucher Innovation Award"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802358"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"USTC Research Funds of Double First-Class Initiative","award":["YD2150002006"]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2022,3,1]]},"DOI":"10.1109\/tpds.2021.3094364","type":"journal-article","created":{"date-parts":[[2021,7,2]],"date-time":"2021-07-02T19:29:50Z","timestamp":1625254190000},"page":"489-506","source":"Crossref","is-referenced-by-count":20,"title":["vPipe: A Virtualized Acceleration System for Achieving Efficient and Scalable Pipeline Parallel DNN Training"],"prefix":"10.1109","volume":"33","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"first","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Fanxin","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2807-9780","authenticated-orcid":false,"given":"Xusheng","family":"Chen","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Xiuxian","family":"Guan","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8684-8509","authenticated-orcid":false,"given":"Jianyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Dong","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Yuhao","family":"Qing","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"given":"Sen","family":"Wang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technologies, Co. Ltd, Shenzhen, China"}]},{"given":"Peng","family":"Wang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technologies, Co. Ltd, Shenzhen, China"}]},{"given":"Gong","family":"Zhang","sequence":"additional","affiliation":[{"name":"2012 Labs, Theory Lab, Huawei Technologies, Co. Ltd, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7064-6120","authenticated-orcid":false,"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"School of Computer Science and Technology, University of Science and Technology of China, Hefei, Anhui, China"}]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7746-440X","authenticated-orcid":false,"given":"Heming","family":"Cui","sequence":"additional","affiliation":[{"name":"Department of Computer Science, The University of Hong Kong, Hong Kong, China"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"ref38","first-page":"1","article-title":"ZeRO: Memory optimizations toward training trillion parameter models","author":"rajbhandari","year":"2020","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1287\/opre.43.3.477"},{"article-title":"Regularizing and optimizing LSTM language models","year":"2017","author":"merity","key":"ref31"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"ref36","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref35","first-page":"25","article-title":"Hermes: Dynamic partitioning for distributed social network graph databases","author":"nicoara","year":"2015","journal-title":"Proc Intl Conf Extending Database Technology"},{"article-title":"DyNet: The dynamic neural network toolkit","year":"2017","author":"neubig","key":"ref34"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/BigComp.2018.00136"},{"article-title":"Few-shot neural architecture search","year":"2020","author":"zhao","key":"ref62"},{"year":"0","key":"ref61"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICSI.1990.138741"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2016.12.038"},{"key":"ref2","first-page":"97","article-title":"An integrated genetic algorithm with dynamic hill climbing for VLSI circuit partitioning","author":"areibi","year":"2000","journal-title":"Proc Genet Evol Comput Conf"},{"key":"ref1","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","author":"abadi","year":"2016","journal-title":"Proc 12th USENIX Symp Operating Syst Des Implementation"},{"key":"ref20","first-page":"6869","article-title":"Quantized neural networks: Training neural networks with low precision weights and activations","volume":"18","author":"hubara","year":"2017","journal-title":"J Mach Learn Res"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2189750.2151001"},{"article-title":"Beyond data and model parallelism for deep neural networks","year":"2018","author":"jia","key":"ref21"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/224170.224229"},{"key":"ref23","first-page":"113","article-title":"Multilevel graph partitioning schemes","author":"karypis","year":"0"},{"article-title":"One weird trick for parallelizing convolutional neural networks","year":"2014","author":"krizhevsky","key":"ref26"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1002\/j.1538-7305.1970.tb01770.x"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1016\/j.orl.2005.10.003"},{"year":"0","key":"ref51"},{"article-title":"PipeMare: Asynchronous pipeline parallel DNN training","year":"2019","author":"yang","key":"ref59"},{"article-title":"Google’s neural machine translation system: Bridging the gap between human and machine translation","year":"2016","author":"wu","key":"ref58"},{"article-title":"AlphaX: Exploring neural architectures with deep neural networks and Monte Carlo tree search","year":"2019","author":"wang","key":"ref57"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"article-title":"Sample-efficient neural architecture search by learning action space","year":"2019","author":"wang","key":"ref55"},{"key":"ref54","doi-asserted-by":"crossref","DOI":"10.1109\/SC41405.2020.00023","article-title":"Scaling distributed deep learning workloads beyond the memory capacity with KARMA","author":"wahib","year":"2020"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref52","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"devlin","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD.1993.580083"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/BF01201263"},{"key":"ref13","volume":"3","author":"fj\u00e4llstr\u00f6m","year":"1998","journal-title":"Algorithms for graph partitioning A survey"},{"article-title":"XPipe: Efficient pipeline model parallelism for multi-GPU DNN training","year":"2019","author":"guan","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1137\/S0895479892238270"},{"key":"ref17","first-page":"28-es","article-title":"A multi-level algorithm for partitioning graphs","author":"hendrickson","year":"0","journal-title":"Proc ACM\/IEEE Conf Supercomputing"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"ref19","first-page":"103","article-title":"GPipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Proc Adv Neural Inf Process Syst"},{"year":"0","key":"ref4"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CCECE.1998.685556"},{"article-title":"Language models are few-shot learners","year":"2020","author":"brown","key":"ref6"},{"key":"ref5","first-page":"499","article-title":"PipeSwitch: Fast pipelined context switching for deep learning applications","author":"bai","year":"2020","journal-title":"Proc 14th USENIX Symp Operating Syst Des Implementation"},{"article-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems","year":"2015","author":"chen","key":"ref8"},{"key":"ref7","article-title":"A heuristic for reducing fill-in in sparse matrix factorization","author":"bui","year":"1993","journal-title":"Tech Rep"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/115992.116012"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1355"},{"article-title":"The evolved transformer","year":"2019","author":"so","key":"ref45"},{"key":"ref48","first-page":"538","article-title":"Unified geometric approach to graph separators","author":"teng","year":"0"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2017.2778089"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2323"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/2670338"},{"article-title":"Very deep convolutional networks for large-scale image recognition","year":"2014","author":"simonyan","key":"ref44"},{"article-title":"Horovod: Fast and easy distributed deep learning in tensorflow","year":"2018","author":"sergeev","key":"ref43"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9497774\/09472938.pdf?arnumber=9472938","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,4]],"date-time":"2024-12-04T19:20:48Z","timestamp":1733340048000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9472938\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3,1]]},"references-count":62,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2021.3094364","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"type":"print","value":"1045-9219"},{"type":"electronic","value":"1558-2183"},{"type":"electronic","value":"2161-9883"}],"subject":[],"published":{"date-parts":[[2022,3,1]]}}}