{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T16:04:02Z","timestamp":1729613042807,"version":"3.28.0"},"reference-count":61,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,12,1]],"date-time":"2021-12-01T00:00:00Z","timestamp":1638316800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,12,1]],"date-time":"2021-12-01T00:00:00Z","timestamp":1638316800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,12,1]],"date-time":"2021-12-01T00:00:00Z","timestamp":1638316800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,12]]},"DOI":"10.1109\/icdm51629.2021.00027","type":"proceedings-article","created":{"date-parts":[[2022,1,24]],"date-time":"2022-01-24T21:04:04Z","timestamp":1643058244000},"page":"171-180","source":"Crossref","is-referenced-by-count":2,"title":["LAGA: Lagged AllReduce with Gradient Accumulation for Minimal Idle Time"],"prefix":"10.1109","author":[{"given":"Ido","family":"Hakimi","sequence":"first","affiliation":[]},{"given":"Rotem Zamir","family":"Aviv","sequence":"additional","affiliation":[]},{"given":"Kfir Y.","family":"Levy","sequence":"additional","affiliation":[]},{"given":"Assaf","family":"Schuster","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/0041-5553(64)90137-5"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ALLERTON.2016.7852343"},{"key":"ref32","article-title":"Docker: Lightweight linux 
containers for consistent development and deployment","author":"merkel","year":"2014","journal-title":"Linux J"},{"key":"ref31","article-title":"Decoupled weight decay regularization","author":"loshchilov","year":"2019","journal-title":"International Conference on Learning Representations"},{"key":"ref30","article-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training","author":"lin","year":"2018","journal-title":"International Conference on Learning Representations"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"ref36","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3302260"},{"key":"ref34","first-page":"543","article-title":"A method for unconstrained convex minimization problem with the rate of convergence o(1\/k^2)","volume":"269","author":"nesterov","year":"1983","journal-title":"Doklady an USSR"},{"key":"ref60","first-page":"412","article-title":"Distributed hierarchical gpu parameter server for massive scale deep learning ads systems","volume":"2","author":"zhao","year":"2020","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"ref61","first-page":"4120","article-title":"Asynchronous stochastic gradient descent with delay compensation","author":"zheng","year":"2017","journal-title":"Proceedings of the 34th International Conference on Machine Learning volume 70 of Proceedings of Machine Learning Research"},{"key":"ref28","first-page":"583","article-title":"Scaling distributed machine learning with the parameter server","author":"li","year":"2014","journal-title":"Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation"},{"key":"ref27","article-title":"Adam: A method for stochastic 
optimization","author":"kingma","year":"2014","journal-title":"arXiv preprint arXiv 1412 6980"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"journal-title":"Nvidia GeForce RTX 2080 Ti Review","year":"0","key":"ref2"},{"journal-title":"Github","year":"0","key":"ref1"},{"key":"ref20","first-page":"439","article-title":"Accumulated gradient normalization","author":"hermans","year":"2017","journal-title":"Asian Conference on Machine Learning"},{"key":"ref22","article-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism","author":"huang","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref21","article-title":"Train longer, generalize better: closing the generalization gap in large batch training of neural networks","volume":"30","author":"hoffer","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref24","article-title":"Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes","author":"jia","year":"2018","journal-title":"arXiv preprint arXiv 1807 11205"},{"journal-title":"Optimized inter-gpu collective operations with nccl 2","year":"2017","author":"jeaugey","key":"ref23"},{"key":"ref26","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","author":"keskar","year":"2016","journal-title":"arXiv preprint arXiv 1609 04802"},{"key":"ref25","article-title":"A unified architecture for accelerating distributed DNN training in heterogeneous gpu\/cpu clusters","author":"jiang","year":"2020","journal-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00048"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1145\/3377454"},{"key":"ref59","first-page":"2350","article-title":"Staleness-aware async-sgd for distributed deep 
learning","author":"zhang","year":"2016","journal-title":"Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence"},{"key":"ref58","article-title":"Pangu-α: Large-scale autoregressive pretrained chinese language models with auto-parallel computation","author":"zeng","year":"2021","journal-title":"arXiv preprint arXiv 2104 12369"},{"key":"ref57","first-page":"87.1","article-title":"Wide residual networks","author":"zagoruyko","year":"2016","journal-title":"Proceedings of the British Machine Vision Conference (BMVC)"},{"key":"ref56","article-title":"Layered sgd: A decentralized and synchronous sgd algorithm for scalable deep neural network training","author":"yu","year":"2019","journal-title":"arXiv preprint arXiv 1906 03008"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015693"},{"key":"ref54","article-title":"Imagenet training in 24 minutes","author":"you","year":"2017","journal-title":"arXiv preprint arXiv 1709 05584"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00060"},{"key":"ref52","article-title":"Powersgd: Practical low-rank gradient compression for distributed optimization","volume":"32","author":"vogels","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472805"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421307"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref12","article-title":"Large scale distributed deep networks","volume":"25","author":"dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"key":"ref13","article-title":"Optimal distributed online prediction using mini-batches","author":"dekel","year":"2012","journal-title":"Journal of Machine Learning Research"},{"key":"ref14","article-title":"Pytorch 
lightning","author":"falcon","year":"2019","journal-title":"Github"},{"key":"ref15","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","author":"fedus","year":"2021","journal-title":"arXiv preprint arXiv 2101 06286"},{"article-title":"Analysis and comparison of distributed training techniques for deep neural networks in a dynamic environment","year":"2018","author":"gebremeskel","key":"ref16"},{"key":"ref17","article-title":"Accurate, large minibatch sgd: Training imagenet in 1 hour","author":"goyal","year":"2017","journal-title":"arXiv preprint arXiv 1706 02677"},{"key":"ref18","article-title":"Taming momentum in a distributed asynchronous environment","author":"hakimi","year":"2019","journal-title":"arXiv preprint arXiv 1907 11634"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref4","article-title":"Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes","author":"akiba","year":"2017","journal-title":"arXiv preprint arXiv 1711 11585"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5608"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3236367.3236381"},{"key":"ref5","article-title":"Performance analysis and comparison of distributed machine learning systems","author":"alqahtani","year":"2019","journal-title":"arXiv preprint arXiv 1909 11324"},{"key":"ref8","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP.2013.6639349","article-title":"Advances in optimizing recurrent networks","author":"bengio","year":"2013","journal-title":"IEEE International Conference on Acoustics Speech and Signal Processing"},{"key":"ref7","article-title":"Gap-aware mitigation of gradient staleness","author":"barkai","year":"2020","journal-title":"International Conference on Learning 
Representations"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI51249.2020.00021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2019.10.004"},{"key":"ref46","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"3rd International Conference on Learning Representations"},{"key":"ref45","doi-asserted-by":"crossref","first-page":"354","DOI":"10.1038\/nature24270","article-title":"Mastering the game of go without human knowledge","volume":"550","author":"silver","year":"2017","journal-title":"Nature"},{"key":"ref48","article-title":"On the importance of initialization and momentum in deep learning","author":"sutskever","year":"2013","journal-title":"Proceedings of the 30th International Conference on Machine Learning"},{"key":"ref47","article-title":"Don’t decay the learning rate, increase the batch size","author":"smith","year":"2018","journal-title":"International Conference on Learning Representations"},{"key":"ref42","article-title":"Horovod: fast and easy distributed deep learning in tensorflow","author":"sergeev","year":"2018","journal-title":"arXiv preprint arxiv 1802 05807"},{"key":"ref41","first-page":"785","article-title":"Scaling distributed machine learning with in-network aggregation","author":"sapio","year":"2021","journal-title":"18th USENIX Symposium on Networked Systems Design and Implementation (NSDI 21)"},{"key":"ref44","article-title":"Megatron-lm: Training multi-billion parameter language models using gpu model parallelism","author":"shoeybi","year":"2019","journal-title":"arXiv preprint arXiv 1909 08072"},{"key":"ref43","first-page":"1","article-title":"Measuring the effects of data parallelism on neural network training","volume":"20","author":"shallue","year":"2019","journal-title":"Journal of Machine Learning Research"}],"event":{"name":"2021 IEEE International Conference on Data Mining 
(ICDM)","start":{"date-parts":[[2021,12,7]]},"location":"Auckland, New Zealand","end":{"date-parts":[[2021,12,10]]}},"container-title":["2021 IEEE International Conference on Data Mining (ICDM)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9678506\/9678989\/09679133.pdf?arnumber=9679133","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,24]],"date-time":"2023-01-24T14:05:44Z","timestamp":1674569144000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9679133\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12]]},"references-count":61,"URL":"https:\/\/doi.org\/10.1109\/icdm51629.2021.00027","relation":{},"subject":[],"published":{"date-parts":[[2021,12]]}}}