{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,7,27]],"date-time":"2024-07-27T09:48:48Z","timestamp":1722073728476},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2020,3,1]],"date-time":"2020-03-01T00:00:00Z","timestamp":1583020800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Parallel and Distributed Computing"],"published-print":{"date-parts":[[2020,3]]},"DOI":"10.1016\/j.jpdc.2019.10.004","type":"journal-article","created":{"date-parts":[[2019,11,2]],"date-time":"2019-11-02T13:38:53Z","timestamp":1572701933000},"page":"65-76","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":37,"special_numbering":"C","title":["A Hitchhiker\u2019s Guide On Distributed Training Of Deep Neural Networks"],"prefix":"10.1016","volume":"137","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-0971-4873","authenticated-orcid":false,"given":"Karanbir Singh","family":"Chahal","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5696-8107","authenticated-orcid":false,"given":"Manraj Singh","family":"Grover","sequence":"additional","affiliation":[]},{"given":"Kuntal","family":"Dey","sequence":"additional","affiliation":[]},{"given":"Rajiv Ratn","family":"Shah","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.jpdc.2019.10.004_b1","unstructured":"M. Abadi, P. Barham, J. Chen, Z. Chen, A. Davis, J. Dean, M. Devin, S. Ghemawat, G. Irving, M. Isard, et al. Tensorflow: A system for large-scale machine learning, in: 12th {USENIX} Symposium on Operating Systems Design and Implementation, {OSDI} 16, 2016, pp. 265\u2013283."},{"key":"10.1016\/j.jpdc.2019.10.004_b2","series-title":"Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing","first-page":"440","article-title":"Sparse communication for distributed gradient descent","author":"Aji","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b3","series-title":"Advances in Neural Information Processing Systems, vol. 30","first-page":"1709","article-title":"QSGD: Communication-efficient SGD via gradient quantization and encoding","author":"Alistarh","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b4","series-title":"Proceedings of the 24th European MPI Users\u2019 Group Meeting","first-page":"13:1","article-title":"What does fault tolerant deep learning need from MPI?","author":"Amatya","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b5","series-title":"Proceedings of COMPSTAT\u20192010","first-page":"177","article-title":"Large-scale machine learning with stochastic gradient descent","author":"Bottou","year":"2010"},{"key":"10.1016\/j.jpdc.2019.10.004_b6","series-title":"hydra-hoard\/hydra","author":"Chahal","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b7","doi-asserted-by":"crossref","unstructured":"C.-Y. Chen, J. Choi, D. Brand, A. Agrawal, W. Zhang, K. Gopalakrishnan, AdaComp: Adaptive residual gradient compression for data-parallel distributed training, in: AAAI Conference on Artificial Intelligence, 2018, URL https:\/\/aaai.org\/ocs\/index.php\/AAAI\/AAAI18\/paper\/view\/16859.","DOI":"10.1609\/aaai.v32i1.11728"},{"key":"10.1016\/j.jpdc.2019.10.004_b8","unstructured":"J. Chen, R. Monga, S. Bengio, R. 
Jozefowicz, Revisiting distributed synchronous SGD, in: International Conference on Learning Representations Workshop, 2016."},{"key":"10.1016\/j.jpdc.2019.10.004_b9","doi-asserted-by":"crossref","unstructured":"G. Cong, G. Domeniconi, J. Shapiro, F. Zhou, B. Chen, Accelerating deep neural network training for action recognition on a cluster of GPUs, in: 2018 30th International Symposium on Computer Architecture and High Performance Computing, SBAC-PAD, 2018, pp. 298\u2013305, http:\/\/dx.doi.org\/10.1109\/CAHPC.2018.8645861.","DOI":"10.1109\/CAHPC.2018.8645861"},{"key":"10.1016\/j.jpdc.2019.10.004_b10","series-title":"TechCrunch","article-title":"How big is facebook\u2019s data? 2.5 billion pieces of content and 500+ terabytes ingested every day","author":"Constine","year":"2012"},{"key":"10.1016\/j.jpdc.2019.10.004_b11","series-title":"Advances in Neural Information Processing Systems","first-page":"1223","article-title":"Large scale distributed deep networks","author":"Dean","year":"2012"},{"key":"10.1016\/j.jpdc.2019.10.004_b12","doi-asserted-by":"crossref","unstructured":"J. Deng, W. Dong, R. Socher, L. Li, . Kai\u00a0Li, L. Fei-Fei, ImageNet: A large-scale hierarchical image database, in: 2009 IEEE Conference on Computer Vision and Pattern Recognition, 2009, pp. 248\u2013255, http:\/\/dx.doi.org\/10.1109\/CVPR.2009.5206848.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10.1016\/j.jpdc.2019.10.004_b13","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.jpdc.2019.10.004_b14","series-title":"Proceedings of the Workshop on Machine Learning in High Performance Computing Environments","first-page":"1","article-title":"Communication quantization for data-parallel training of deep neural networks","author":"Dryden","year":"2016"},{"key":"10.1016\/j.jpdc.2019.10.004_b15","doi-asserted-by":"crossref","unstructured":"S. Ghemawat, H. Gobioff, S.-T. Leung, The Google file system, in: Proceedings of the 19th ACM Symposium on Operating Systems Principles, 2003, pp. 20\u201343, Bolton Landing, NY.","DOI":"10.1145\/945445.945450"},{"key":"10.1016\/j.jpdc.2019.10.004_b16","series-title":"Bringing HPC Techniques to Deep Learning","author":"Gibiansky","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b17","series-title":"Cloud TPU","author":"Google","year":"2019"},{"key":"10.1016\/j.jpdc.2019.10.004_b18","series-title":"Accurate, large minibatch SGD: Training imagenet in 1 hour","author":"Goyal","year":"2017"},{"issue":"10","key":"10.1016\/j.jpdc.2019.10.004_b19","doi-asserted-by":"crossref","first-page":"2222","DOI":"10.1109\/TNNLS.2016.2582924","article-title":"LSTM: A search space odyssey","volume":"28","author":"Greff","year":"2017","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.jpdc.2019.10.004_b20","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: 2016 IEEE Conference on Computer Vision and Pattern Recognition , CVPR, 2016, pp. 770\u2013778, http:\/\/dx.doi.org\/10.1109\/CVPR.2016.90.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.jpdc.2019.10.004_b21","doi-asserted-by":"crossref","unstructured":"B. Jacob, S. Kligys, B. Chen, M. Zhu, M. Tang, A. Howard, H. Adam, D. 
Kalenichenko, Quantization and training of neural networks for efficient integer-arithmetic-only inference, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 2704\u20132713.","DOI":"10.1109\/CVPR.2018.00286"},{"key":"10.1016\/j.jpdc.2019.10.004_b22","series-title":"Proceedings of the 22nd ACM International Conference on Multimedia","first-page":"675","article-title":"Caffe: Convolutional architecture for fast feature embedding","author":"Jia","year":"2014"},{"key":"10.1016\/j.jpdc.2019.10.004_b23","unstructured":"X. Jia, S. Song, W. He, Y. Wang, H. Rong, F. Zhou, L. Xie, Z. Guo, Y. Yang, L. Yu, T. Chen, G. Hu, S. Shi, X. Chu, Highly scalable deep learning training system with mixed-precision: training imagenet in four minutes, in: NeurIPS Workshop on Systems for ML and Open Source Software. 2018."},{"key":"10.1016\/j.jpdc.2019.10.004_b24","unstructured":"P.H. Jin, Q. Yuan, F. Iandola, K. Keutzer, How to scale distributed deep learning? in: NIPS Workshop on Machine Learning Systems, 2016."},{"key":"10.1016\/j.jpdc.2019.10.004_b25","series-title":"Advances in Neural Information Processing Systems","first-page":"315","article-title":"Accelerating stochastic gradient descent using predictive variance reduction","author":"Johnson","year":"2013"},{"key":"10.1016\/j.jpdc.2019.10.004_b26","unstructured":"D.P. Kingma, J. Ba, Adam: A method for stochastic optimization, in: 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7\u20139, 2015, Conference Track Proceedings, 2015, http:\/\/arxiv.org\/abs\/1412.6980."},{"key":"10.1016\/j.jpdc.2019.10.004_b27","unstructured":"J. Kone\u010dn\u00fd, H.B. McMahan, F.X. Yu, P. Richtarik, A.T. Suresh, D. Bacon, Federated learning: Strategies for improving communication efficiency, in: NIPS Workshop on Private Multi-Party Machine Learning, 2016, https:\/\/arxiv.org\/abs\/1610.05492."},{"key":"10.1016\/j.jpdc.2019.10.004_b28","series-title":"Quantizing deep convolutional networks for efficient inference: A whitepaper","author":"Krishnamoorthi","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b29","series-title":"Learning Multiple Layers of Features from Tiny Images","author":"Krizhevsky","year":"2009"},{"key":"10.1016\/j.jpdc.2019.10.004_b30","series-title":"Advances in Neural Information Processing Systems, vol. 25","first-page":"1097","article-title":"Imagenet classification with deep convolutional neural networks","author":"Krizhevsky","year":"2012"},{"key":"10.1016\/j.jpdc.2019.10.004_b31","unstructured":"M. Li, D.G. Andersen, A. Smola, Distributed delayed proximal gradient methods , in: NIPS Workshop on Optimization for Machine Learning, vol. 3, 2013, p. 3."},{"key":"10.1016\/j.jpdc.2019.10.004_b32","series-title":"Advances in Neural Information Processing Systems","first-page":"19","article-title":"Communication efficient distributed machine learning with the parameter server","author":"Li","year":"2014"},{"key":"10.1016\/j.jpdc.2019.10.004_b33","unstructured":"Y. Lin, S. Han, H. Mao, Y. Wang, B. 
Dally, Deep gradient compression: Reducing the communication bandwidth for distributed training, in: International Conference on Learning Representations, 2018, URL https:\/\/openreview.net\/forum?id=SkhQHMW0W."},{"key":"10.1016\/j.jpdc.2019.10.004_b34","doi-asserted-by":"crossref","first-page":"11","DOI":"10.1016\/j.neucom.2016.12.038","article-title":"A survey of deep neural network architectures and their applications","volume":"234","author":"Liu","year":"2017","journal-title":"Neurocomputing"},{"key":"10.1016\/j.jpdc.2019.10.004_b35","series-title":"SC 18 Super Computing","article-title":"Tree-based fault-tolerant collective operations for MPI","author":"Margolin","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b36","unstructured":"P. Micikevicius, S. Narang, J. Alben, G. Diamos, E. Elsen, D. Garcia, B. Ginsburg, M. Houston, O. Kuchaiev, G. Venkatesh, H. Wu, Mixed precision training, in: International Conference on Learning Representations, 2018, URL https:\/\/openreview.net\/forum?id=r1gs9JgRZ."},{"key":"10.1016\/j.jpdc.2019.10.004_b37","series-title":"Playing atari with deep reinforcement learning","author":"Mnih","year":"2013"},{"key":"10.1016\/j.jpdc.2019.10.004_b38","series-title":"Mixed Precision Training: Tensor Cores","author":"Nvidia","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b39","unstructured":"A. Paszke, S. Gross, S. Chintala, G. Chanan, E. Yang, Z. DeVito, Z. Lin, A. Desmaison, L. Antiga, A. Lerer, Automatic differentiation in PyTorch, in: NIPS Workshop on Autodiff, 2017."},{"key":"10.1016\/j.jpdc.2019.10.004_b40","series-title":"Computational Science - ICCS 2004","first-page":"1","article-title":"Optimization of collective reduction operations","author":"Rabenseifner","year":"2004"},{"key":"10.1016\/j.jpdc.2019.10.004_b41","series-title":"Advances in Neural Information Processing Systems","first-page":"693","article-title":"Hogwild: A lock-free approach to parallelizing stochastic gradient descent","author":"Recht","year":"2011"},{"key":"10.1016\/j.jpdc.2019.10.004_b42","doi-asserted-by":"crossref","unstructured":"J. Redmon, A. Farhadi, YOLO9000: Better, faster, stronger, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 7263\u20137271.","DOI":"10.1109\/CVPR.2017.690"},{"key":"10.1016\/j.jpdc.2019.10.004_b43","series-title":"An overview of gradient descent optimization algorithms","author":"Ruder","year":"2016"},{"issue":"3","key":"10.1016\/j.jpdc.2019.10.004_b44","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","article-title":"Imagenet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2015","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.jpdc.2019.10.004_b45","doi-asserted-by":"crossref","first-page":"85","DOI":"10.1016\/j.neunet.2014.09.003","article-title":"Deep learning in neural networks: An overview","volume":"61","author":"Schmidhuber","year":"2015","journal-title":"Neural Netw."},{"key":"10.1016\/j.jpdc.2019.10.004_b46","series-title":"INTERSPEECH","article-title":"1-bit stochastic gradient descent and application to data-parallel distributed training of speech DNNs","author":"Seide","year":"2014"},{"key":"10.1016\/j.jpdc.2019.10.004_b47","series-title":"Horovod: Fast and easy distributed deep learning in tensorflow","author":"Sergeev","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b48","doi-asserted-by":"crossref","unstructured":"K. Shvachko, H. Kuang, S. Radia, R. 
Chansler, The hadoop distributed file system, in: 2010 IEEE 26th Symposium on Mass Storage Systems and Technologies, MSST, 2010, pp. 1\u201310, http:\/\/dx.doi.org\/10.1109\/MSST.2010.5496972.","DOI":"10.1109\/MSST.2010.5496972"},{"key":"10.1016\/j.jpdc.2019.10.004_b49","doi-asserted-by":"crossref","unstructured":"L.N. Smith, Cyclical learning rates for training neural networks, in: 2017 IEEE Winter Conference on Applications of Computer Vision , WACV, 2017, pp. 464\u2013472, http:\/\/dx.doi.org\/10.1109\/WACV.2017.58.","DOI":"10.1109\/WACV.2017.58"},{"key":"10.1016\/j.jpdc.2019.10.004_b50","series-title":"A disciplined approach to neural network hyper-parameters: Part 1 - learning rate, batch size, momentum, and weight decay","author":"Smith","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b51","unstructured":"S.L. Smith, P.-J. Kindermans, Q.V. Le, Don\u2019t decay the learning rate, increase the batch size, in: International Conference on Learning Representations, 2018, URL https:\/\/openreview.net\/forum?id=B1Yy1BxCZ."},{"key":"10.1016\/j.jpdc.2019.10.004_b52","series-title":"INTERSPEECH","article-title":"Scalable distributed DNN training using commodity GPU cloud computing","author":"Strom","year":"2015"},{"key":"10.1016\/j.jpdc.2019.10.004_b53","doi-asserted-by":"crossref","unstructured":"C. Szegedy, V. Vanhoucke, S. Ioffe, J. Shlens, Z. Wojna, Rethinking the inception architecture for computer vision, in: the IEEE Conference on Computer Vision and Pattern Recognition , CVPR, 2016.","DOI":"10.1109\/CVPR.2016.308"},{"issue":"1","key":"10.1016\/j.jpdc.2019.10.004_b54","doi-asserted-by":"crossref","first-page":"49","DOI":"10.1177\/1094342005051521","article-title":"Optimization of collective communication operations in MPICH","volume":"19","author":"Thakur","year":"2005","journal-title":"Int. J. High Perform. Comput. Appl."},{"issue":"2","key":"10.1016\/j.jpdc.2019.10.004_b55","first-page":"26","article-title":"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude","volume":"4","author":"Tieleman","year":"2012","journal-title":"COURSERA: Neural Netw. Mach. Learn."},{"key":"10.1016\/j.jpdc.2019.10.004_b56","series-title":"Advances in Neural Information Processing Systems, vol. 30","first-page":"1509","article-title":"Terngrad: Ternary gradients to reduce communication in distributed deep learning","author":"Wen","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b57","series-title":"White paper: BFLOAT16 \u2014 Hardware Numerics Definition","year":"2018"},{"key":"10.1016\/j.jpdc.2019.10.004_b58","series-title":"Large batch training of convolutional networks","author":"You","year":"2017"},{"key":"10.1016\/j.jpdc.2019.10.004_b59","series-title":"Advances in Neural Information Processing Systems","first-page":"685","article-title":"Deep learning with elastic averaging SGD","author":"Zhang","year":"2015"},{"key":"10.1016\/j.jpdc.2019.10.004_b60","series-title":"Proceedings of the Twenty-Fifth International Joint Conference on Artificial Intelligence","first-page":"2350","article-title":"Staleness-aware async-SGD for distributed deep learning","author":"Zhang","year":"2016"},{"key":"10.1016\/j.jpdc.2019.10.004_b61","unstructured":"R. Zhang, J. Kwok, Asynchronous distributed ADMM for consensus optimization, in: International Conference on Machine Learning, 2014, pp. 
1701\u20131709."},{"key":"10.1016\/j.jpdc.2019.10.004_b62","series-title":"Proceedings of the 34th International Conference on Machine Learning - Volume 70","first-page":"4120","article-title":"Asynchronous stochastic gradient descent with delay compensation","author":"Zheng","year":"2017"}],"container-title":["Journal of Parallel and Distributed Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731518308712?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731518308712?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2022,8,3]],"date-time":"2022-08-03T23:22:06Z","timestamp":1659568926000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0743731518308712"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,3]]},"references-count":62,"alternative-id":["S0743731518308712"],"URL":"https:\/\/doi.org\/10.1016\/j.jpdc.2019.10.004","relation":{},"ISSN":["0743-7315"],"issn-type":[{"value":"0743-7315","type":"print"}],"subject":[],"published":{"date-parts":[[2020,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A Hitchhiker\u2019s Guide On Distributed Training Of Deep Neural Networks","name":"articletitle","label":"Article Title"},{"value":"Journal of Parallel and Distributed Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jpdc.2019.10.004","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 Elsevier Inc. All rights reserved.","name":"copyright","label":"Copyright"}]}}