{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,23]],"date-time":"2024-10-23T08:17:53Z","timestamp":1729671473579,"version":"3.28.0"},"reference-count":84,"publisher":"IEEE","license":[{"start":{"date-parts":[[2019,7,1]],"date-time":"2019-07-01T00:00:00Z","timestamp":1561939200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,7,1]],"date-time":"2019-07-01T00:00:00Z","timestamp":1561939200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,7,1]],"date-time":"2019-07-01T00:00:00Z","timestamp":1561939200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019,7]]},"DOI":"10.1109\/icdcs.2019.00173","type":"proceedings-article","created":{"date-parts":[[2019,11,1]],"date-time":"2019-11-01T00:28:38Z","timestamp":1572568118000},"page":"1742-1753","source":"Crossref","is-referenced-by-count":7,"title":["HPDL: Towards a General Framework for High-performance Distributed Deep Learning"],"prefix":"10.1109","author":[{"given":"Dongsheng","family":"Li","sequence":"first","affiliation":[]},{"given":"Zhiquan","family":"Lai","sequence":"additional","affiliation":[]},{"given":"Keshi","family":"Ge","sequence":"additional","affiliation":[]},{"given":"Yiming","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Zhaoning","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qinglin","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Huaimin","family":"Wang","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"year":"2018","author":"chen","journal-title":"TVM End-to-End Optimization Stack for Deep Learning","key":"ref73"},{"year":"2018","author":"lai","journal-title":"Cmsis-nn Efficient neural network kernels for arm cortex-m cpus","key":"ref72"},{"year":"2019","journal-title":"Library Computing","key":"ref71"},{"year":"2019","journal-title":"Intel MKL-DNN","key":"ref70"},{"key":"ref76","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"duchi","year":"2011","journal-title":"Journal of Machine Learning Research"},{"year":"2014","author":"kingma","journal-title":"Adam A method for stochastic optimization","key":"ref77"},{"key":"ref74","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-030-04167-0","article-title":"LAG: Lazily aggregated gradient for communication-efficient distributed learning","author":"chen","year":"2018","journal-title":"Advances in Neural IInformation Processing Systems"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.1109\/TBDATA.2015.2472014"},{"key":"ref75","first-page":"1139","article-title":"On the importance of initialization and momentum in deep learning","author":"sutskever","year":"2013","journal-title":"International Conference on Machine Learning"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1145\/2523616.2523633"},{"key":"ref78","first-page":"19","article-title":"Progressive neural architecture search","author":"liu","year":"2018","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV)"},{"key":"ref79","first-page":"7827","article-title":"Neural architecture optimization","author":"luo","year":"2018","journal-title":"Advances in neural information processing systems"},{"year":"2012","author":"bastien","journal-title":"Theano new features and speed improvements","key":"ref33"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1007\/s10107-012-0572-5"},{"key":"ref31","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters","author":"zhang","year":"2017","journal-title":"ArXiv Preprint"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1109\/SC.2018.00054"},{"key":"ref37","first-page":"1337","article-title":"Deep learning with COTS HPC systems","author":"coates","year":"2013","journal-title":"International Conference on Machine Learning"},{"key":"ref36","first-page":"1","article-title":"Chainer: a next-generation open source framework for deep learning","volume":"5","author":"tokui","year":"2015","journal-title":"Proceedings of Workshop on Machine Learning Systems (LearningSys) in The Twenty-ninth Annual Conference on Neural Information Processing Systems (NIPS)"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1145\/2733373.2806232"},{"year":"2015","author":"moritz","journal-title":"Spark Net Training Deep Networks in Spark","key":"ref34"},{"year":"2017","author":"smith","journal-title":"Don't decay the learning rate increase the batch size","key":"ref60"},{"doi-asserted-by":"publisher","key":"ref62","DOI":"10.1145\/3225058.3225069"},{"year":"2017","author":"you","journal-title":"Scaling sgd batch size to 32k for imagenet training","key":"ref61"},{"year":"2018","author":"mikami","journal-title":"ImageNet\/ResNet-50 Training in 224 Seconds","key":"ref63"},{"year":"2017","author":"lee","journal-title":"Introducing Big Basin Our next-generation AI hardware","key":"ref28"},{"year":"2017","author":"mirhoseini","journal-title":"Device placement optimization with reinforcement learning","key":"ref64"},{"year":"2017","author":"goyal","journal-title":"Accurate large minibatch sgd Training imagenet in 1 hour","key":"ref27"},{"year":"2018","author":"mirhoseini","journal-title":"A hierarchical model for device placement","key":"ref65"},{"key":"ref66","first-page":"1676","article-title":"Spotlight: Optimizing device placement for training deep neural networks","author":"gao","year":"2018","journal-title":"International Conference on Machine Learning"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1007\/s02011-011-1137-8"},{"key":"ref67","first-page":"9993","article-title":"Post: Device placement with cross-entropy minimization and proximal policy optimization","author":"sutskever","year":"2018","journal-title":"Advances in neural information processing systems"},{"year":"2018","author":"jia","journal-title":"Beyond data and model parallelism for deep neural networks","key":"ref68"},{"year":"2019","journal-title":"Cudnn","key":"ref69"},{"year":"2014","author":"simonyan","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition","key":"ref2"},{"key":"ref1","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Advances in neural information processing systems"},{"year":"2016","author":"abu-el-haija","journal-title":"Youtube-8m A large-scale video classification benchmark","key":"ref20"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.1007\/s11263-015-0816-y"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/CVPR.2018.00907"},{"key":"ref24","first-page":"571","article-title":"Project Adam: Building an efficient and scalable deep learning training system","volume":"14","author":"chilimbi","year":"2014","journal-title":"OSDI"},{"key":"ref23","first-page":"1223","article-title":"Large scale distributed deep networks","author":"dean","year":"2012","journal-title":"Advances in neural information processing systems"},{"year":"2018","author":"sergeev","journal-title":"Horovod fast and easy distributed deep learning in tensorflow","key":"ref26"},{"year":"2017","author":"cho","journal-title":"PowerAI DDL","key":"ref25"},{"key":"ref50","first-page":"873","article-title":"Distributed delayed stochastic optimization","author":"agarwal","year":"2011","journal-title":"Advances in neural information processing systems"},{"key":"ref51","first-page":"693","article-title":"Hogwild: A lock-free approach to parallelizing stochastic gradient descent","author":"recht","year":"2011","journal-title":"Advances in neural information processing systems"},{"key":"ref59","doi-asserted-by":"crossref","first-page":"661","DOI":"10.1145\/2623330.2623612","article-title":"Efficient mini-batch training for stochastic optimization","author":"li","year":"2014","journal-title":"ACM SIGKDD International Conference on Knowledge Discovery and Data Mining"},{"year":"2018","author":"jia","journal-title":"Highly Scalable Deep Learning Training System with Mixed-Precision Training ImageNet in Four Minutes","key":"ref58"},{"doi-asserted-by":"publisher","key":"ref57","DOI":"10.1016\/j.jpdc.2008.09.002"},{"year":"2017","author":"gibiansky","journal-title":"Bringing HPC Techniques to Deep Learning","key":"ref56"},{"doi-asserted-by":"publisher","key":"ref55","DOI":"10.14778\/1920841.1920931"},{"key":"ref54","first-page":"583","article-title":"Scaling distributed machine learning with the parameter server","volume":"14","author":"li","year":"2014","journal-title":"USENIX Symposium on Operating Systems Design and Implementation"},{"doi-asserted-by":"publisher","key":"ref53","DOI":"10.1016\/0005-1098(91)90003-K"},{"key":"ref52","first-page":"5330","article-title":"Can decentralized algorithms outperform centralized algorithms? a case study for decentralized parallel stochastic gradient descent","author":"lian","year":"2017","journal-title":"Advances in neural information processing systems"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.18653\/v1\/P17-1012"},{"key":"ref11","first-page":"265","article-title":"Tensorfiow: a system for large-scale machine learning","volume":"16","author":"abadi","year":"0","journal-title":"OSDI"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1145\/3097983.3098029"},{"year":"2015","author":"chen","journal-title":"Mxnet A flexible and efficient machine learning library for heterogeneous distributed systems","key":"ref12"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1145\/2647868.2654889"},{"year":"2017","author":"paszke","journal-title":"On Automatic Differentiation","key":"ref14"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1145\/3079856.3080246"},{"year":"2015","author":"han","journal-title":"Deep compression Compressing deep neural networks with pruning trained quantization and huffman coding","key":"ref82"},{"key":"ref16","first-page":"1742","article-title":"Flexpoint: An adaptive numerical format for efficient training of deep neural networks","author":"k\u00f6ster","year":"2017","journal-title":"Advances in neural information processing systems"},{"year":"2019","journal-title":"Nccl 2 0","key":"ref81"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/MICRO.2016.7783723"},{"doi-asserted-by":"publisher","key":"ref84","DOI":"10.1145\/3183713.3196894"},{"year":"2018","author":"devlin","journal-title":"BERT Pre-training of deep bidirectional transformers for language understanding","key":"ref18"},{"year":"2016","author":"zhang","journal-title":"The ZipML framework for training models with end-to-end low precision The cans the cannots and a little bit of deep learning","key":"ref83"},{"key":"ref19","first-page":"7","article-title":"Openimages: A public dataset for large-scale multi-label and multi-class image classification","volume":"2","author":"krasin","year":"2016","journal-title":"Dataset"},{"year":"0","author":"xiao","journal-title":"Gandiva Introspective cluster scheduling for deep learning","key":"ref80"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/CVPR.2016.90"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/CVPR.2015.7298594"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/CVPR.2017.376"},{"year":"2016","author":"bojarski","journal-title":"End to End Learning for Self-Driving Cars","key":"ref5"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.3115\/v1\/P15-1001"},{"year":"2014","author":"bahdanau","journal-title":"Neural machine translation by jointly learning to align and translate","key":"ref7"},{"doi-asserted-by":"publisher","key":"ref49","DOI":"10.1109\/BigData.2016.7840591"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref46","first-page":"2595","article-title":"Parallelized stochastic gradient descent","author":"zinkevich","year":"2010","journal-title":"Advances in neural information processing systems"},{"year":"2019","journal-title":"Core ML","key":"ref45"},{"doi-asserted-by":"publisher","key":"ref48","DOI":"10.1109\/ICASSP.2016.7472805"},{"key":"ref47","first-page":"456","article-title":"Distributed training strategies for the structured perceptron","author":"mcdonald","year":"2010","journal-title":"Human Language Technologies The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics"},{"key":"ref42","first-page":"561","article-title":"Ray: A distributed framework for emerging {AI} applications","author":"moritz","year":"2018","journal-title":"13th USENIX Symposium on Operating Systems Design and Implementation ( OSDI 18)"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.1093\/nsr\/nwx018"},{"year":"2019","journal-title":"Onnx","key":"ref44"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1109\/SC.2018.00053"}],"event":{"name":"2019 IEEE 39th International Conference on Distributed Computing Systems (ICDCS)","start":{"date-parts":[[2019,7,7]]},"location":"Dallas, TX, USA","end":{"date-parts":[[2019,7,10]]}},"container-title":["2019 IEEE 39th International Conference on Distributed Computing Systems (ICDCS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8867821\/8884790\/08885217.pdf?arnumber=8885217","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,3]],"date-time":"2022-10-03T05:35:26Z","timestamp":1664775326000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8885217\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7]]},"references-count":84,"URL":"https:\/\/doi.org\/10.1109\/icdcs.2019.00173","relation":{},"subject":[],"published":{"date-parts":[[2019,7]]}}}