{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,8]],"date-time":"2024-09-08T03:56:14Z","timestamp":1725767774131},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,11,6]],"date-time":"2023-11-06T00:00:00Z","timestamp":1699228800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,11,6]],"date-time":"2023-11-06T00:00:00Z","timestamp":1699228800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,11,6]]},"DOI":"10.1109\/iccd58817.2023.00031","type":"proceedings-article","created":{"date-parts":[[2023,12,22]],"date-time":"2023-12-22T14:22:28Z","timestamp":1703254948000},"page":"150-157","source":"Crossref","is-referenced-by-count":1,"title":["A Cost-Efficient Failure-Tolerant Scheme for Distributed DNN Training"],"prefix":"10.1109","author":[{"given":"Menglei","family":"Chen","sequence":"first","affiliation":[{"name":"WNLO, Huazhong University of Science and Technology,Wuhan,China"}]},{"given":"Yu","family":"Hua","sequence":"additional","affiliation":[{"name":"WNLO, Huazhong University of Science and Technology,Wuhan,China"}]},{"given":"Rong","family":"Bai","sequence":"additional","affiliation":[{"name":"WNLO, Huazhong University of Science and Technology,Wuhan,China"}]},{"given":"Jianming","family":"Huang","sequence":"additional","affiliation":[{"name":"WNLO, Huazhong University of Science and Technology,Wuhan,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"journal-title":"NIPS","article-title":"Language models are few-shot learners","year":"2020","author":"Brown","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01417"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380362"},{"volume-title":"Analysis of large-scale multi-tenant gpu clusters for dnn training workloads","year":"2019","author":"Jeon","key":"ref5"},{"journal-title":"OSDI","article-title":"Heterogeneity-aware cluster scheduling policies for deep learning workloads","year":"2020","author":"Narayanan","key":"ref6"},{"journal-title":"OSDI","article-title":"Gandiva: Introspective cluster scheduling for deep learning","year":"2018","author":"Xiao","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid49817.2020.00-76"},{"journal-title":"NSDI","article-title":"Check-n-run: a check-pointing system for training deep learning recommendation models","year":"2022","author":"Eisenman","key":"ref9"},{"journal-title":"FAST","article-title":"Checkfreq: Frequent, fine-grained dnn checkpointing","year":"2021","author":"Mohan","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582055"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/iccd53106.2021.00057"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD53106.2021.00033"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD53106.2021.00037"},{"journal-title":"OSDI","article-title":"Listdb: Union of write-ahead logs and persistent skiplists for incremental checkpointing on persistent memory","year":"2022","author":"Kim","key":"ref15"},{"journal-title":"FAST","article-title":"NOVA: A log-structured file system for hybrid volatile\/non-volatile main memories","year":"2016","author":"Xu","key":"ref16"},{"journal-title":"FAST","article-title":"FORD: fast one-sided rdma-based distributed transactions for disaggregated persistent memory","year":"2022","author":"Zhang","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00035"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507758"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.14778\/3436905.3436921"},{"journal-title":"FAST","article-title":"An empirical guide to the behavior and use of scalable persistent memory","year":"2020","author":"Yang","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3204454"},{"journal-title":"OSDI","article-title":"Tensorflow: A system for large-scale machine learning","year":"2016","author":"Abadi","key":"ref23"},{"journal-title":"NIPS","article-title":"Pytorch: An imperative style, high-performance deep learning library","year":"2019","author":"Paszke","key":"ref24"},{"journal-title":"ICML","article-title":"Fault tolerance in iterative-convergent machine learning","year":"2019","author":"Qiao","key":"ref25"},{"volume-title":"Poseidon: An efficient communication architecture for distributed deep learning on gpu clusters","year":"2017","author":"Zhang","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"volume-title":"ZeRO-Offload: Democratizing Billion-Scale model training","year":"2021","author":"Ren","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.14778\/3372716.3372728"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.14778\/3476249.3476264"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"journal-title":"ICLR","article-title":"Very deep convolutional networks for large-scale image recognition","year":"2015","author":"Simonyan","key":"ref33"},{"journal-title":"NIPS","article-title":"Matching networks for one shot learning","year":"2016","author":"Vinyals","key":"ref34"},{"journal-title":"OpenAI blog","article-title":"Language models are unsupervised multitask learners","year":"2019","author":"Radford","key":"ref35"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref36"},{"article-title":"Pointer sentinel mixture models","year":"2016","author":"Merity","key":"ref37"},{"article-title":"Horovod: fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref38"},{"journal-title":"NSDI","article-title":"Tiresias: A gpu cluster manager for distributed deep learning","year":"2019","author":"Gu","key":"ref39"}],"event":{"name":"2023 IEEE 41st International Conference on Computer Design (ICCD)","start":{"date-parts":[[2023,11,6]]},"location":"Washington, DC, USA","end":{"date-parts":[[2023,11,8]]}},"container-title":["2023 IEEE 41st International Conference on Computer Design (ICCD)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10360938\/10360940\/10361018.pdf?arnumber=10361018","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,12]],"date-time":"2024-01-12T17:37:45Z","timestamp":1705081065000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10361018\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,6]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/iccd58817.2023.00031","relation":{},"subject":[],"published":{"date-parts":[[2023,11,6]]}}}