{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T10:40:09Z","timestamp":1729939209409,"version":"3.28.0"},"reference-count":42,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004735","name":"Natural Science Foundation of Hunan Province","doi-asserted-by":"publisher","award":["2022JJ40096"],"id":[{"id":"10.13039\/501100004735","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Future Generation Computer Systems"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1016\/j.future.2023.08.020","type":"journal-article","created":{"date-parts":[[2023,8,18]],"date-time":"2023-08-18T07:16:48Z","timestamp":1692343008000},"page":"10-20","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["InferFair: Towards QoS-aware scheduling for performance isolation guarantee in heterogeneous model serving systems"],"prefix":"10.1016","volume":"150","author":[{"given":"Yaqiong","family":"Peng","sequence":"first","affiliation":[]},{"given":"Haocheng","family":"Peng","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"year":"2016","series-title":"Apple siri","key":"10.1016\/j.future.2023.08.020_b1"},{"year":"2012","series-title":"Large scale language modeling in automatic speech recognition","author":"Chelba","key":"10.1016\/j.future.2023.08.020_b2"},{"key":"10.1016\/j.future.2023.08.020_b3","article-title":"ImageNet large scale visual recognition challenge","volume":"115","author":"Russakovsky","year":"2014","journal-title":"Int. J. Comput. Vis."},{"year":"2021","series-title":"Alibaba DNN inference","key":"10.1016\/j.future.2023.08.020_b4"},{"key":"10.1016\/j.future.2023.08.020_b5","unstructured":"D. Crankshaw, X. Wang, G. Zhou, M.J. Franklin, J.E. Gonzalez, I. Stoica, Clipper: A Low-Latency Online Prediction Serving System, in: Proc. NSDI, 2017, pp. 613\u2013627."},{"key":"10.1016\/j.future.2023.08.020_b6","unstructured":"A. Gujarati, R. Karimi, S. Alzayat, W. Hao, A. Kaufmann, Y. Vigfusson, J. Mace, Serving DNNs like Clockwork: Performance Predictability from the Bottom Up, in: Proc. OSDI, ISBN: 978-1-939133-19-9, 2020, pp. 443\u2013462."},{"key":"10.1016\/j.future.2023.08.020_b7","doi-asserted-by":"crossref","unstructured":"M. Song, Y. Hu, H. Chen, T. Li, Towards Pervasive and User Satisfactory CNN across GPU Microarchitectures, in: Proc. HPCA, 2017, pp. 1\u201312.","DOI":"10.1109\/HPCA.2017.52"},{"key":"10.1016\/j.future.2023.08.020_b8","series-title":"Proceedings of the 2017 ACM on Conference on Information and Knowledge Management","first-page":"2067","article-title":"QoS-aware scheduling of heterogeneous servers for inference in deep neural networks","author":"Fang","year":"2017"},{"year":"2017","series-title":"TensorFlow-serving: Flexible, high-performance ML serving","author":"Olston","key":"10.1016\/j.future.2023.08.020_b9"},{"year":"2019","series-title":"NVIDA multi-process service","key":"10.1016\/j.future.2023.08.020_b10"},{"key":"10.1016\/j.future.2023.08.020_b11","unstructured":"P. Jain, X. Mo, A. Jain, H. Subbaraj, R. Durrani, A. Tumanov, J. Gonzalez, I. Stoica, Dynamic Space-Time Scheduling for GPU Inference, in: LearningSys Workshop at Neural Information Processing Systems 2018, 2018."},{"key":"10.1016\/j.future.2023.08.020_b12","doi-asserted-by":"crossref","unstructured":"Y. Hu, S. Rallapalli, B. Ko, R. Govindan, Olympian: Scheduling GPU Usage in a Deep Neural Network Model Serving System, in: Proc. Middleware, ISBN: 9781450357029, 2018, pp. 53\u201365.","DOI":"10.1145\/3274808.3274813"},{"year":"2021","series-title":"Serving DNN models with multi-instance GPUs: A case of the reconfigurable machine scheduling problem","author":"Tan","key":"10.1016\/j.future.2023.08.020_b13"},{"key":"10.1016\/j.future.2023.08.020_b14","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep Residual Learning for Image Recognition, in: Proc. CVPR, (ISSN: 1063-6919) 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.future.2023.08.020_b15","doi-asserted-by":"crossref","unstructured":"C. Szegedy, W. Liu, Y. Jia, P. Sermanet, S. Reed, D. Anguelov, D. Erhan, V. Vanhoucke, A. Rabinovich, Going deeper with convolutions, in: Proc. CVPR, 2015, pp. 1\u20139.","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"10.1016\/j.future.2023.08.020_b16","doi-asserted-by":"crossref","unstructured":"B. Xiao, H. Wu, Y. Wei, Simple Baselines for Human Pose Estimation and Tracking, in: Proc. ECCV, ISBN: 978-3-030-01231-1, 2018, pp. 472\u2013487.","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"10.1016\/j.future.2023.08.020_b17","unstructured":"F. Romero, Q. Li, N.J. Yadwadkar, C. Kozyrakis, INFaaS: Automated Model-less Inference Serving, in: Proc. ATC, 2021, pp. 397\u2013411."},{"year":"2019","series-title":"Salus: Fine-grained GPU sharing primitives for deep learning applications","author":"Yu","key":"10.1016\/j.future.2023.08.020_b18"},{"key":"10.1016\/j.future.2023.08.020_b19","doi-asserted-by":"crossref","unstructured":"Z. Fu, J. Ren, D. Zhang, Y. Zhou, Y. Zhang, Kalmia: A Heterogeneous QoS-aware Scheduling Framework for DNN Tasks on Edge Servers, in: Proc. INFOCOM, 2022, pp. 780\u2013789.","DOI":"10.1109\/INFOCOM48880.2022.9796661"},{"issue":"1","key":"10.1016\/j.future.2023.08.020_b20","first-page":"3:1","article-title":"Scalable deep learning on distributed infrastructures: Challenges, techniques, and tools","volume":"53","author":"Mayer","year":"2020","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.future.2023.08.020_b21","unstructured":"M. Shahrad, R. Fonseca, I. Goiri, G. Chaudhry, P. Batum, J. Cooke, E. Laureano, C. Tresness, M. Russinovich, R. Bianchini, Serverless in the Wild: Characterizing and Optimizing the Serverless Workload at a Large Cloud Provider, in: Proc. ATC, ISBN: 978-1-939133-14-4, 2020, pp. 205\u2013218."},{"key":"10.1016\/j.future.2023.08.020_b22","doi-asserted-by":"crossref","unstructured":"H. Zhou, S. Bateni, C. Liu, S3DNN: Supervised Streaming and Scheduling for GPU-Accelerated Real-Time DNN Workloads, in: Proc. RTAS, 2018, pp. 190\u2013201.","DOI":"10.1109\/RTAS.2018.00028"},{"issue":"6","key":"10.1016\/j.future.2023.08.020_b23","doi-asserted-by":"crossref","first-page":"1307","DOI":"10.1109\/TPDS.2020.3047638","article-title":"E2bird: Enhanced elastic batch for improving responsiveness and throughput of deep learning services","volume":"32","author":"Cui","year":"2021","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.future.2023.08.020_b24","unstructured":"W. Cui, H. Zhao, Q. Chen, H. Wei, Z. Li, D. Zeng, C. Li, M. Guo, DVABatch: Diversity-aware multi-entry multi-exit batching for Efficient Processing of DNN Services on GPUs, in: Proc. ATC, ISBN: 978-1-939133-29-10, 2022, pp. 183\u2013198."},{"key":"10.1016\/j.future.2023.08.020_b25","doi-asserted-by":"crossref","unstructured":"H. Shen, L. Chen, Y. Jin, L. Zhao, B. Kong, M. Philipose, A. Krishnamurthy, R. Sundaram, Nexus: a GPU cluster engine for accelerating DNN-based video analysis, in: Proc. SOSP, 2019, pp. 322\u2013337.","DOI":"10.1145\/3341301.3359658"},{"issue":"1","key":"10.1016\/j.future.2023.08.020_b26","doi-asserted-by":"crossref","first-page":"46","DOI":"10.1145\/321738.321743","article-title":"Scheduling algorithms for multiprogramming in a hard-real-time environment","volume":"20","author":"Liu","year":"1973","journal-title":"J. ACM"},{"key":"10.1016\/j.future.2023.08.020_b27","unstructured":"S. Choi, S. Lee, Y. Kim, J. Park, Y. Kwon, J. Huh, Serving Heterogeneous Machine Learning Models on Multi-GPU Servers with Spatio-Temporal Sharing, in: Proc. ATC, 2022, pp. 199\u2013216."},{"year":"2020","series-title":"The ONNX model zoo","key":"10.1016\/j.future.2023.08.020_b28"},{"issue":"23","key":"10.1016\/j.future.2023.08.020_b29","first-page":"1","article-title":"GluonCV and GluonNLP: Deep learning in computer vision and natural language processing","volume":"21","author":"Guo","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"10.1016\/j.future.2023.08.020_b30","unstructured":"D. Baylor, E. Breck, H.-T. Cheng, N. Fiedel, C.Y. Foo, Z. Haque, S. Haykal, M. Ispir, V. Jain, L. Koc, C.Y. Koo, L. Lew, C. Mewald, A.N. Modi, N. Polyzotis, S. Ramesh, S. Roy, S.E. Whang, M. Wicke, J. Wilkiewicz, X. Zhang, M. Zinkevich, TFX: A TensorFlow-Based Production-Scale Machine Learning Platform, in: Proc. SIGKDD, ISBN: 9781450348874, 2017, pp. 1387\u20131395."},{"year":"2018","series-title":"InferLine: ML inference pipeline composition framework","author":"Crankshaw","key":"10.1016\/j.future.2023.08.020_b31"},{"key":"10.1016\/j.future.2023.08.020_b32","doi-asserted-by":"crossref","unstructured":"J. Fang, Y. Yu, C. Zhao, J. Zhou, TurboTransformers: An Efficient GPU Serving System for Transformer Models, in: Proc. PPoPP, ISBN: 9781450382946, 2021, pp. 389\u2013402.","DOI":"10.1145\/3437801.3441578"},{"year":"2022","series-title":"Deep learning workload scheduling in GPU datacenters: Taxonomy, challenges and vision","author":"Gao","key":"10.1016\/j.future.2023.08.020_b33"},{"key":"10.1016\/j.future.2023.08.020_b34","unstructured":"A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga, A. Desmaison, A. K\u00f6pf, E.Z. Yang, Z. DeVito, M. Raison, A. Tejani, S. Chilamkurthy, B. Steiner, L. Fang, J. Bai, S. Chintala, PyTorch: An Imperative Style, High-Performance Deep Learning Library, in: Proc. NeurIPS, 2019, pp. 8024\u20138035."},{"year":"2016","series-title":"TensorFlow: Large-scale machine learning on heterogeneous distributed systems","author":"Abadi","key":"10.1016\/j.future.2023.08.020_b35"},{"key":"10.1016\/j.future.2023.08.020_b36","doi-asserted-by":"crossref","unstructured":"Z. Liu, J. Leng, Z. Zhang, Q. Chen, C. Li, M. Guo, VELTAIR: Towards High-Performance Multi-Tenant Deep Learning Services via Adaptive Compilation and Scheduling, in: Proc. ASPLOS, 2022, pp. 388\u2013401.","DOI":"10.1145\/3503222.3507752"},{"key":"10.1016\/j.future.2023.08.020_b37","unstructured":"C. Lv, C. Niu, R. Gu, X. Jiang, Z. Wang, B. Liu, Z. Wu, Q. Yao, C. Huang, P. Huang, T. Huang, H. Shu, J. Song, B. Zou, P. Lan, G. Xu, F. Wu, S. Tang, F. Wu, G. Chen, Walle: An End-to-End, General-Purpose, and Large-Scale Production System for Device-Cloud Collaborative Machine Learning, in: Proc. OSDI, 2022, pp. 249\u2013265."},{"key":"10.1016\/j.future.2023.08.020_b38","unstructured":"Z. Bai, Z. Zhang, Y. Zhu, X. Jin, PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications, in: Proc. OSDI, ISBN: 978-1-939133-19-9, 2020, pp. 499\u2013514."},{"key":"10.1016\/j.future.2023.08.020_b39","unstructured":"H. Zhao, Z. Han, Z. Yang, Q. Zhang, F. Yang, L. Zhou, M. Yang, F.C. Lau, Y. Wang, Y. Xiong, B. Wang, HiveD: Sharing a GPU Cluster for Deep Learning with Guarantees, in: Proc. OSDI, ISBN: 978-1-939133-19-9, 2020, pp. 515\u2013532."},{"key":"10.1016\/j.future.2023.08.020_b40","doi-asserted-by":"crossref","unstructured":"C. Lu, J.A. Stankovic, G. Tao, S.H. Son, Design and evaluation of a feedback control EDF scheduling algorithm, in: Proc. RTSS, 1999, pp. 56\u201367.","DOI":"10.1109\/REAL.1999.818828"},{"issue":"1","key":"10.1016\/j.future.2023.08.020_b41","doi-asserted-by":"crossref","first-page":"39","DOI":"10.1023\/B:TIME.0000033378.56741.14","article-title":"Utilization bounds for EDF scheduling on real-time multiprocessor systems","volume":"28","author":"L\u00f3pez","year":"2004","journal-title":"Real-Time Syst."},{"key":"10.1016\/j.future.2023.08.020_b42","unstructured":"S.A. Brandt, S. Banachowski, C. Lin, T. Bisson, Dynamic Integrated Scheduling of Hard Real-Time, Soft Real-Time and Non-Real-Time Processes, in: Proc. RTSS, ISBN: 0769520448, 2003, p. 396."}],"container-title":["Future Generation Computer Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X23003187?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X23003187?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T09:40:17Z","timestamp":1729935617000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167739X23003187"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1]]},"references-count":42,"alternative-id":["S0167739X23003187"],"URL":"https:\/\/doi.org\/10.1016\/j.future.2023.08.020","relation":{},"ISSN":["0167-739X"],"issn-type":[{"type":"print","value":"0167-739X"}],"subject":[],"published":{"date-parts":[[2024,1]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"InferFair: Towards QoS-aware scheduling for performance isolation guarantee in heterogeneous model serving systems","name":"articletitle","label":"Article Title"},{"value":"Future Generation Computer Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.future.2023.08.020","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2023 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}