{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T13:49:40Z","timestamp":1730209780652,"version":"3.28.0"},"reference-count":28,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T00:00:00Z","timestamp":1719878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,2]],"date-time":"2024-07-02T00:00:00Z","timestamp":1719878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,2]]},"DOI":"10.1109\/compsac61105.2024.00066","type":"proceedings-article","created":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T17:23:31Z","timestamp":1724693011000},"page":"429-438","source":"Crossref","is-referenced-by-count":0,"title":["Effective Compression of Language Models by Combining Pruning and Knowledge Distillation"],"prefix":"10.1109","author":[{"given":"Chi-Yu","family":"Chiu","sequence":"first","affiliation":[{"name":"Graduate Institute of Networking and Multimedia, National Taiwan University,Taipei,Taiwan"}]},{"given":"Ding-Yong","family":"Hong","sequence":"additional","affiliation":[{"name":"Institute of Information Science, Academia Sinica,Taipei,Taiwan"}]},{"given":"Pangfeng","family":"Liu","sequence":"additional","affiliation":[{"name":"Graduate Institute of Networking and Multimedia, National Taiwan University,Department of Computer Science and Information Engineering,Taipei,Taiwan"}]},{"given":"Jan-Jan","family":"Wu","sequence":"additional","affiliation":[{"name":"Institute of Information Science, Academia Sinica,Taipei,Taiwan"}]}],"member":"263","reference":[{"key":"ref1","first-page":"5998","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems","volume":"30","author":"Vaswani","year":"2017"},{"first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Devlin","key":"ref2"},{"issue":"9","key":"ref3","first-page":"1","article-title":"Distilling the Knowledge in a Neural Network","volume":"1","author":"Hinton","year":"2015","journal-title":"NIPS 2014 Deep Learning Workshop"},{"issue":"10","key":"ref4","first-page":"1","article-title":"Compressing deep convolutional networks using vector quantization","volume":"abs\/1412.6115","author":"Gong","year":"2014","journal-title":"CoRR"},{"volume-title":"Nvidia ampere ga102 gpu architecture","year":"2020","key":"ref5"},{"journal-title":"arXiv preprint","article-title":"Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter","year":"2019","author":"Sanh","key":"ref6"},{"issue":"8","key":"ref7","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"volume-title":"The state of sparsity in deep neural networks","year":"2019","author":"Gale","key":"ref8"},{"key":"ref9","first-page":"20378","article-title":"Movement pruning: Adaptive sparsity by fine-tuning","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Sanh","year":"2020"},{"first-page":"4163","article-title":"The optimal BERT surgeon: Scalable and accurate second-order pruning for large language models","volume-title":"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing","author":"Kurtic","key":"ref10"},{"key":"ref11","article-title":"Are sixteen heads really better than one?","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Michel","year":"2019"},{"key":"ref12","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","volume-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","author":"Papineni","year":"2002"},{"key":"ref13","doi-asserted-by":"crossref","first-page":"5797","DOI":"10.18653\/v1\/P19-1580","article-title":"Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","author":"Voita","year":"2019"},{"article-title":"Reducing transformer depth on demand with structured dropout","volume-title":"International Conference on Learning Representations","author":"Fan","key":"ref14"},{"article-title":"Multilingual neural machine translation with knowledge distillation","volume-title":"International Conference on Learning Representations","author":"Tan","key":"ref15"},{"key":"ref16","doi-asserted-by":"crossref","first-page":"4323","DOI":"10.18653\/v1\/D19-1441","article-title":"Patient knowledge distillation for BERT model compression","volume-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)","author":"Sun","year":"2019"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00356"},{"key":"ref19","doi-asserted-by":"crossref","first-page":"2383","DOI":"10.18653\/v1\/D16-1264","article-title":"SQuAD: 100,000+ questions for machine comprehension of text","volume-title":"Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing","author":"Rajpurkar","year":"2016"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"784","DOI":"10.18653\/v1\/P18-2124","article-title":"Know what you dont know: Unanswerable questions for SQuAD","volume-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)","author":"Rajpurkar","year":"2018"},{"key":"ref21","doi-asserted-by":"crossref","first-page":"353","DOI":"10.18653\/v1\/W18-5446","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding","volume-title":"Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP","author":"Wang","year":"2018"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"ref23","first-page":"8024","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume-title":"Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8\u201314, 2019, Vancouver, BC, Canada","author":"Paszke","year":"2019"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.5555\/3291168.3291211"},{"volume-title":"Improving language understanding by generative pre-training","year":"2018","author":"Radford","key":"ref25"},{"volume-title":"Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers","year":"2020","author":"Wang","key":"ref26"},{"volume-title":"Picking winning tickets before training by preserving gradient flow","year":"2020","author":"Wang","key":"ref27"},{"key":"ref28","first-page":"20721","article-title":"Dominosearch: Find layer-wise fine-grained n: M sparse schemes from dense neural networks","volume":"34","author":"Sun","year":"2021","journal-title":"Advances in neural information processing systems"}],"event":{"name":"2024 IEEE 48th Annual Computers, Software, and Applications Conference (COMPSAC)","start":{"date-parts":[[2024,7,2]]},"location":"Osaka, Japan","end":{"date-parts":[[2024,7,4]]}},"container-title":["2024 IEEE 48th Annual Computers, Software, and Applications Conference (COMPSAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10633276\/10633246\/10633390.pdf?arnumber=10633390","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,3]],"date-time":"2024-09-03T04:54:46Z","timestamp":1725339286000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10633390\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,2]]},"references-count":28,"URL":"https:\/\/doi.org\/10.1109\/compsac61105.2024.00066","relation":{},"subject":[],"published":{"date-parts":[[2024,7,2]]}}}