{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T18:32:28Z","timestamp":1724524348039},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,3,1]],"date-time":"2022-03-01T00:00:00Z","timestamp":1646092800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100012542","name":"Sichuan Province Science and Technology Support Program","doi-asserted-by":"publisher","award":["2020YFH0037"],"id":[{"id":"10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["52079026","61976044"],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004806","name":"Fok Ying Tong Education Foundation","doi-asserted-by":"publisher","award":["161062"],"id":[{"id":"10.13039\/501100004806","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["ZYGX2019Z014"],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013804","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013804","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2022,3]]},"DOI":"10.1016\/j.knosys.2021.108075","type":"journal-article","created":{"date-parts":[[2022,1,5]],"date-time":"2022-01-05T16:54:40Z","timestamp":1641401680000},"page":"108075","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":7,"special_numbering":"C","title":["Mixhead: Breaking the low-rank bottleneck in multi-head attention language 
models"],"prefix":"10.1016","volume":"240","author":[{"ORCID":"http:\/\/orcid.org\/0000-0003-1349-9755","authenticated-orcid":false,"given":"Zhong","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-6260-3005","authenticated-orcid":false,"given":"Nian","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Chongming","family":"Gao","sequence":"additional","affiliation":[]},{"given":"Rui","family":"Miao","sequence":"additional","affiliation":[]},{"given":"Qinli","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0002-6022-428X","authenticated-orcid":false,"given":"Junming","family":"Shao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2021.108075_b1","series-title":"Advances In Neural Information Processing Systems, Vol. 30","article-title":"Attention is all you need","author":"Vaswani","year":"2017"},{"key":"10.1016\/j.knosys.2021.108075_b2","doi-asserted-by":"crossref","unstructured":"Z. Dai, Z. Yang, Y. Yang, J. Carbonell, Q. Le, R. Salakhutdinov, Transformer-XL: Attentive Language Models beyond a Fixed-Length Context, in: Proceedings Of The 57th Annual Meeting Of The Association For Computational Linguistics, 2019, pp. 2978\u20132988.","DOI":"10.18653\/v1\/P19-1285"},{"key":"10.1016\/j.knosys.2021.108075_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2020.106321","article-title":"Relation classification via knowledge graph enhanced transformer encoder","volume":"206","author":"Huang","year":"2020","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2021.108075_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2020.105964","article-title":"Biomedical-domain pre-trained language model for extractive summarization","volume":"199","author":"Du","year":"2020","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2021.108075_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2021.106936","article-title":"Enhancing transformer-based language models with commonsense representations for knowledge-driven machine comprehension","volume":"220","author":"Li","year":"2021","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2021.108075_b6","article-title":"DAM: Transformer-based relation detection for question answering over knowledge base","volume":"201\u2013202","author":"Chen","year":"2020","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.knosys.2021.108075_b7","unstructured":"M. Chen, A. Radford, R. Child, J. Wu, H. Jun, P. Dhariwal, D. Luan, I. Sutskever, Generative Pretraining from Pixels, in: Proceedings Of The 37th International Conference On Machine Learning, 119, 2020, pp. 1691\u20131703."},{"key":"10.1016\/j.knosys.2021.108075_b8","doi-asserted-by":"crossref","unstructured":"F. Sun, J. Liu, J. Wu, C. Pei, X. Lin, W. Ou, P. Jiang, BERT4Rec: Sequential recommendation with bidirectional encoder representations from transformer, in: Proceedings Of The 28th ACM International Conference On Information And Knowledge Management, 2019, pp. 1441\u20131450.","DOI":"10.1145\/3357384.3357895"},{"key":"10.1016\/j.knosys.2021.108075_b9","series-title":"Advances In Neural Information Processing Systems, Vol. 32","first-page":"11983","article-title":"Graph transformer networks","author":"Yun","year":"2019"},{"key":"10.1016\/j.knosys.2021.108075_b10","unstructured":"A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, J. 
Uszkoreit, N. Houlsby, An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale, in: International Conference On Learning Representations, 2021."},{"key":"10.1016\/j.knosys.2021.108075_b11","doi-asserted-by":"crossref","unstructured":"N. Carion, F. Massa, G. Synnaeve, N. Usunier, A. Kirillov, S. Zagoruyko, End-to-end object detection with transformers, in: European Conference On Computer Vision, 2020, pp. 213\u2013229.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"10.1016\/j.knosys.2021.108075_b12","doi-asserted-by":"crossref","unstructured":"H. Wang, Y. Zhu, B. Green, H. Adam, A. Yuille, L.-C. Chen, Axial-deeplab: Stand-alone axial-attention for panoptic segmentation, in: European Conference On Computer Vision, 2020, pp. 108\u2013126.","DOI":"10.1007\/978-3-030-58548-8_7"},{"key":"10.1016\/j.knosys.2021.108075_b13","unstructured":"S. Bhojanapalli, C. Yun, A.S. Rawat, S.J. Reddi, S. Kumar, Low-Rank Bottleneck in Multi-head Attention Models, in: Proceedings Of The 37th International Conference On Machine Learning, 119, 2020, pp. 864\u2013873."},{"key":"10.1016\/j.knosys.2021.108075_b14","unstructured":"Z. Yang, Z. Dai, R. Salakhutdinov, W.W. Cohen, Breaking the Softmax Bottleneck: A High-Rank RNN Language Model, in: International Conference On Learning Representations, 2018."},{"key":"10.1016\/j.knosys.2021.108075_b15","doi-asserted-by":"crossref","unstructured":"K. He, X. Zhang, S. Ren, J. Sun, Deep residual learning for image recognition, in: Proceedings Of The IEEE Conference On Computer Vision And Pattern Recognition, 2016, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"10.1016\/j.knosys.2021.108075_b16","series-title":"Layer normalization","author":"Ba","year":"2016"},{"key":"10.1016\/j.knosys.2021.108075_b17","unstructured":"A. Baevski, M. Auli, Adaptive Input Representations for Neural Language Modeling, in: International Conference On Learning Representations, 2019."},{"key":"10.1016\/j.knosys.2021.108075_b18","unstructured":"S. Merity, C. Xiong, J. Bradbury, R. Socher, Pointer Sentinel Mixture Models, in: Proceedings Of The 5th International Conference On Learning Representations, 2017."},{"key":"10.1016\/j.knosys.2021.108075_b19","doi-asserted-by":"crossref","unstructured":"K. Clark, U. Khandelwal, O. Levy, C.D. Manning, What Does BERT Look at? An Analysis of BERT\u2019s Attention, in: Proceedings Of The 2019 ACL Workshop BlackboxNLP: Analyzing And Interpreting Neural Networks For NLP, 2019, pp. 276\u2013286.","DOI":"10.18653\/v1\/W19-4828"},{"key":"10.1016\/j.knosys.2021.108075_b20","doi-asserted-by":"crossref","unstructured":"J. Li, Z. Tu, B. Yang, M.R. Lyu, T. Zhang, Multi-Head Attention with Disagreement Regularization, in: Proceedings Of The 2018 Conference On Empirical Methods In Natural Language Processing, 2018, pp. 2897\u20132903.","DOI":"10.18653\/v1\/D18-1317"},{"key":"10.1016\/j.knosys.2021.108075_b21","series-title":"Bounds for the Rank of the Sum of Two Matrices","author":"Marsaglia","year":"1964"},{"key":"10.1016\/j.knosys.2021.108075_b22","doi-asserted-by":"crossref","unstructured":"C. Chelba, T. Mikolov, M. Schuster, Q. Ge, T. Brants, P. Koehn, T. Robinson, One billion word benchmark for measuring progress in statistical language modeling, in: Proceedings Of The 15th Annual Conference Of The International Speech Communication Association, 2014, pp. 
2635\u20132639.","DOI":"10.21437\/Interspeech.2014-564"},{"key":"10.1016\/j.knosys.2021.108075_b23","series-title":"Synthesizer: Rethinking self-attention in transformer models","author":"Tay","year":"2020"},{"key":"10.1016\/j.knosys.2021.108075_b24","doi-asserted-by":"crossref","unstructured":"M. Ott, S. Edunov, A. Baevski, A. Fan, S. Gross, N. Ng, D. Grangier, M. Auli, fairseq: A Fast, Extensible Toolkit for Sequence Modeling, in: Proceedings Of The 2019 Conference Of The North American Chapter Of The Association For Computational Linguistics, 2019, pp. 48\u201353.","DOI":"10.18653\/v1\/N19-4009"},{"key":"10.1016\/j.knosys.2021.108075_b25","series-title":"Advances In Neural Information Processing Systems, Vol. 31","first-page":"286","article-title":"Sigsoftmax: Reanalysis of the softmax bottleneck","author":"Kanai","year":"2018"},{"key":"10.1016\/j.knosys.2021.108075_b26","unstructured":"D. Bahdanau, K. Cho, Y. Bengio, Neural Machine Translation by Jointly Learning to Align and Translate, in: International Conference On Learning Representations, 2015."},{"key":"10.1016\/j.knosys.2021.108075_b27","unstructured":"Y. Cheng, S. Shen, Z. He, W. He, H. Wu, M. Sun, Y. Liu, Agreement-Based Joint Training for Bidirectional Attention-Based Neural Machine Translation, in: Proceedings Of The 25th International Joint Conference On Artificial Intelligence, 2016, pp. 2761\u20132767."},{"key":"10.1016\/j.knosys.2021.108075_b28","unstructured":"M. Cettolo, J. Niehues, S. St\u00fcker, L. Bentivogli, M. Federico, Report on the 11th IWSLT evaluation campaign, IWSLT 2014, in: Proceedings Of The International Workshop On Spoken Language Translation, 57, 2014."},{"key":"10.1016\/j.knosys.2021.108075_b29","doi-asserted-by":"crossref","unstructured":"T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T.L. Scao, S. Gugger, M. Drame, Q. Lhoest, A.M. Rush, Transformers: State-of-the-Art Natural Language Processing, in: Proceedings Of The 2020 Conference On Empirical Methods In Natural Language Processing, 2020, pp. 38\u201345.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"10.1016\/j.knosys.2021.108075_b30","doi-asserted-by":"crossref","unstructured":"A. Wang, A. Singh, J. Michael, F. Hill, O. Levy, S. Bowman, GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding, in: Proceedings Of The 2018 EMNLP Workshop BlackboxNLP: Analyzing And Interpreting Neural Networks For NLP, 2018, pp. 353\u2013355.","DOI":"10.18653\/v1\/W18-5446"},{"key":"10.1016\/j.knosys.2021.108075_b31","doi-asserted-by":"crossref","unstructured":"P. Rajpurkar, J. Zhang, K. Lopyrev, P. Liang, SQuAD: 100,000+ Questions for Machine Comprehension of Text, in: Proceedings Of The 2016 Conference On Empirical Methods In Natural Language Processing, 2016, pp. 2383\u20132392.","DOI":"10.18653\/v1\/D16-1264"},{"key":"10.1016\/j.knosys.2021.108075_b32","series-title":"Advances In Neural Information Processing Systems, Vol. 33","first-page":"17283","article-title":"Big bird: Transformers for longer sequences","author":"Zaheer","year":"2020"},{"key":"10.1016\/j.knosys.2021.108075_b33","series-title":"Advances In Neural Information Processing Systems, Vol. 33","first-page":"13783","article-title":"O(n) connections are expressive enough: Universal approximability of sparse transformers","author":"Yun","year":"2020"},{"key":"10.1016\/j.knosys.2021.108075_b34","unstructured":"J. Devlin, M.-W. Chang, K. 
Lee, K. Toutanova, BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, in: Proceedings Of The 2019 Conference Of The North American Chapter Of The Association For Computational Linguistics: Human Language Technologies, 2019, pp. 4171\u20134186."},{"key":"10.1016\/j.knosys.2021.108075_b35","series-title":"Improving Language Understanding by Generative Pre-Training","author":"Radford","year":"2018"},{"key":"10.1016\/j.knosys.2021.108075_b36","doi-asserted-by":"crossref","unstructured":"Y. Wang, H.-Y. Lee, Y.-N. Chen, Tree Transformer: Integrating Tree Structures into Self-Attention, in: Proceedings Of The 2019 Conference On Empirical Methods In Natural Language Processing And The 9th International Joint Conference On Natural Language Processing (EMNLP-IJCNLP), 2019, pp. 1060\u20131070.","DOI":"10.18653\/v1\/D19-1098"},{"key":"10.1016\/j.knosys.2021.108075_b37","series-title":"Generating long sequences with sparse transformers","author":"Child","year":"2019"},{"key":"10.1016\/j.knosys.2021.108075_b38","series-title":"Longformer: The long-document transformer","author":"Beltagy","year":"2020"},{"key":"10.1016\/j.knosys.2021.108075_b39","unstructured":"N. Kitaev, L. Kaiser, A. Levskaya, Reformer: The Efficient Transformer, in: International Conference On Learning Representations, 2020."},{"key":"10.1016\/j.knosys.2021.108075_b40","doi-asserted-by":"crossref","unstructured":"S. Takase, J. Suzuki, M. Nagata, Direct Output Connection for a High-Rank Language Model, in: Proceedings Of The 2018 Conference On Empirical Methods In Natural Language Processing, 2018, pp. 4599\u20134609.","DOI":"10.18653\/v1\/D18-1489"},{"key":"10.1016\/j.knosys.2021.108075_b41","series-title":"Advances In Neural Information Processing Systems, Vol. 31","first-page":"7739","article-title":"Breaking the activation function bottleneck through adaptive parameterization","author":"Flennerhag","year":"2018"},{"key":"10.1016\/j.knosys.2021.108075_b42","series-title":"Advances In Neural Information Processing Systems, Vol. 32","first-page":"5775","article-title":"Mixtape: Breaking the softmax bottleneck efficiently","author":"Yang","year":"2019"},{"key":"10.1016\/j.knosys.2021.108075_b43","unstructured":"O. Ganea, S. Gelly, G. B\u00e9cigneul, A. Severyn, Breaking the Softmax Bottleneck via Learnable Monotonic Pointwise Non-linearities, in: Proceedings Of The 36th International Conference On Machine Learning, 97, 2019, pp. 2073\u20132082."},{"key":"10.1016\/j.knosys.2021.108075_b44","series-title":"Linformer: Self-attention with linear complexity","author":"Wang","year":"2020"},{"issue":"1","key":"10.1016\/j.knosys.2021.108075_b45","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","article-title":"Adaptive mixtures of local experts","volume":"3","author":"Jacobs","year":"1991","journal-title":"Neural Comput."},{"issue":"8","key":"10.1016\/j.knosys.2021.108075_b46","doi-asserted-by":"crossref","first-page":"1177","DOI":"10.1109\/TNNLS.2012.2200299","article-title":"Twenty years of mixture of experts","volume":"23","author":"Yuksel","year":"2012","journal-title":"IEEE Trans. Neural Netw. Learn. 
Syst."},{"key":"10.1016\/j.knosys.2021.108075_b47","series-title":"Weighted transformer network for machine translation","author":"Ahmed","year":"2017"},{"key":"10.1016\/j.knosys.2021.108075_b48","series-title":"Multi-branch attentive transformer","author":"Fan","year":"2020"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705121011503?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705121011503?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2023,3,11]],"date-time":"2023-03-11T16:33:40Z","timestamp":1678552420000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705121011503"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,3]]},"references-count":48,"alternative-id":["S0950705121011503"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2021.108075","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2022,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Mixhead: Breaking the low-rank bottleneck in multi-head attention language models","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2021.108075","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2022 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"108075"}}