{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T05:15:34Z","timestamp":1726031734708},"reference-count":52,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T00:00:00Z","timestamp":1709337600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T00:00:00Z","timestamp":1709337600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation (SRC)","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000185","name":"DARPA","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,3,2]]},"DOI":"10.1109\/hpca57654.2024.00078","type":"proceedings-article","created":{"date-parts":[[2024,4,2]],"date-time":"2024-04-02T18:36:37Z","timestamp":1712082997000},"source":"Crossref","is-referenced-by-count":2,"title":["An LPDDR-based CXL-PNM Platform for TCO-efficient Inference of Transformer-based Large Language Models"],"prefix":"10.1109","author":[{"given":"Sang-Soo","family":"Park","sequence":"first","affiliation":[{"name":"Samsung Electronics"}]},{"given":"KyungSoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jinin","family":"So","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jin","family":"Jung","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jonggeon","family":"Lee","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Kyoungwan","family":"Woo","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Nayeon","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Younghyun","family":"Lee","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Hyungyo","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Yongsuk","family":"Kwon","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jinhyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jieun","family":"Lee","sequence":"additional","affiliation":[{"name":"Seoul National University"}]},{"given":"YeonGon","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Yongmin","family":"Tai","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jeonghyeon","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Hoyoung","family":"Song","sequence":"additional","affiliation":[{"name":"Samsung Electronics"}]},{"given":"Jung Ho","family":"Ahn","sequence":"additional","affiliation":[{"name":"Seoul National University"}]},{"given":"Nam Sung","family":"Kim","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/sc41404.2022.00051"},{"key":"ref2","volume-title":"ARM, \u201cAMBA AHB Protocol Specification","year":"2023"},{"key":"ref3","volume-title":"ARM, \u201cAMBA AXI Protocol Specification","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783753"},{"key":"ref5","volume-title":"Compute Express Link (CXL): Memory and Cache Protocols","author":"Blankenship","year":"2020"},{"key":"ref6","volume-title":"Distributed Inference and Fine-tuning of Large Language Models Over The Internet","author":"Borzunov","year":"2023"},{"key":"ref7","article-title":"Language Models are Few-Shot Learners","volume-title":"NeurIPS","author":"Brown","year":"2020"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-5602"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476146"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2023.3305386"},{"key":"ref11","volume-title":"CXL Consortium, \u201cCompute Express Link (CXL)","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"ref13","volume-title":"ENERGYSAGE, \u201cHow much does electricity cost by state?"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056040"},{"key":"ref15","article-title":"Scalable Transformers for Neural Machine Translation","author":"Gao","year":"2021","journal-title":"arXiv"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2021.3117150"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00060"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00051"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/2950067.2950086"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3564606"},{"key":"ref22","volume-title":"Intel Agilex\u2122 7 FPGA I-Series Development Kit","author":"Corporation","year":"2023"},{"key":"ref23","volume-title":"JEDEC, \u201cDDR5 SDRAM","year":"2022"},{"key":"ref24","volume-title":"JEDEC, \u201cGRAPHICS DOUBLE DATA RATE (GDDR6) SGRAM STANDARD","year":"2023"},{"key":"ref25","volume-title":"JEDEC, \u201cHigh Bandwidth Memory DRAM (HBM3)","year":"2023"},{"key":"ref26","volume-title":"JEDEC, \u201cLow Power Double Data Rate (LPDDR) 5\/5X","year":"2023"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3097700"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC42613.2021.9365862"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.46506\/jica.2021.2.1.043"},{"key":"ref30","volume-title":"Meta, \u201cOPT","year":"2023"},{"key":"ref31","volume-title":"NVIDIA, \u201cNVIDIA H100 Tensor Core GPU"},{"key":"ref32","volume-title":"NVIDIA, \u201cNVIDIA DGX A100 System Architecture","year":"2020"},{"key":"ref33","volume-title":"NVIDIA, \u201cFasterTransformer","year":"2023"},{"key":"ref34","volume-title":"OpenAI, \u201cOpenAI: Introducing ChatGPT","year":"2023"},{"key":"ref35","article-title":"LUT-GEMM: Quantized Matrix Multiplication based on LUTs for Efficient Inference in Large-Scale Generative Language Models","author":"Park","year":"2023","journal-title":"arXiv"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480080"},{"key":"ref37","volume-title":"Micron \u2013Investor Day","author":"Patodia","year":"2023"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref39","volume-title":"Samsung, \u201cSamsung Develops Industrys First CXL DRAM Supporting CXL 2.0","year":"2023"},{"key":"ref40","volume-title":"Error Correction Code (ECC) in DDR Memories","author":"Sankaranarayanan","year":"2023"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/dac56929.2023.10247726"},{"key":"ref42","article-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer","author":"Shazeer","year":"2017","journal-title":"arXiv"},{"key":"ref43","article-title":"FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU","author":"Sheng","year":"2023","journal-title":"arXiv"},{"key":"ref44","article-title":"Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv"},{"key":"ref45","volume-title":"SK hynix Introduces Industrys First CXL-based Computational Memory Solution (CMS) at the OCP Global Summit","author":"SK hynix","year":"2022"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/EPTC.2011.6184382"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3613424.3614256"},{"key":"ref48","article-title":"Attention is All you Need","volume-title":"NeurIPS","author":"Vaswani","year":"2017"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-industry.15"},{"key":"ref51","volume-title":"Xilinx, \u201cVirtex UltraScale+"},{"key":"ref52","article-title":"OPT: Open Pre-trained Transformer Language Models","author":"Zhang","year":"2022","journal-title":"arXiv"}],"event":{"name":"2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA)","location":"Edinburgh, United Kingdom","start":{"date-parts":[[2024,3,2]]},"end":{"date-parts":[[2024,3,6]]}},"container-title":["2024 IEEE International Symposium on High-Performance Computer Architecture (HPCA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10476359\/10476395\/10476443.pdf?arnumber=10476443","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,3]],"date-time":"2024-04-03T05:31:10Z","timestamp":1712122270000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10476443\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,2]]},"references-count":52,"URL":"https:\/\/doi.org\/10.1109\/hpca57654.2024.00078","relation":{},"subject":[],"published":{"date-parts":[[2024,3,2]]}}}