{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,8]],"date-time":"2024-09-08T12:19:04Z","timestamp":1725797944839},"reference-count":41,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"FONDECYT","award":["11180881"]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2021,1,1]]},"DOI":"10.1109\/tpds.2020.3011893","type":"journal-article","created":{"date-parts":[[2020,7,24]],"date-time":"2020-07-24T20:12:23Z","timestamp":1595621543000},"page":"72-84","source":"Crossref","is-referenced-by-count":27,"title":["GPU Tensor Cores for Fast Arithmetic Reductions"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"http:\/\/orcid.org\/0000-0001-7090-9904","authenticated-orcid":false,"given":"Cristobal A.","family":"Navarro","sequence":"first","affiliation":[]},{"given":"Roberto","family":"Carrasco","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5345-7061","authenticated-orcid":false,"given":"Ricardo J.","family":"Barrientos","sequence":"additional","affiliation":[]},{"ORCID":"http:\/\/orcid.org\/0000-0001-5389-2207","authenticated-orcid":false,"given":"Javier A.","family":"Riquelme","sequence":"additional","affiliation":[]},{"given":"Raimundo","family":"Vega","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1103\/RevModPhys.55.601"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0016236"},{"key":"ref33","first-page":"1","article-title":"Nvidia CUDA C programming guide","year":"2020","journal-title":"NVIDIA Corporation"},{"key":"ref32","article-title":"Cuda unbound (CUB) library","year":"2015","journal-title":"NVIDIA-Labs"},{"key":"ref31","article-title":"A100 Tensor Core GPU Architecture Whitepaper","year":"2020"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/1401132.1401152"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.102"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2014.09.003"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2009.43"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2014.6853195"},{"key":"ref10","first-page":"73","article-title":"Exploration of low numeric precision deep learning inference using Intel® FPGAs","author":"colangelo","year":"2018","journal-title":"Proc IEEE 26th Annu Int Symp Field-Programmable Custom Comput Mach"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021741"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3331057"},{"key":"ref12","first-page":"10","article-title":"MapReduce: Simplified data processing on large clusters","author":"dean","year":"2004","journal-title":"Proc 6th Conf Symp Operating Syst Des Implementation"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/RT.2007.4342598"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3148226.3148237"},{"key":"ref15","first-page":"447","article-title":"A comparative study on ASIC, FPGAs, GPUs and general purpose processors in the $o(n^2)$o(n2) gravitational N-body simulation","author":"hamada","year":"2009","journal-title":"Proc NASA\/ESA Conf Adaptive Hardware Syst"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/1198555.1198768"},{"key":"ref17","first-page":"1","article-title":"Optimizing CUDA","author":"harris","year":"2007","journal-title":"Proc Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1088\/0031-9155\/56\/22\/002"},{"key":"ref19","article-title":"Dissecting the NVidia turing T4 GPU via microbenchmarking","author":"jia","year":"2019"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.4208\/cicp.110113.010813a"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1002\/widm.1232"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2849705"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1145\/321812.321815"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2018.04.006"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2016.04.007"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/SCCC.2018.8705253"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073601"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2018.02.010"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2011.12.024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/1250790.1250877"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24595-9_12"},{"key":"ref20","first-page":"1","article-title":"Dissecting the NVIDIA volta GPU architecture via microbenchmarking","author":"jia","year":"2018","journal-title":"CoRR"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3140659.3080246"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.032271057"},{"key":"ref24","doi-asserted-by":"crossref","first-page":"436","DOI":"10.1038\/nature14539","article-title":"Deep learning","volume":"521","author":"lecun","year":"2015","journal-title":"Nature"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-007-0191-y"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3177754"},{"key":"ref26","first-page":"444","article-title":"Benchmarking the NVIDIA V100 GPU and tensor cores","author":"martineau","year":"2018","journal-title":"Proc Eur Conf Parallel Process"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2018.00091"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9152195\/09147055.pdf?arnumber=9147055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:50:28Z","timestamp":1652194228000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9147055\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,1,1]]},"references-count":41,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2020.3011893","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,1,1]]}}}