{"id":"https://openalex.org/W4281618649","doi":"https://doi.org/10.1109/fccm53951.2022.9786164","title":"A Generator of Numerically-Tailored and High-Throughput Accelerators for Batched GEMMs","display_name":"A Generator of Numerically-Tailored and High-Throughput Accelerators for Batched GEMMs","publication_year":2022,"publication_date":"2022-05-15","ids":{"openalex":"https://openalex.org/W4281618649","doi":"https://doi.org/10.1109/fccm53951.2022.9786164"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm53951.2022.9786164","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://hal.science/hal-04103774/document","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045922707","display_name":"Louis V. Ledoux","orcid":null},"institutions":[{"id":"https://openalex.org/I2799803557","display_name":"Barcelona Supercomputing Center","ror":"https://ror.org/05sd8tv96","country_code":"ES","type":"funder","lineage":["https://openalex.org/I2799803557","https://openalex.org/I9617848"]},{"id":"https://openalex.org/I9617848","display_name":"Universitat Polit\u00e8cnica de Catalunya","ror":"https://ror.org/03mb6wj31","country_code":"ES","type":"funder","lineage":["https://openalex.org/I9617848"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Louis Ledoux","raw_affiliation_strings":["Universitat Politecnica de Catalunya (UPC),Barcelona Supercomputing Center (BSC),Barcelona,Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Politecnica de Catalunya (UPC),Barcelona Supercomputing Center (BSC),Barcelona,Spain","institution_ids":["https://openalex.org/I2799803557","https://openalex.org/I9617848"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5044183257","display_name":"Marc Casas","orcid":"https://orcid.org/0000-0003-4564-2093"},"institutions":[{"id":"https://openalex.org/I2799803557","display_name":"Barcelona Supercomputing Center","ror":"https://ror.org/05sd8tv96","country_code":"ES","type":"funder","lineage":["https://openalex.org/I2799803557","https://openalex.org/I9617848"]},{"id":"https://openalex.org/I9617848","display_name":"Universitat Polit\u00e8cnica de Catalunya","ror":"https://ror.org/03mb6wj31","country_code":"ES","type":"funder","lineage":["https://openalex.org/I9617848"]}],"countries":["ES"],"is_corresponding":false,"raw_author_name":"Marc Casas","raw_affiliation_strings":["Universitat Politecnica de Catalunya (UPC),Barcelona Supercomputing Center (BSC),Barcelona,Spain"],"affiliations":[{"raw_affiliation_string":"Universitat Politecnica de Catalunya (UPC),Barcelona Supercomputing Center (BSC),Barcelona,Spain","institution_ids":["https://openalex.org/I2799803557","https://openalex.org/I9617848"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":1.45,"has_fulltext":true,"fulltext_origin":"pdf","cited_by_count":5,"citation_normalized_percentile":{"value":0.606347,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":82,"max":84},"biblio":{"volume":null,"issue":null,"first_page":"1","last_page":"10"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11697","display_name":"Numerical Methods and Algorithms","score":0.9992,"subfield":{"id":"https://openalex.org/subfields/1703","display_name":"Computational Theory and Mathematics"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10363","display_name":"Low-power high-performance VLSI design","score":0.9981,"subfield":{"id":"https://openalex.org/subfields/2208","display_name":"Electrical and Electronic Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/single-precision-floating-point-format","display_name":"Single-precision floating-point format","score":0.49508968},{"id":"https://openalex.org/keywords/hardware-acceleration","display_name":"Hardware acceleration","score":0.4119825}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.80759406},{"id":"https://openalex.org/C42935608","wikidata":"https://www.wikidata.org/wiki/Q190411","display_name":"Field-programmable gate array","level":2,"score":0.6448476},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.56781},{"id":"https://openalex.org/C157764524","wikidata":"https://www.wikidata.org/wiki/Q1383412","display_name":"Throughput","level":3,"score":0.5658651},{"id":"https://openalex.org/C133095886","wikidata":"https://www.wikidata.org/wiki/Q1307173","display_name":"Single-precision floating-point format","level":3,"score":0.49508968},{"id":"https://openalex.org/C2780992000","wikidata":"https://www.wikidata.org/wiki/Q17016113","display_name":"Generator (circuit theory)","level":3,"score":0.49162203},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.44741973},{"id":"https://openalex.org/C9390403","wikidata":"https://www.wikidata.org/wiki/Q3966","display_name":"Computer hardware","level":1,"score":0.43007654},{"id":"https://openalex.org/C126831891","wikidata":"https://www.wikidata.org/wiki/Q221673","display_name":"Host (biology)","level":2,"score":0.42612427},{"id":"https://openalex.org/C13164978","wikidata":"https://www.wikidata.org/wiki/Q600158","display_name":"Hardware acceleration","level":3,"score":0.4119825},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.3202138},{"id":"https://openalex.org/C163258240","wikidata":"https://www.wikidata.org/wiki/Q25342","display_name":"Power (physics)","level":2,"score":0.19325218},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.1301803},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.11694628},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C555944384","wikidata":"https://www.wikidata.org/wiki/Q249","display_name":"Wireless","level":2,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0}],"mesh":[],"locations_count":4,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/fccm53951.2022.9786164","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"https://hal.science/hal-04103774","pdf_url":"https://hal.science/hal-04103774/document","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":true,"landing_page_url":"https://hdl.handle.net/2117/368563","pdf_url":"https://upcommons.upc.edu/bitstream/2117/368563/1/fccm_2022.pdf","source":{"id":"https://openalex.org/S4306400551","display_name":"UPCommons (Polytechnic University of Catalonia)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I9617848","host_organization_name":"Universitat Polit\u00e8cnica de Catalunya","host_organization_lineage":["https://openalex.org/I9617848"],"host_organization_lineage_names":["Universitat Polit\u00e8cnica de Catalunya"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":true,"landing_page_url":"https://hal.science/hal-04103774/file/fccm_send.pdf","pdf_url":"https://hal.science/hal-04103774/file/fccm_send.pdf","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://hal.science/hal-04103774","pdf_url":"https://hal.science/hal-04103774/document","source":null,"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"publishedVersion","is_accepted":true,"is_published":true},"sustainable_development_goals":[{"display_name":"Affordable and clean energy","id":"https://metadata.un.org/sdg/7","score":0.91}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":45,"referenced_works":["https://openalex.org/W114125895","https://openalex.org/W1481966920","https://openalex.org/W1560559367","https://openalex.org/W1969213662","https://openalex.org/W1972635027","https://openalex.org/W1984087192","https://openalex.org/W2009811083","https://openalex.org/W2017369466","https://openalex.org/W2026891778","https://openalex.org/W2030898836","https://openalex.org/W2058978358","https://openalex.org/W2084595820","https://openalex.org/W2097117768","https://openalex.org/W2115613939","https://openalex.org/W2125385721","https://openalex.org/W2157266542","https://openalex.org/W2182678152","https://openalex.org/W2194775991","https://openalex.org/W2265064666","https://openalex.org/W2271840356","https://openalex.org/W2525382741","https://openalex.org/W2593032746","https://openalex.org/W2727238169","https://openalex.org/W2743322459","https://openalex.org/W2803254188","https://openalex.org/W2899063892","https://openalex.org/W2904622547","https://openalex.org/W2908782257","https://openalex.org/W2955474694","https://openalex.org/W2962914733","https://openalex.org/W2964937707","https://openalex.org/W2975508544","https://openalex.org/W2989762710","https://openalex.org/W3011681010","https://openalex.org/W3043484776","https://openalex.org/W3093552810","https://openalex.org/W3098863686","https://openalex.org/W3104595391","https://openalex.org/W3117877730","https://openalex.org/W3118400821","https://openalex.org/W3152828401","https://openalex.org/W3207378573","https://openalex.org/W4287773840","https://openalex.org/W4288346545","https://openalex.org/W833222660"],"related_works":["https://openalex.org/W4293323152","https://openalex.org/W4200391368","https://openalex.org/W3159273459","https://openalex.org/W2518118925","https://openalex.org/W2373535795","https://openalex.org/W2355315220","https://openalex.org/W2316202402","https://openalex.org/W2210979487","https://openalex.org/W2111241003","https://openalex.org/W2074043759"],"abstract_inverted_index":{"We":[0,220],"propose":[1],"a":[2,37,120,125,135,154,164,216],"hardware":[3],"generator":[4,9,207],"of":[5,40,107,124,146,167,198,218,226],"GEMM":[6,68,112,229],"accelerators.":[7],"Our":[8,157,187],"produces":[10],"vendor-agnostic":[11],"HDL":[12],"describing":[13],"highly":[14],"customizable":[15],"systolic":[16],"arrays":[17,27],"guided":[18],"by":[19,66,231],"accuracy":[20,89,168],"and":[21,90,134,149,172,195],"energy":[22,81,224],"efficiency":[23,82,170,225],"goals.":[24],"The":[25],"generated":[26,117],"have":[28],"three":[29],"main":[30],"novel":[31],"aspects.":[32],"First,":[33],"the":[34,56,67,97,105,144,147,203,209,222],"accelerators":[35,95,230],"handle":[36],"large":[38],"variety":[39,217],"computer":[41],"number":[42],"formats":[43],"using":[44],"intermediate":[45,61,72],"representations":[46],"based":[47],"on":[48],"our":[49,94,115],"Sign":[50],"Scale":[51],"Significand":[52],"(S3)":[53],"format.":[54],"Second,":[55],"processing":[57],"elements":[58],"perform":[59],"all":[60],"dot-product":[62],"arithmetic":[63],"operations":[64],"required":[65],"kernel":[69],"without":[70],"any":[71],"rounding,":[73],"which":[74,103],"makes":[75],"it":[76,151],"possible":[77],"to":[78,152,162,191,200,213],"deliver":[79],"better":[80],"than":[83],"state-of-the-art":[84,227],"approaches":[85],"while":[86,176],"offering":[87],"more":[88],"reproducible":[91],"results.":[92],"Third,":[93],"feature":[96],"Half-Speed":[98],"Sink":[99],"Down":[100],"(HSSD)":[101],"mechanism,":[102],"maximizes":[104],"overlap":[106],"host-accelerator":[108],"data":[109],"transfers":[110],"with":[111,183],"computations.We":[113],"evaluate":[114],"automatically":[116],"designs":[118],"in":[119],"cutting-edge":[121],"setup":[122],"composed":[123],"POWER9":[126],"host,":[127],"CAPI":[128],"(Coherent":[129],"Accelerator":[130],"Processor":[131],"Interface)":[132],"link,":[133],"Virtex":[136],"Ultrascale":[137],"Plus":[138],"FPGA.":[139,204],"Arrays":[140],"can":[141,173],"operate":[142],"at":[143],"speed":[145],"link":[148],"saturate":[150],"reach":[153,174],"13GB/s":[155],"throughput.":[156],"fine-grain":[158],"customization":[159],"approach":[160],"allows":[161],"cover":[163],"wide":[165],"range":[166],"versus":[169],"scenarios":[171],"0.65GOps/s/W":[175],"producing":[177],"1024":[178],"accurate":[179,185],"bits":[180],"or":[181],"148.7GOps/s/W":[182],"6":[184],"bits.":[186],"configurations":[188],"achieve":[189],"up":[190,199],"1613GOps/s":[192],"system":[193],"performance":[194],"power":[196],"efficiencies":[197],"240GOps/s/W":[201],"for":[202],"This":[205],"automatic":[206],"is":[208],"first":[210],"being":[211],"able":[212],"produce":[214],"such":[215],"designs.":[219],"improve":[221],"single-precision":[223],"FPGA":[228],"1.86\u00d7.":[232]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4281618649","counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":3}],"updated_date":"2025-02-16T02:07:38.547961","created_date":"2022-06-13"}