{"id":"https://openalex.org/W4285503858","doi":"https://doi.org/10.1109/ipdps53621.2022.00089","title":"A Fine-grained Prefetching Scheme for DGEMM Kernels on GPU with Auto-tuning Compatibility","display_name":"A Fine-grained Prefetching Scheme for DGEMM Kernels on GPU with Auto-tuning Compatibility","publication_year":2022,"publication_date":"2022-05-01","ids":{"openalex":"https://openalex.org/W4285503858","doi":"https://doi.org/10.1109/ipdps53621.2022.00089"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/ipdps53621.2022.00089","pdf_url":null,"source":{"id":"https://openalex.org/S4363607067","display_name":"2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5027384538","display_name":"Jialin Li","orcid":"https://orcid.org/0009-0007-5424-4069"},"institutions":[{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]},{"id":"https://openalex.org/I4210108629","display_name":"Computer Network Information Center","ror":"https://ror.org/01s0wyf50","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jialin Li","raw_affiliation_strings":["Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]},{"raw_affiliation_string":"Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210108629","https://openalex.org/I19820366"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102523917","display_name":"Huang Ye","orcid":"https://orcid.org/0000-0003-3336-0785"},"institutions":[{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210108629","display_name":"Computer Network Information Center","ror":"https://ror.org/01s0wyf50","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Huang Ye","raw_affiliation_strings":["Computer Network Information Center, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114024773","display_name":"Shaobo Tian","orcid":null},"institutions":[{"id":"https://openalex.org/I4210108629","display_name":"Computer Network Information Center","ror":"https://ror.org/01s0wyf50","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]},{"id":"https://openalex.org/I4210165038","display_name":"University of Chinese Academy of Sciences","ror":"https://ror.org/05qbk4x57","country_code":"CN","type":"funder","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210165038"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Shaobo Tian","raw_affiliation_strings":["Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","University of Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210108629","https://openalex.org/I19820366"]},{"raw_affiliation_string":"University of Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210165038"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100663159","display_name":"Xinyuan Li","orcid":"https://orcid.org/0000-0001-8758-5301"},"institutions":[{"id":"https://openalex.org/I45928872","display_name":"Alibaba Group (China)","ror":"https://ror.org/00k642b80","country_code":"CN","type":"company","lineage":["https://openalex.org/I45928872"]},{"id":"https://openalex.org/I4210108629","display_name":"Computer Network Information Center","ror":"https://ror.org/01s0wyf50","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Xinyuan Li","raw_affiliation_strings":["Alibaba Group, Beijing, China","Computer Network Information Center, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Alibaba Group, Beijing, China","institution_ids":["https://openalex.org/I45928872"]},{"raw_affiliation_string":"Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210108629","https://openalex.org/I19820366"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5100409912","display_name":"Jian Zhang","orcid":"https://orcid.org/0000-0002-0075-7375"},"institutions":[{"id":"https://openalex.org/I4210108629","display_name":"Computer Network Information Center","ror":"https://ror.org/01s0wyf50","country_code":"CN","type":"facility","lineage":["https://openalex.org/I19820366","https://openalex.org/I4210108629"]},{"id":"https://openalex.org/I19820366","display_name":"Chinese Academy of Sciences","ror":"https://ror.org/034t30j35","country_code":"CN","type":"government","lineage":["https://openalex.org/I19820366"]}],"countries":["CN"],"is_corresponding":false,"raw_author_name":"Jian Zhang","raw_affiliation_strings":["Computer Network Information Center, Chinese Academy of Sciences, Beijing, China"],"affiliations":[{"raw_affiliation_string":"Computer Network Information Center, Chinese Academy of Sciences, Beijing, China","institution_ids":["https://openalex.org/I4210108629","https://openalex.org/I19820366"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.356,"has_fulltext":false,"cited_by_count":1,"citation_normalized_percentile":{"value":0.348326,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":59,"max":69},"biblio":{"volume":null,"issue":null,"first_page":"863","last_page":"874"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10054","display_name":"Parallel Computing and Optimization Techniques","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1708","display_name":"Hardware and Architecture"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9986,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11269","display_name":"Algorithms and Data Compression","score":0.9946,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.71408373},{"id":"https://openalex.org/keywords/memory-hierarchy","display_name":"Memory hierarchy","score":0.48660743}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8496934},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.7842678},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.71408373},{"id":"https://openalex.org/C138101251","wikidata":"https://www.wikidata.org/wiki/Q213092","display_name":"Thread (computing)","level":2,"score":0.6738262},{"id":"https://openalex.org/C17349429","wikidata":"https://www.wikidata.org/wiki/Q1049914","display_name":"Matrix multiplication","level":3,"score":0.6558528},{"id":"https://openalex.org/C2778100165","wikidata":"https://www.wikidata.org/wiki/Q1589327","display_name":"Memory hierarchy","level":3,"score":0.48660743},{"id":"https://openalex.org/C42992933","wikidata":"https://www.wikidata.org/wiki/Q691169","display_name":"Task parallelism","level":3,"score":0.48547485},{"id":"https://openalex.org/C61483411","wikidata":"https://www.wikidata.org/wiki/Q3124522","display_name":"Data parallelism","level":3,"score":0.44183904},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.43370882},{"id":"https://openalex.org/C2781172179","wikidata":"https://www.wikidata.org/wiki/Q853109","display_name":"Parallelism (grammar)","level":2,"score":0.23342398},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.15136781},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C84114770","wikidata":"https://www.wikidata.org/wiki/Q46344","display_name":"Quantum","level":2,"score":0.0},{"id":"https://openalex.org/C115537543","wikidata":"https://www.wikidata.org/wiki/Q165596","display_name":"Cache","level":2,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/ipdps53621.2022.00089","pdf_url":null,"source":{"id":"https://openalex.org/S4363607067","display_name":"2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)","issn_l":null,"issn":null,"is_oa":false,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":null,"host_organization_name":null,"host_organization_lineage":[],"host_organization_lineage_names":[],"type":"conference"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[],"grants":[{"funder":"https://openalex.org/F4320321001","funder_display_name":"National Natural Science Foundation of China","award_id":"11871454"},{"funder":"https://openalex.org/F4320335777","funder_display_name":"National Key Research and Development Program of China","award_id":"2021YFB0300203"}],"datasets":[],"versions":[],"referenced_works_count":27,"referenced_works":["https://openalex.org/W1863336885","https://openalex.org/W1888553388","https://openalex.org/W1978642402","https://openalex.org/W1982996921","https://openalex.org/W2037854811","https://openalex.org/W2041356909","https://openalex.org/W2047060659","https://openalex.org/W2071148743","https://openalex.org/W2090593986","https://openalex.org/W2093843662","https://openalex.org/W2099021415","https://openalex.org/W2232645663","https://openalex.org/W2409690919","https://openalex.org/W2499931820","https://openalex.org/W2556016862","https://openalex.org/W2580538010","https://openalex.org/W2619500264","https://openalex.org/W2769375938","https://openalex.org/W2786544209","https://openalex.org/W2885184702","https://openalex.org/W2913790721","https://openalex.org/W2936491961","https://openalex.org/W3007718266","https://openalex.org/W3080244561","https://openalex.org/W3128982134","https://openalex.org/W4237024478","https://openalex.org/W4250470790"],"related_works":["https://openalex.org/W74409296","https://openalex.org/W4240606930","https://openalex.org/W2950520577","https://openalex.org/W2494130044","https://openalex.org/W2468095077","https://openalex.org/W2177166030","https://openalex.org/W2040503315","https://openalex.org/W2003935582","https://openalex.org/W1972271823","https://openalex.org/W168408236"],"abstract_inverted_index":{"General":[0],"Matrix":[1],"Multiplication":[2],"(GEMM)":[3],"is":[4,25,43,103,115],"one":[5],"of":[6,19,31,84,130,136,141,154],"the":[7,17,23,35,40,48,54,77,82,107,112,118,134],"fundamental":[8],"kernels":[9,131],"for":[10,138,157],"scientific":[11],"and":[12,63,89,93,99,126,160],"high-performance":[13],"computing.":[14],"When":[15],"optimizing":[16],"performance":[18,109,135,148],"GEMM":[20],"on":[21,91,150],"GPU,":[22],"matrix":[24,155],"usually":[26],"partitioned":[27],"into":[28,117],"a":[29,71,100,128,139,151],"hierarchy":[30],"tiles":[32],"to":[33,105,122,132],"fit":[34],"thread":[36,94],"hierarchy.":[37],"In":[38],"practice,":[39],"thread-level":[41,78],"parallelism":[42,79,96],"affected":[44],"not":[45],"only":[46],"by":[47,53,80],"tiling":[49],"scheme":[50,74,114],"but":[51],"also":[52],"resources":[55],"that":[56,75],"each":[57],"tile":[58],"consumes,":[59],"such":[60,85],"as":[61],"registers":[62],"local":[64],"data":[65],"share":[66],"memory.":[67],"This":[68],"paper":[69],"presents":[70],"fine-grained":[72],"prefetching":[73],"improves":[76],"balancing":[81],"usage":[83],"resources.":[86],"The":[87],"gain":[88],"loss":[90],"instruction":[92],"level":[95],"are":[97],"analyzed":[98],"mathematical":[101],"model":[102],"developed":[104],"estimate":[106],"overall":[108],"gain.":[110],"Moreover,":[111],"proposed":[113],"integrated":[116],"open-source":[119],"tool":[120],"Tensile":[121],"automatically":[123],"generate":[124],"assembly":[125],"tune":[127],"collection":[129],"maximize":[133],"DGEMM":[137],"family":[140],"problem":[142],"sizes.":[143],"Experiments":[144],"show":[145],"about":[146],"1.10X":[147],"speedup":[149],"wide":[152],"range":[153],"sizes":[156],"both":[158],"single":[159],"batched":[161],"matrix-matrix":[162],"multiplication.":[163]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4285503858","counts_by_year":[{"year":2024,"cited_by_count":1}],"updated_date":"2025-04-22T22:46:04.393504","created_date":"2022-07-15"}