{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,11,19]],"date-time":"2024-11-19T17:56:45Z","timestamp":1732039005953},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/100010434","name":"\u201cla Caixa\u201d Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100010434","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100007601","name":"Horizon 2020","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100007601","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100008530","name":"European Regional Development Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100008530","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Future Generation Computer Systems"],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1016\/j.future.2020.02.069","type":"journal-article","created":{"date-parts":[[2020,2,26]],"date-time":"2020-02-26T16:36:27Z","timestamp":1582734987000},"page":"161-177","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":22,"special_numbering":"C","title":["A benchmark set of highly-efficient CUDA and OpenCL kernels and its dynamic autotuning with Kernel Tuning Toolkit"],"prefix":"10.1016","volume":"108","author":[{"given":"Filip","family":"Petrovi\u010d","sequence":"first","affiliation":[]},{"given":"David","family":"St\u0159el\u00e1k","sequence":"additional","affiliation":[]},{"given":"Jana","family":"Hozzov\u00e1","sequence":"additional","affiliation":[]},{"given":"Jaroslav","family":"Ol\u2019ha","sequence":"additional","affiliation":[]},{"given":"Richard","family":"Trembeck\u00fd","sequence":"additional","affiliation":[]},{"given":"Siegfried","family":"Benkner","sequence":"additional","affiliation":[]},{"given":"Ji\u0159\u00ed","family":"Filipovi\u010d","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"issue":"11","key":"10.1016\/j.future.2020.02.069_b1","doi-asserted-by":"crossref","first-page":"2045","DOI":"10.1109\/TPDS.2011.311","article-title":"Autotuning GEMM kernels for the Fermi GPU","volume":"23","author":"Kurzak","year":"2012","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/j.future.2020.02.069_b2","series-title":"2017 IEEE 19th International Conference on High Performance Computing and Communications; IEEE 15th International Conference on Smart City; IEEE 3rd International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS)","article-title":"Revisiting online autotuning for sparse-matrix vector multiplication kernels on next-generation architectures","author":"Gonzalo","year":"2017"},{"key":"10.1016\/j.future.2020.02.069_b3","doi-asserted-by":"crossref","DOI":"10.1177\/1094342019832958","article-title":"A GPU acceleration of 3D Fourier reconstruction in Cryo-EM","author":"St\u0159el\u00e1k","year":"2019","journal-title":"Int. J. High Perform. Comput. Appl."},{"issue":"11","key":"10.1016\/j.future.2020.02.069_b4","doi-asserted-by":"crossref","first-page":"2068","DOI":"10.1109\/JPROC.2018.2841200","article-title":"Autotuning in high-performance computing applications","volume":"106","author":"Balaprakash","year":"2018","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.future.2020.02.069_b5","series-title":"Proceedings of the 23rd International Conference on Parallel Architectures and Compilation, PACT \u201914","article-title":"Opentuner: an extensible framework for program autotuning","author":"Ansel","year":"2014"},{"key":"10.1016\/j.future.2020.02.069_b6","series-title":"Proceedings of the IEEE 9th International Symposium on Embedded Multicore\/Many-Core Systems-on-Chip (MCSoC)","article-title":"Cltune: a generic auto-tuner for opencl kernels","author":"Nugteren","year":"2015"},{"key":"10.1016\/j.future.2020.02.069_b7","article-title":"ATF: A generic directive-based auto-tuning framework","author":"Rasch","year":"2018","journal-title":"Concurr. Comput.: Pract. Exper."},{"key":"10.1016\/j.future.2020.02.069_b8","doi-asserted-by":"crossref","first-page":"347","DOI":"10.1016\/j.future.2018.08.004","article-title":"Kernel tuner: A search-optimizing GPU code auto-tuner","volume":"90","author":"Werkhoven","year":"2019","journal-title":"Future Gener. Comput. Syst."},{"key":"10.1016\/j.future.2020.02.069_b9","series-title":"Automatic Tuning of HPC Applications - the Periscope Tuning Framework (PTF).","article-title":"Automatic tuning of HPC applications - the periscope tuning framework (PTF)","author":"Gerndt","year":"2015"},{"key":"10.1016\/j.future.2020.02.069_b10","article-title":"A multi-aspect online tuning framework for HPC applications","author":"Gerndt","year":"2017","journal-title":"Softw. Qual. J."},{"key":"10.1016\/j.future.2020.02.069_b11","series-title":"2016 49th Hawaii International Conference on System Sciences (HICSS)","article-title":"Tuning opencl applications with the periscope tuning framework","author":"Bajrovic","year":"2016"},{"key":"10.1016\/j.future.2020.02.069_b12","series-title":"Applied Parallel and Scientific Computing: 11th International Conference, PARA 2012, Helsinki, Finland, June 10-13, 2012, Revised Selected Papers","first-page":"328","article-title":"Autotune: A plugin-driven approach to the automatic tuning of parallel applications","author":"Miceli","year":"2013"},{"key":"10.1016\/j.future.2020.02.069_b13","series-title":"Proceedings of the 1998 ACM\/IEEE Conference on Supercomputing","article-title":"Automatically tuned linear algebra software","author":"Whaley","year":"1998"},{"issue":"2","key":"10.1016\/j.future.2020.02.069_b14","doi-asserted-by":"crossref","first-page":"216","DOI":"10.1109\/JPROC.2004.840301","article-title":"The design and implementation of fftw3","volume":"93","author":"Frigo","year":"2005","journal-title":"Proc. IEEE"},{"key":"10.1016\/j.future.2020.02.069_b15","series-title":"Proceedings of the 9th International Conference on Computational Science: Part I","article-title":"A note on auto-tuning GEMM for GPUs","author":"Li","year":"2009"},{"key":"10.1016\/j.future.2020.02.069_b16","series-title":"Fourth Workshop on General Purpose Processing on Graphics Processing Units (GPGPU)","article-title":"Automatically generating and tuning GPU code for sparse matrix-vector multiplication from a high-level representation","author":"Grewe","year":"2011"},{"issue":"1","key":"10.1016\/j.future.2020.02.069_b17","doi-asserted-by":"crossref","first-page":"90","DOI":"10.1007\/s11390-013-1314-8","article-title":"MPFFT: An auto-tuning FFT library for opencl GPUs","volume":"28","author":"Li","year":"2013","journal-title":"J. Comput. Sci. Tech."},{"key":"10.1016\/j.future.2020.02.069_b18","series-title":"2012 SC Companion: High Performance Computing, Networking Storage and Analysis","article-title":"Performance tuning of matrix multiplication in opencl on different GPUs and CPUs","author":"Matsumotoi","year":"2012"},{"key":"10.1016\/j.future.2020.02.069_b19","series-title":"MCC-3: Swedish Woekshop on Multicore Computing","article-title":"Towards a tunable multi-backend skeleton programming framework for multi-GPU systems","author":"Enmyren","year":"2010"},{"key":"10.1016\/j.future.2020.02.069_b20","series-title":"2012 Innovative Parallel Computing (InPar)","article-title":"Auto-tuning a high-level language targeted to GPU codes","author":"Grauer-Gray","year":"2012"},{"key":"10.1016\/j.future.2020.02.069_b21","doi-asserted-by":"crossref","DOI":"10.1007\/s11227-015-1483-z","article-title":"Optimizing CUDA code by kernel fusion: application on BLAS","author":"Filipovi\u010d","year":"2015","journal-title":"J. Supercomput."},{"key":"10.1016\/j.future.2020.02.069_b22","series-title":"2014 International Conference on Parallel and Distributed Processing, Techniques and Applications","article-title":"Automatic performance tuning of pipeline patterns for heterogeneous parallel architectures","author":"Bajrovic","year":"2014"},{"key":"10.1016\/j.future.2020.02.069_b23","series-title":"Proceedings of the 2014 IEEE 28th International Parallel and Distributed Processing Symposium","first-page":"501","article-title":"Nitro: A framework for adaptive code variant tuning","author":"Muralidharan","year":"2014"},{"key":"10.1016\/j.future.2020.02.069_b24","series-title":"Languages and Compilers for Parallel Computing","article-title":"A programming language interface to describe transformations and code generation","author":"Rudy","year":"2011"},{"key":"10.1016\/j.future.2020.02.069_b25","series-title":"IEEE International Parallel Distributed Processing Symposium (IPDPS)","article-title":"Online adaptive code generation and tuning","author":"Tiwari","year":"2011"},{"key":"10.1016\/j.future.2020.02.069_b26","series-title":"2017 IEEE\/ACM International Symposium on Code Generation and Optimization (CGO)","first-page":"74","article-title":"LIFT: A functional data-parallel IR for high-performance GPU code generation","author":"Steuwer","year":"2017"},{"key":"10.1016\/j.future.2020.02.069_b27","series-title":"Proceedings of the 2015 IEEE International Parallel and Distributed Processing Symposium Workshop","article-title":"Machine learning based auto-tuning for enhanced opencl performance portability","author":"Falch","year":"2015"},{"key":"10.1016\/j.future.2020.02.069_b28","series-title":"Proceedings of the 1st Workshop on AutotuniNg and ADaptivity AppRoaches for Energy Efficient HPC Systems (ANDARE \u201917)","article-title":"Autotuning of OpenCL kernels with global optimizations","author":"Filipovi\u010d","year":"2017"},{"key":"10.1016\/j.future.2020.02.069_b29","series-title":"2017 IEEE 19th International Conference on High Performance Computing and Communications; IEEE 15th International Conference on Smart City; IEEE 3rd International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS)","article-title":"ATF: A generic auto-tuning framework","author":"Rasch","year":"2017"},{"year":"2012","series-title":"Parboil: A Revised Benchmark Suite for Scientificand Commercial Throughput Computing","author":"Stratton","key":"10.1016\/j.future.2020.02.069_b30"},{"key":"10.1016\/j.future.2020.02.069_b31","series-title":"Proceedings of the 3rd Workshop on General-Purpose Computation on Graphics Processing Units","first-page":"63","article-title":"The scalable heterogeneous computing (SHOC) benchmark suite","author":"Danalis","year":"2010"},{"year":"2012","series-title":"Polybench\/GPU 1.0","author":"Grauer-Gray","key":"10.1016\/j.future.2020.02.069_b32"},{"key":"10.1016\/j.future.2020.02.069_b33","series-title":"2018 Design, Automation Test in Europe Conference Exhibition (DATE)","article-title":"SOCRATES \u2014 A seamless online compiler and system runtime autotuning framework for energy-aware applications","author":"Gadioli","year":"2018"},{"issue":"2","key":"10.1016\/j.future.2020.02.069_b34","doi-asserted-by":"crossref","first-page":"325","DOI":"10.1145\/2980024.2872411","article-title":"Architecture-adaptive code variant tuning","volume":"44","author":"Muralidharan","year":"2016","journal-title":"SIGARCH Comput. Archit. News"},{"key":"10.1016\/j.future.2020.02.069_b35","unstructured":"T. Kisuki, P.M.W. Knijnenburg, M.F.P. O\u2019Boyle, Combined selection of tile sizes and unroll factors using iterative compilation, in: Proceedings 2000 International Conference on Parallel Architectures and Compilation Techniques (PACT\u201900), 2000."},{"key":"10.1016\/j.future.2020.02.069_b36","series-title":"2008 IEEE International Conference on Cluster Computing","article-title":"A comparison of search heuristics for empirical code optimization","author":"Seymour","year":"2008"},{"key":"10.1016\/j.future.2020.02.069_b37","series-title":"2017 26th International Conference on Parallel Architectures and Compilation Techniques (PACT)","first-page":"219","article-title":"End-to-end deep learning of optimization heuristics","author":"Cummins","year":"2017"},{"key":"10.1016\/j.future.2020.02.069_b38","series-title":"ACM\/IEEE Conference on Supercomputing (SC)","article-title":"Benchmarking GPUs to tune dense linear algebra","author":"Volkov","year":"2008"},{"issue":"16","key":"10.1016\/j.future.2020.02.069_b39","doi-asserted-by":"crossref","DOI":"10.1002\/jcc.20829","article-title":"Accelerating molecular modeling applications with graphics processors","volume":"28","author":"Stone","year":"2007","journal-title":"J. Comput. Chem."},{"issue":"1","key":"10.1016\/j.future.2020.02.069_b40","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1002\/(SICI)1099-1352(199601)9:1<1::AID-JMR241>3.0.CO;2-6","article-title":"Automated docking of flexible ligands: Applications of autodock","volume":"9","author":"Goodsell","year":"1996","journal-title":"J. Mol. Recognit."},{"issue":"1","key":"10.1016\/j.future.2020.02.069_b41","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/77626.79170","article-title":"A set of level 3 basic linear algebra subprograms","volume":"16","author":"Dongarra","year":"1990","journal-title":"ACM Trans. Math. Software"},{"year":"2015","series-title":"Tensorflow: Large-scale machine learning on heterogeneous systems","author":"Abadi","key":"10.1016\/j.future.2020.02.069_b42"},{"issue":"3","key":"10.1016\/j.future.2020.02.069_b43","doi-asserted-by":"crossref","first-page":"302","DOI":"10.1145\/356044.356047","article-title":"The multifrontal solution of indefinite sparse symmetric linear","volume":"9","author":"Duff","year":"1983","journal-title":"ACM Trans. Math. Software"},{"key":"10.1016\/j.future.2020.02.069_b44","series-title":"Euro-Par 2014: Parallel Processing Workshops","first-page":"450","article-title":"Matrix-free finite-element operator application on graphics processing units","author":"Ljungkvist","year":"2014"},{"key":"10.1016\/j.future.2020.02.069_b45","series-title":"Euro-Par 2016: Parallel Processing","first-page":"659","article-title":"High-performance matrix-matrix multiplications of very small matrices","author":"Masliah","year":"2016"},{"key":"10.1016\/j.future.2020.02.069_b46","series-title":"IEEE International Symposium on Workload Characterization (IISWC)","article-title":"Rodinia: A benchmark suite for heterogeneous computing","author":"Che","year":"2009"},{"key":"10.1016\/j.future.2020.02.069_b47","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1016\/j.ultramic.2015.05.018","article-title":"A fast iterative convolution weighting approach for gridding-based direct fourier three-dimensional reconstruction with correction for the contrast transfer function","volume":"157","author":"Abrishami","year":"2015","journal-title":"Ultramicroscopy"},{"key":"10.1016\/j.future.2020.02.069_b48","doi-asserted-by":"crossref","first-page":"4808","DOI":"10.1038\/ncomms5808","article-title":"An atomic model of brome mosaic virus using direct electron detection and real-space optimization.","volume":"5","author":"Wang","year":"2014","journal-title":"Nature Commun."}],"container-title":["Future Generation Computer Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X19327360?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0167739X19327360?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,4,18]],"date-time":"2021-04-18T07:21:12Z","timestamp":1618730472000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0167739X19327360"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7]]},"references-count":48,"alternative-id":["S0167739X19327360"],"URL":"https:\/\/doi.org\/10.1016\/j.future.2020.02.069","relation":{},"ISSN":["0167-739X"],"issn-type":[{"type":"print","value":"0167-739X"}],"subject":[],"published":{"date-parts":[[2020,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A benchmark set of highly-efficient CUDA and OpenCL kernels and its dynamic autotuning with Kernel Tuning Toolkit","name":"articletitle","label":"Article Title"},{"value":"Future Generation Computer Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.future.2020.02.069","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2020 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}]}}