{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T13:24:36Z","timestamp":1726493076918},"reference-count":28,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2015,2,1]],"date-time":"2015-02-01T00:00:00Z","timestamp":1422748800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Parallel and Distributed Computing"],"published-print":{"date-parts":[[2015,2]]},"DOI":"10.1016\/j.jpdc.2014.11.001","type":"journal-article","created":{"date-parts":[[2014,11,12]],"date-time":"2014-11-12T05:04:53Z","timestamp":1415768693000},"page":"3-15","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":10,"special_numbering":"C","title":["A model-driven blocking strategy for load balanced sparse matrix\u2013vector multiplication on GPUs"],"prefix":"10.1016","volume":"76","author":[{"given":"Arash","family":"Ashari","sequence":"first","affiliation":[]},{"given":"Naser","family":"Sedaghati","sequence":"additional","affiliation":[]},{"given":"John","family":"Eisenlohr","sequence":"additional","affiliation":[]},{"given":"P.","family":"Sadayappan","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.jpdc.2014.11.001_br000005","doi-asserted-by":"crossref","unstructured":"A. Ashari, N. Sedaghati, J. Eisenlohr, S. Parthasarathy, P. Sadayappan, Fast sparse matrix\u2013vector multiplication on gpus for graph applications, in: The International Conference for High Performance Computing, Networking, Storage and Analysis, SC, 2014.","DOI":"10.1109\/SC.2014.69"},{"key":"10.1016\/j.jpdc.2014.11.001_br000010","doi-asserted-by":"crossref","unstructured":"A. Ashari, N. Sedaghati, J. Eisenlohr, P. Sadayappan, An efficient two-dimensional blocking mechanism for sparse matrix\u2013vector multiplication on GPUs, in: International Conference on Supercomputing, ICS, 2014.","DOI":"10.1109\/SC.2014.69"},{"year":"2013","series-title":"PETSc users manual, Tech. Rep. ANL-95\/11\u2014Revision 3.4","author":"Balay","key":"10.1016\/j.jpdc.2014.11.001_br000015"},{"key":"10.1016\/j.jpdc.2014.11.001_br000020","unstructured":"S. Balay, J. Brown, K. Buschelman, W.D. Gropp, D. Kaushik, M.G. Knepley, L.C. McInnes, B.F. Smith, H. Zhang, PETSc, 2013. http:\/\/www.mcs.anl.gov\/petsc."},{"year":"2008","series-title":"Optimizing Sparse Matrix\u2013Vector Multiplication on GPUs, Technical Report, IBM Research Report RC24704 (W0812-047)","author":"Baskaran","key":"10.1016\/j.jpdc.2014.11.001_br000025"},{"year":"2008","series-title":"Efficient sparse matrix\u2013vector multiplication on CUDA, NVIDIA Technical Report NVR-2008-004","author":"Bell","key":"10.1016\/j.jpdc.2014.11.001_br000030"},{"key":"10.1016\/j.jpdc.2014.11.001_br000035","doi-asserted-by":"crossref","unstructured":"N. Bell, M. Garland, Implementing sparse matrix\u2013vector multiplication on throughput-oriented processors, in: Proc. Conference on High Performance Computing Networking, Storage and Analysis, 2009.","DOI":"10.1145\/1654059.1654078"},{"key":"10.1016\/j.jpdc.2014.11.001_br000040","doi-asserted-by":"crossref","unstructured":"J.W. Choi, A. Singh, R.W. Vuduc, Model-driven autotuning of sparse matrix\u2013vector multiply on GPUs, in: ACM SIGPLAN Symp. Principles and Practice of Parallel Programming, PPoPP, 2010.","DOI":"10.1145\/1693453.1693471"},{"key":"10.1016\/j.jpdc.2014.11.001_br000045","unstructured":"Cuda, a parallel computing platform and programming model invented by nvidia. https:\/\/developer.nvidia.com\/cuda-home-new.html. URL: http:\/\/www.nvidia.com\/object\/cuda_home_new.html."},{"key":"10.1016\/j.jpdc.2014.11.001_br000050","unstructured":"CUSP, the nvidia library of generic parallel algorithms for sparse linear algebra and graph computations on CUDA architecture GPUs. URL: https:\/\/developer.nvidia.com\/cusp."},{"key":"10.1016\/j.jpdc.2014.11.001_br000055","unstructured":"cusparse, the nvidia cuda sparse matrix library. URL: https:\/\/developer.nvidia.com\/cusparse."},{"key":"10.1016\/j.jpdc.2014.11.001_br000060","doi-asserted-by":"crossref","unstructured":"J. Demmel, H.D. Nguyen, Fast reproducible floating-point summation, in: Computer Arithmetic, ARITH, 2013 21st IEEE Symposium on, 2013, pp. 163\u2013172.","DOI":"10.1109\/ARITH.2013.9"},{"key":"10.1016\/j.jpdc.2014.11.001_br000065","doi-asserted-by":"crossref","unstructured":"A. Ekambaram, E. Montagne, An alternative compressed storage format for sparse matrices, in: ISCIS, 2003, pp. 196\u2013203.","DOI":"10.1007\/978-3-540-39737-3_25"},{"issue":"3","key":"10.1016\/j.jpdc.2014.11.001_br000070","doi-asserted-by":"crossref","first-page":"C303","DOI":"10.1137\/12088358X","article-title":"A fast dense triangular solve in CUDA","volume":"35","author":"Hogg","year":"2013","journal-title":"SIAM J. Sci. Comput."},{"key":"10.1016\/j.jpdc.2014.11.001_br000075","unstructured":"Khronos OpenCL Working Group, The OpenCL specification, version 1.0.29, 8 December 2008."},{"key":"10.1016\/j.jpdc.2014.11.001_br000080","doi-asserted-by":"crossref","unstructured":"X. Liu, M. Smelyanskiy, E. Chow, P. Dubey, Efficient sparse matrix\u2013vector multiplication on x86-based many-core processors, in: International ACM Conference on International Conference on Supercomputing, 2013, pp. 273\u2013282.","DOI":"10.1145\/2464996.2465013"},{"key":"10.1016\/j.jpdc.2014.11.001_br000085","unstructured":"National Institute of Standards and Technology, the matrix market format. URL: http:\/\/math.nist.gov."},{"issue":"2","key":"10.1016\/j.jpdc.2014.11.001_br000090","doi-asserted-by":"crossref","first-page":"40","DOI":"10.1145\/1365490.1365500","article-title":"Scalable parallel programming with CUDA","volume":"6","author":"Nickolls","year":"2008","journal-title":"ACM Queue"},{"key":"10.1016\/j.jpdc.2014.11.001_br000095","unstructured":"Nvvp, NVIDIA visual profiler. URL: https:\/\/developer.nvidia.com\/nvidia-visual-profiler."},{"key":"10.1016\/j.jpdc.2014.11.001_br000100","doi-asserted-by":"crossref","unstructured":"I. Reguly, M. Giles, Efficient sparse matrix\u2013vector multiplication on cache-based GPUs, in: Innovative Parallel Computing, InPar, 2012, pp. 1\u201312.","DOI":"10.1109\/InPar.2012.6339602"},{"key":"10.1016\/j.jpdc.2014.11.001_br000105","unstructured":"D.M.Y. Roger, G. Grimes, David Ronald Kincaid, ITPACK 2.0: user\u2019s guide, 1980."},{"key":"10.1016\/j.jpdc.2014.11.001_br000110","doi-asserted-by":"crossref","first-page":"1200","DOI":"10.1137\/0910073","article-title":"Krylov subspace methods on supercomputers","volume":"10","author":"Saad","year":"1989","journal-title":"SIAM J. Sci. Stat. Comput."},{"key":"10.1016\/j.jpdc.2014.11.001_br000115","unstructured":"Y. Saad, SPARSKIT: a basic tool kit for sparse matrix computations\u2014version 2."},{"key":"10.1016\/j.jpdc.2014.11.001_br000120","unstructured":"S. Sengupta, M. Harris, Y. Zhang, J.D. Owens, Scan primitives for GPU computing, in: Graphics Hardware, 2007, pp. 97\u2013106."},{"year":"2004","series-title":"Automatic performance tuning of sparse matrix kernels","author":"Vuduc","key":"10.1016\/j.jpdc.2014.11.001_br000125"},{"issue":"3","key":"10.1016\/j.jpdc.2014.11.001_br000130","doi-asserted-by":"crossref","first-page":"178","DOI":"10.1016\/j.parco.2008.12.006","article-title":"Optimization of sparse matrix\u2013vector multiplication on emerging multicore platforms","volume":"35","author":"Williams","year":"2009","journal-title":"Parallel Comput."},{"key":"10.1016\/j.jpdc.2014.11.001_br000135","series-title":"Proceedings of the 19th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","first-page":"107","article-title":"yaSpMV: Yet another spmv framework on GPUs","author":"Yan","year":"2014"},{"issue":"4","key":"10.1016\/j.jpdc.2014.11.001_br000140","doi-asserted-by":"crossref","first-page":"231","DOI":"10.14778\/1938545.1938548","article-title":"Fast sparse matrix\u2013vector multiplication on gpus: implications for graph mining","volume":"4","author":"Yang","year":"2011","journal-title":"Proc. VLDB Endow."}],"container-title":["Journal of Parallel and Distributed Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731514002081?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731514002081?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2019,11,2]],"date-time":"2019-11-02T14:36:33Z","timestamp":1572705393000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0743731514002081"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,2]]},"references-count":28,"alternative-id":["S0743731514002081"],"URL":"https:\/\/doi.org\/10.1016\/j.jpdc.2014.11.001","relation":{},"ISSN":["0743-7315"],"issn-type":[{"type":"print","value":"0743-7315"}],"subject":[],"published":{"date-parts":[[2015,2]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A model-driven blocking strategy for load balanced sparse matrix\u2013vector multiplication on GPUs","name":"articletitle","label":"Article Title"},{"value":"Journal of Parallel and Distributed Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jpdc.2014.11.001","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"Copyright \u00a9 2014 Elsevier Inc. All rights reserved.","name":"copyright","label":"Copyright"}]}}