{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T02:41:02Z","timestamp":1740105662973,"version":"3.37.3"},"reference-count":33,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2019,8,1]],"date-time":"2019-08-01T00:00:00Z","timestamp":1564617600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2020,4,20]],"date-time":"2020-04-20T00:00:00Z","timestamp":1587340800000},"content-version":"am","delay-in-days":263,"URL":"http:\/\/www.elsevier.com\/open-access\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation, USA","doi-asserted-by":"publisher","award":["OCI-0725070","ACI-1238993"],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"state of Illinois, USA"},{"DOI":"10.13039\/100000001","name":"National Science Foundation, USA","doi-asserted-by":"publisher","award":["DGE-1144245"],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000015","name":"Department of Energy, USA","doi-asserted-by":"publisher","award":["DE-NA0002374"],"id":[{"id":"10.13039\/100000015","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006168","name":"National Nuclear Security Administration, USA","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006168","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Journal of Parallel and Distributed Computing"],"published-print":{"date-parts":[[2019,8]]},"DOI":"10.1016\/j.jpdc.2019.03.016","type":"journal-article","created":{"date-parts":[[2019,3,29]],"date-time":"2019-03-29T16:21:57Z","timestamp":1553876517000},"page":"166-178","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":19,"special_numbering":"C","title":["Node aware sparse matrix\u2013vector multiplication"],"prefix":"10.1016","volume":"130","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8891-934X","authenticated-orcid":false,"given":"Amanda","family":"Bienz","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2905-3029","authenticated-orcid":false,"given":"William D.","family":"Gropp","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5283-6104","authenticated-orcid":false,"given":"Luke N.","family":"Olson","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.jpdc.2019.03.016_b1","series-title":"Proceedings of the 20th International Conference on Parallel and Distributed Processing","article-title":"Topology-aware task mapping for reducing communication contention on large parallel machines","author":"Agarwal","year":"2006"},{"issue":"4","key":"10.1016\/j.jpdc.2019.03.016_b2","doi-asserted-by":"crossref","first-page":"C123","DOI":"10.1137\/110838844","article-title":"Exposing fine-grained parallelism in algebraic multigrid methods","volume":"34","author":"Bell","year":"2012","journal-title":"SIAM J. Sci. Comput."},{"key":"10.1016\/j.jpdc.2019.03.016_b3","series-title":"NVIDIA Technical Report NVR-2008-004","article-title":"Efficient sparse matrix-vector multiplication on CUDA","author":"Bell","year":"2008"},{"issue":"5","key":"10.1016\/j.jpdc.2019.03.016_b4","doi-asserted-by":"crossref","first-page":"S332","DOI":"10.1137\/15M1026341","article-title":"Reducing parallel communication in algebraic multigrid through sparsification","volume":"38","author":"Bienz","year":"2016","journal-title":"SIAM J. Sci. Comput."},{"key":"10.1016\/j.jpdc.2019.03.016_b5","series-title":"17th SIAM Conference on Parallel Processing for Scientific Computing","article-title":"Topology-aware performance modeling of parallel spmvs","author":"Bienz","year":"2016"},{"key":"10.1016\/j.jpdc.2019.03.016_b6","unstructured":"A. Bienz, L.N. Olson, RAPtor: parallel algebraic multigrid v0.1, Release 0.1, 2017, URL https:\/\/github.com\/lukeolson\/raptor."},{"key":"10.1016\/j.jpdc.2019.03.016_b7","first-page":"47","article-title":"Communication balancing in parallel sparse matrix-vector multiplication.","volume":"21","author":"Bisseling","year":"2005","journal-title":"ETNA. Electronic Trans. Numer. Anal. [electronic only]"},{"issue":"7","key":"10.1016\/j.jpdc.2019.03.016_b8","doi-asserted-by":"crossref","first-page":"673","DOI":"10.1109\/71.780863","article-title":"Hypergraph-partitioning-based decomposition for parallel sparse-matrix vector multiplication","volume":"10","author":"\u00c7ataly\u00fcrek","year":"1999","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"2","key":"10.1016\/j.jpdc.2019.03.016_b9","doi-asserted-by":"crossref","first-page":"656","DOI":"10.1137\/080737770","article-title":"On two-dimensional sparse matrix partitioning: models, methods, and a recipe","volume":"32","author":"\u00c7ataly\u00fcrek","year":"2010","journal-title":"SIAM J. Sci. Comput."},{"issue":"68","key":"10.1016\/j.jpdc.2019.03.016_b10","doi-asserted-by":"crossref","first-page":"318","DOI":"10.1016\/j.parco.2007.12.001","article-title":"PT-Scotch: a tool for efficient parallel graph ordering","volume":"34","author":"Chevalier","year":"2008","journal-title":"Parallel Comput."},{"issue":"4","key":"10.1016\/j.jpdc.2019.03.016_b11","doi-asserted-by":"crossref","first-page":"25:1","DOI":"10.1145\/2699470","article-title":"Optimizing sparse matrix-matrix multiplication for the gpu","volume":"41","author":"Dalton","year":"2015","journal-title":"ACM Trans. Math. Software"},{"key":"10.1016\/j.jpdc.2019.03.016_b12","first-page":"1:1","article-title":"The university of florida sparse matrix collection","volume":"32","author":"Davis","year":"2011","journal-title":"ACM Trans. Math. Software"},{"issue":"7","key":"10.1016\/j.jpdc.2019.03.016_b13","doi-asserted-by":"crossref","first-page":"883","DOI":"10.1016\/S0167-8191(01)00073-4","article-title":"Towards a fast parallel sparse symmetric matrix-vector multiplication","volume":"27","author":"Geus","year":"2001","journal-title":"Parallel Comput."},{"key":"10.1016\/j.jpdc.2019.03.016_b14","series-title":"Proceedings of the 23rd European MPI Users\u2019 Group Meeting","first-page":"41","article-title":"Modeling MPI communication performance on SMP nodes: is it time to retire the ping pong test","author":"Gropp","year":"2016"},{"issue":"1","key":"10.1016\/j.jpdc.2019.03.016_b15","doi-asserted-by":"crossref","first-page":"103","DOI":"10.1177\/1094342015593156","article-title":"A hybrid format for better performance of sparse matrix-vector multiplication on a gpu","volume":"30","author":"Guo","year":"2016","journal-title":"Int. J. High Performance Comput. Appl."},{"issue":"12","key":"10.1016\/j.jpdc.2019.03.016_b16","doi-asserted-by":"crossref","first-page":"1519","DOI":"10.1016\/S0167-8191(00)00048-X","article-title":"Graph partitioning models for parallel computing","volume":"26","author":"Hendrickson","year":"2000","journal-title":"Parallel Comput."},{"key":"10.1016\/j.jpdc.2019.03.016_b17","series-title":"2013 IEEE International Conference on Cluster Computing (CLUSTER)","first-page":"1","article-title":"Communication and topology-aware load balancing in charm++ with treematch","author":"Jeannot","year":"2013"},{"key":"10.1016\/j.jpdc.2019.03.016_b18","series-title":"Proceedings 14th International Parallel and Distributed Processing Symposium. IPDPS 2000","first-page":"377","article-title":"Exploiting hierarchy in parallel computer networks to optimize collective operation performance","author":"Karonis","year":"2000"},{"issue":"1","key":"10.1016\/j.jpdc.2019.03.016_b19","doi-asserted-by":"crossref","first-page":"71","DOI":"10.1006\/jpdc.1997.1403","article-title":"A parallel algorithm for multilevel graph partitioning and sparse matrix ordering","volume":"48","author":"Karypis","year":"1998","journal-title":"J. Parallel Distrib. Comput."},{"issue":"8","key":"10.1016\/j.jpdc.2019.03.016_b20","doi-asserted-by":"crossref","first-page":"131","DOI":"10.1145\/329366.301116","article-title":"Magpie: mpi\u2019s collective communication operations for clustered wide area systems","volume":"34","author":"Kielmann","year":"1999","journal-title":"SIGPLAN Not."},{"issue":"3","key":"10.1016\/j.jpdc.2019.03.016_b21","doi-asserted-by":"crossref","first-page":"802","DOI":"10.1002\/cpe.3609","article-title":"Network-aware optimization of communications for parallel matrix multiplication on hierarchical hpc platforms","volume":"28","author":"Malik","year":"2016","journal-title":"Concurr. Comput. : Pract. Exper."},{"issue":"8","key":"10.1016\/j.jpdc.2019.03.016_b22","doi-asserted-by":"crossref","first-page":"974","DOI":"10.1016\/j.jpdc.2004.05.003","article-title":"Fast optimal load balancing algorithms for 1d partitioning","volume":"64","author":"Pinar","year":"2004","journal-title":"J. Parallel Distrib. Comput."},{"key":"10.1016\/j.jpdc.2019.03.016_b23","unstructured":"A. Reisner, L.N. Olson, J.D. Moulton, Scaling Structured Multigrid to 500K+ Cores through Coarse-Grid Redistribution, CoRR, abs\/1803.02481."},{"key":"10.1016\/j.jpdc.2019.03.016_b24","series-title":"Proceedings of the 17th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming","first-page":"45","article-title":"Faster topology-aware collective algorithms through non-minimal communication","author":"Sack","year":"2012"},{"issue":"03","key":"10.1016\/j.jpdc.2019.03.016_b25","doi-asserted-by":"crossref","first-page":"339","DOI":"10.1142\/S0129626411000254","article-title":"Hybrid-parallel sparse matrix-vector multiplication with explicit communication overlap on current multicore-based systems","volume":"21","author":"Schubert","year":"2011","journal-title":"Parallel Process. Lett."},{"key":"10.1016\/j.jpdc.2019.03.016_b26","series-title":"Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis","first-page":"77:1","article-title":"Improving communication performance in dense linear algebra via topology aware collectives","author":"Solomonik","year":"2011"},{"key":"10.1016\/j.jpdc.2019.03.016_b27","series-title":"Proceedings of the 7th ACM\/SPEC on International Conference on Performance Engineering","first-page":"225","article-title":"Communication characterization and optimization of applications using topology-aware task mapping on large supercomputers","author":"Sreepathi","year":"2016"},{"issue":"4","key":"10.1016\/j.jpdc.2019.03.016_b28","doi-asserted-by":"crossref","first-page":"1019","DOI":"10.1137\/040615729","article-title":"Reducing complexity in parallel algebraic multigrid preconditioners","volume":"27","author":"Sterck","year":"2006","journal-title":"SIAM J. Matrix Anal. Appl."},{"key":"10.1016\/j.jpdc.2019.03.016_b29","series-title":"Proceedings of the 2002 ACM\/IEEE Conference on Supercomputing","first-page":"1","article-title":"Implementing the mpi process topology mechanism","author":"Tr\u00e4ff","year":"2002"},{"issue":"1","key":"10.1016\/j.jpdc.2019.03.016_b30","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1137\/S0036144502409019","article-title":"A two-dimensional data distribution method for parallel sparse matrix-vector multiplication","volume":"47","author":"Vastenhouw","year":"2005","journal-title":"SIAM Rev."},{"key":"10.1016\/j.jpdc.2019.03.016_b31","series-title":"2014 43rd International Conference on Parallel Processing","first-page":"211","article-title":"Tram: optimizing fine-grained communication with topological routing and aggregation of messages","author":"Wesolowski","year":"2014"},{"key":"10.1016\/j.jpdc.2019.03.016_b32","series-title":"Proceedings of the 2007 ACM\/IEEE Conference on Supercomputing","first-page":"38:1","article-title":"Optimization of sparse matrix-vector multiplication on emerging multicore platforms","author":"Williams","year":"2007"},{"issue":"03","key":"10.1016\/j.jpdc.2019.03.016_b33","doi-asserted-by":"crossref","first-page":"239","DOI":"10.1142\/S0129053390000157","article-title":"A model and implementation of multigrid for massively parallel computers","volume":"02","author":"WOMBLE","year":"1990","journal-title":"Int. J. High Speed Comput."}],"container-title":["Journal of Parallel and Distributed Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731519302321?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0743731519302321?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,4,18]],"date-time":"2021-04-18T11:45:16Z","timestamp":1618746316000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0743731519302321"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,8]]},"references-count":33,"alternative-id":["S0743731519302321"],"URL":"https:\/\/doi.org\/10.1016\/j.jpdc.2019.03.016","relation":{},"ISSN":["0743-7315"],"issn-type":[{"type":"print","value":"0743-7315"}],"subject":[],"published":{"date-parts":[[2019,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Node aware sparse matrix\u2013vector multiplication","name":"articletitle","label":"Article Title"},{"value":"Journal of Parallel and Distributed Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.jpdc.2019.03.016","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2019 Elsevier Inc. All rights reserved.","name":"copyright","label":"Copyright"}]}}