{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,5]],"date-time":"2024-09-05T11:46:36Z","timestamp":1725536796829},"publisher-location":"Berlin, Heidelberg","reference-count":14,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"type":"print","value":"9783642038686"},{"type":"electronic","value":"9783642038693"}],"license":[{"start":{"date-parts":[[2009,1,1]],"date-time":"2009-01-01T00:00:00Z","timestamp":1230768000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2009,1,1]],"date-time":"2009-01-01T00:00:00Z","timestamp":1230768000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2009]]},"DOI":"10.1007\/978-3-642-03869-3_87","type":"book-chapter","created":{"date-parts":[[2009,8,22]],"date-time":"2009-08-22T00:04:48Z","timestamp":1250899488000},"page":"948-959","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["High Performance Matrix Multiplication on Many Cores"],"prefix":"10.1007","author":[{"given":"Nan","family":"Yuan","sequence":"first","affiliation":[]},{"given":"Yongbin","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Guangming","family":"Tan","sequence":"additional","affiliation":[]},{"given":"Junchao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Dongrui","family":"Fan","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"87_CR1","doi-asserted-by":"crossref","unstructured":"Alverson, R., Callahan, D., Cummings, D., Koblenz, B., Porterfield, A., Smith, B.: The Tera computer system. In: Proceedings of the 4th international conference on Supercomputing (1990)","DOI":"10.1145\/77726.255132"},{"key":"87_CR2","unstructured":"Asanovic, K., Bodik, R., Catanzaro, B.C., Gebis, J.J., Husbands, P., Keutzer, K., Patterson, D.A., Plishker, W.L., Shalf, J., Williams, S.W., et al.: The landscape of parallel computing research: A view from berkeley. Electrical Engineering and Computer Sciences, University of California at Berkeley, Technical Report No. UCB\/EECS-2006-183, December, 18(2006-183):19 (2006)"},{"issue":"7","key":"87_CR3","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1109\/MC.2004.65","volume":"37","author":"D. Burger","year":"2004","unstructured":"Burger, D., Keckler, S.W., McKinley, K.S., Dahlin, M., John, L.K., Lin, C., Moore, C.R., Burrill, J., McDonald, R.G., Yoder, W., et al.: Scaling to the End of Silicon with EDGE Architectures. Computer\u00a037(7), 44\u201355 (2004)","journal-title":"Computer"},{"key":"87_CR4","unstructured":"Cannon, L.E.: A cellular computer to implement the Kalman filter algorithm (1969)"},{"key":"87_CR5","doi-asserted-by":"crossref","unstructured":"Diamond, J.R., Robatmili, B., Keckler, S.W., van de Geijn, R., Goto, K., Burger, D.: High performance dense linear algebra on a spatially distributed processor. In: Proceedings of the 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 63\u201372 (2008)","DOI":"10.1145\/1345206.1345218"},{"key":"87_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"134","DOI":"10.1007\/11823285_14","volume-title":"Euro-Par 2006 Parallel Processing","author":"Z. Hu","year":"2006","unstructured":"Hu, Z., del Cuvillo, J., Zhu, W., Gao, G.R.: Optimization of dense matrix multiplication on IBM cyclops-64: Challenges and experiences. In: Nagel, W.E., Walter, W.V., Lehner, W. (eds.) Euro-Par 2006. LNCS, vol.\u00a04128, pp. 134\u2013144. Springer, Heidelberg (2006)"},{"key":"87_CR7","unstructured":"Kapasi, U.J., Dally, W.J., Rixner, S., Owens, J.D., Khailany, B.: The Imagine stream processor. In: Proceedings 2002 IEEE International Conference on Computer Design, pp. 282\u2013288 (2002)"},{"key":"87_CR8","doi-asserted-by":"crossref","unstructured":"Mattson, T.G., Van der Wijngaart, R., Frumkin, M.: Programming the Intel 80-core network-on-a-chip terascale processor. In: Proceedings of the 2008 ACM\/IEEE conference on Supercomputing (2008)","DOI":"10.1109\/SC.2008.5213921"},{"key":"87_CR9","doi-asserted-by":"crossref","unstructured":"Mukherjee, S.S., Silla, F., Bannon, P., Emer, J., Lang, S., Webb, D.: A comparative study of arbitration algorithms for the Alpha 21364 pipelined router. In: Proceedings of the 10th international conference on Architectural Support for Programming Languages and Operating Systems (2002)","DOI":"10.1145\/605397.605421"},{"key":"87_CR10","doi-asserted-by":"crossref","unstructured":"Tan, G., Fan, D., Zhang, J., Russo, A., Gao, G.R.: Experience on optimizing irregular computation for memory hierarchy in manycore architecture. In: Proceedings of the 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, pp. 279\u2013280 (2008)","DOI":"10.1145\/1345206.1345255"},{"key":"87_CR11","first-page":"2","volume":"272","author":"M.B. Taylor","year":"2002","unstructured":"Taylor, M.B., Kim, J., Miller, J., Wentzlaff, D., Ghodrat, F., Greenwald, B., Hoffman, H., Johnson, P., Lee, J.W., Lee, W., et al.: The Raw microprocessor: A computational fabric for software circuits and general-purpose programs. IEEE micro.\u00a0272, 2 (2002)","journal-title":"IEEE micro."},{"key":"87_CR12","doi-asserted-by":"crossref","unstructured":"Williams, S., Shalf, J., Oliker, L., Kamil, S., Husbands, P., Yelick, K.: The potential of the cell processor for scientific computing. In: Proceedings of the 3rd conference on Computing Frontiers, pp. 9\u201320 (2006)","DOI":"10.1145\/1128022.1128027"},{"key":"87_CR13","doi-asserted-by":"crossref","unstructured":"Ye, X., Nguyen, V.H., Lavenier, D., Fan, D.: Efficient parallelization of a protein sequence comparison algorithm on manycore architecture. In: Proceedings of the 9th international conference on Parallel and Distributed Computing, Applications and Technologies, pp. 167\u2013170 (2008)","DOI":"10.1109\/PDCAT.2008.28"},{"key":"87_CR14","doi-asserted-by":"crossref","unstructured":"Zhu, W., Sreedhar, V.C., Hu, Z., Gao, G.R.: Synchronization state buffer: supporting efficient fine-grain synchronization on many-core architectures. In: Proceedings of the 34th annual International Symposium on Computer Architecture, pp. 35\u201345 (2007)","DOI":"10.1145\/1250662.1250668"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2009 Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-03869-3_87","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,19]],"date-time":"2020-05-19T09:55:16Z","timestamp":1589882116000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-03869-3_87"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2009]]},"ISBN":["9783642038686","9783642038693"],"references-count":14,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-03869-3_87","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2009]]},"assertion":[{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}