We":[74],"propose":[75],"method":[78],"that":[79,84],"builds":[80],"upon":[81],"insight":[83],"multiplication":[86],"generally":[87],"breaks":[88],"large":[90],"into":[92,163],"multiple":[93],"smaller":[94],"tiles":[95],"parallel":[97],"execution.":[98],"We":[99,157,181],"present":[100],"\"tile-wise\"":[102],"pattern,":[104],"maintains":[106],"pattern":[110,168],"at":[111,123,138,148],"tile":[113],"level":[114,151],"efficient":[116],"execution":[117],"but":[118],"allows":[119],"global":[125,140],"scale":[126],"high":[129],"accuracy.":[130],"In":[131],"addition,":[132],"tile-wise":[134],"is":[136],"implemented":[137],"memory":[141],"level,":[142],"and":[143,174,193,202],"executes":[147],"register":[150],"inside":[152],"core.":[156],"combine":[159],"these":[160],"two":[161],"patterns":[162],"\"tile-vector-wise\"":[165],"(TVW)":[166],"explore":[170],"more":[171],"fine-grained":[172],"further":[175],"accelerate":[176],"DNN":[179],"evaluate":[182],"TVW":[184],"GPU,":[187],"achieving":[188],"averages":[189],"1:85\u00d7,":[191],"2:75\u00d7,":[192],"22:18\u00d7":[194],"speedups":[195],"over":[196],"model,":[199],"block":[200],"sparsity,":[201]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4391827186","counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2024-12-08T08:35:33.777357","created_date":"2024-02-15"}