{"id":"https://openalex.org/W4327673107","doi":"https://doi.org/10.48550/arxiv.2303.08685","title":"Making Vision Transformers Efficient from A Token Sparsification View","display_name":"Making Vision Transformers Efficient from A Token Sparsification View","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4327673107","doi":"https://doi.org/10.48550/arxiv.2303.08685"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.08685","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2303.08685","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101664745","display_name":"Shuning Chang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chang, Shuning","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5042680345","display_name":"Pichao Wang","orcid":"https://orcid.org/0000-0002-1430-0237"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Pichao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100302832","display_name":"Ming Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Ming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100719173","display_name":"Fan Wang","orcid":"https://orcid.org/0000-0001-7320-1119"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Fan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5024413361","display_name":"David Junhao Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, David Junhao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069394608","display_name":"Rong Jin","orcid":"https://orcid.org/0000-0002-8797-4646"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jin, Rong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5068937750","display_name":"Mike Zheng Shou","orcid":"https://orcid.org/0000-0002-7681-2166"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shou, Mike Zheng","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":65},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10036","display_name":"Advanced Neural Network Applications","score":0.9997,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11307","display_name":"Domain Adaptation and Few-Shot Learning","score":0.9973,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T13114","display_name":"Image Processing Techniques and Applications","score":0.9952,"subfield":{"id":"https://openalex.org/subfields/2214","display_name":"Media Technology"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pooling","display_name":"Pooling","score":0.49286294}],"concepts":[{"id":"https://openalex.org/C48145219","wikidata":"https://www.wikidata.org/wiki/Q1335365","display_name":"Security token","level":2,"score":0.83453506},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8179984},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.640757},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.62567043},{"id":"https://openalex.org/C70437156","wikidata":"https://www.wikidata.org/wiki/Q7228652","display_name":"Pooling","level":2,"score":0.49286294},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.472364},{"id":"https://openalex.org/C31972630","wikidata":"https://www.wikidata.org/wiki/Q844240","display_name":"Computer vision","level":1,"score":0.34170607},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.13955283},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.1074833},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.08685","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2303.08685","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2303.08685","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.47,"id":"https://metadata.un.org/sdg/9","display_name":"Industry, innovation and infrastructure"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4390975304","https://openalex.org/W4304700937","https://openalex.org/W4287635093","https://openalex.org/W4206178588","https://openalex.org/W3214715529","https://openalex.org/W3094491777","https://openalex.org/W2953234277","https://openalex.org/W2900413183","https://openalex.org/W2626256601","https://openalex.org/W147410782"],"abstract_inverted_index":{"The":[0,83],"quadratic":[1],"computational":[2],"complexity":[3],"to":[4,20,24,76,113,173,196,206,240],"the":[5,10,42,114,123,147,208,214,241],"number":[6],"of":[7,13],"tokens":[8,23,85,96,120,142,169],"limits":[9],"practical":[11],"applications":[12],"Vision":[14],"Transformers":[15],"(ViTs).":[16],"Several":[17],"works":[18],"propose":[19,57],"prune":[21],"redundant":[22],"achieve":[25,146,236],"efficient":[26,65],"ViTs.":[27],"However,":[28],"these":[29],"methods":[30],"generally":[31],"suffer":[32],"from":[33],"(i)":[34],"dramatic":[35],"accuracy":[36,149,183],"drops,":[37],"(ii)":[38],"application":[39],"difficulty":[40],"in":[41,97,170,188,244],"local":[43,68,109,134],"vision":[44,69,135],"transformer,":[45],"and":[46,67,89,99,133,157,247],"(iii)":[47],"non-general-purpose":[48],"networks":[49,243],"for":[50,64,80,130,219,225,255],"downstream":[51,81,220],"tasks.":[52,82],"In":[53,199],"this":[54],"work,":[55],"we":[56,164,191,201],"a":[58,117,203],"novel":[59],"Semantic":[60],"Token":[61],"ViT":[62],"(STViT),":[63],"global":[66,107,132],"transformers,":[70],"which":[71,103,222],"can":[72,104,121,145,165,235],"also":[73,192],"be":[74],"revised":[75],"serve":[77],"as":[78,126],"backbone":[79],"semantic":[84,110,119,141,168],"represent":[86,106],"cluster":[87,115],"centers,":[88],"they":[90],"are":[91],"initialized":[92],"by":[93,101,178],"pooling":[94],"image":[95,128,189],"space":[98],"recovered":[100],"attention,":[102],"adaptively":[105],"or":[108],"information.":[111],"Due":[112],"properties,":[116],"few":[118],"attain":[122],"same":[124,148],"effect":[125],"vast":[127],"tokens,":[129],"both":[131],"transformers.":[136],"For":[137],"instance,":[138],"only":[139],"16":[140,167],"on":[143,162,213],"DeiT-(Tiny,Small,Base)":[144],"with":[150,181,250],"more":[151],"than":[152],"100%":[153],"inference":[154],"speed":[155,175],"improvement":[156],"nearly":[158],"60%":[159],"FLOPs":[160,253],"reduction;":[161],"Swin-(Tiny,Small,Base),":[163],"employ":[166],"each":[171],"window":[172],"further":[174],"it":[176,217],"up":[177],"around":[179],"20%":[180],"slight":[182],"increase.":[184],"Besides":[185],"great":[186],"success":[187],"classification,":[190],"extend":[193],"our":[194,233],"method":[195,234],"video":[197],"recognition.":[198],"addition,":[200],"design":[202],"STViT-R(ecover)":[204],"network":[205],"restore":[207],"detailed":[209],"spatial":[210],"information":[211],"based":[212],"STViT,":[215],"making":[216],"work":[218],"tasks,":[221],"is":[223,258],"powerless":[224],"previous":[226],"token":[227],"sparsification":[228],"methods.":[229],"Experiments":[230],"demonstrate":[231],"that":[232],"competitive":[237],"results":[238],"compared":[239],"original":[242],"object":[245],"detection":[246],"instance":[248],"segmentation,":[249],"over":[251],"30%":[252],"reduction":[254],"backbone.":[256],"Code":[257],"available":[259],"at":[260],"http://github.com/changsn/STViT-R":[261]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4327673107","counts_by_year":[],"updated_date":"2025-04-13T02:52:59.666873","created_date":"2023-03-18"}