{"id":"https://openalex.org/W4299494398","doi":"https://doi.org/10.48550/arxiv.2208.02070","title":"Efficient Fine-Tuning of Compressed Language Models with Learners","display_name":"Efficient Fine-Tuning of Compressed Language Models with Learners","publication_year":2022,"publication_date":"2022-01-01","ids":{"openalex":"https://openalex.org/W4299494398","doi":"https://doi.org/10.48550/arxiv.2208.02070"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2208.02070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2208.02070","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5071866685","display_name":"Danilo Vucetic","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Vucetic, Danilo","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5022419434","display_name":"Mohammadreza Tayaranian","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Tayaranian, Mohammadreza","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029947593","display_name":"Maryam Ziaeefard","orcid":"https://orcid.org/0000-0003-4490-1268"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziaeefard, Maryam","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079264223","display_name":"James J. Clark","orcid":"https://orcid.org/0000-0002-4512-6171"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Clark, James J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034701871","display_name":"Brett H. Meyer","orcid":"https://orcid.org/0000-0002-6650-3298"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Meyer, Brett H.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5091600002","display_name":"Warren J. Gross","orcid":"https://orcid.org/0000-0002-6226-6037"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gross, Warren J.","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":60},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9996,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9992,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.998,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.6856071},{"id":"https://openalex.org/keywords/fine-tuning","display_name":"Fine-tuning","score":0.42067793},{"id":"https://openalex.org/keywords/speedup","display_name":"Speedup","score":0.41992158}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.81743646},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.6856071},{"id":"https://openalex.org/C45374587","wikidata":"https://www.wikidata.org/wiki/Q12525525","display_name":"Computation","level":2,"score":0.57240814},{"id":"https://openalex.org/C2777303404","wikidata":"https://www.wikidata.org/wiki/Q759757","display_name":"Convergence (economics)","level":2,"score":0.5648417},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.5588852},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.5292501},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.52501184},{"id":"https://openalex.org/C206345919","wikidata":"https://www.wikidata.org/wiki/Q20380951","display_name":"Resource (disambiguation)","level":2,"score":0.4880065},{"id":"https://openalex.org/C2776214188","wikidata":"https://www.wikidata.org/wiki/Q408386","display_name":"Inference","level":2,"score":0.48797673},{"id":"https://openalex.org/C157524613","wikidata":"https://www.wikidata.org/wiki/Q2828883","display_name":"Fine-tuning","level":2,"score":0.42067793},{"id":"https://openalex.org/C68339613","wikidata":"https://www.wikidata.org/wiki/Q1549489","display_name":"Speedup","level":2,"score":0.41992158},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3615008},{"id":"https://openalex.org/C113775141","wikidata":"https://www.wikidata.org/wiki/Q428691","display_name":"Computer engineering","level":1,"score":0.3524882},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.3351449},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.24701726},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.1947923},{"id":"https://openalex.org/C31258907","wikidata":"https://www.wikidata.org/wiki/Q1301371","display_name":"Computer network","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C6557445","wikidata":"https://www.wikidata.org/wiki/Q173113","display_name":"Agronomy","level":1,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C50522688","wikidata":"https://www.wikidata.org/wiki/Q189833","display_name":"Economic growth","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2208.02070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2208.02070","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2208.02070","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4309877123","https://openalex.org/W4302305417","https://openalex.org/W4287802662","https://openalex.org/W3154646238","https://openalex.org/W3037551068","https://openalex.org/W3023594376","https://openalex.org/W3023285645","https://openalex.org/W2899852118","https://openalex.org/W2164382479","https://openalex.org/W2058965144"],"abstract_inverted_index":{"Fine-tuning":[0],"BERT-based":[1],"models":[2,55],"is":[3],"resource-intensive":[4],"in":[5,59],"memory,":[6],"computation,":[7],"and":[8,42,62,81,89,124],"time.":[9],"While":[10],"many":[11],"prior":[12],"works":[13,25],"aim":[14],"to":[15,35,56],"improve":[16],"inference":[17],"efficiency":[18],"via":[19],"compression":[20],"techniques,":[21],"e.g.,":[22],"pruning,":[23],"these":[24],"do":[26],"not":[27],"explicitly":[28],"address":[29],"the":[30,50,68,106],"computational":[31],"challenges":[32],"of":[33,52,71,79],"training":[34,73,83],"downstream":[36],"tasks.":[37],"We":[38],"introduce":[39],"Learner":[40,65],"modules":[41,66],"priming,":[43],"novel":[44],"methods":[45,115],"for":[46],"fine-tuning":[47,76],"that":[48,98],"exploit":[49],"overparameterization":[51],"pre-trained":[53],"language":[54],"gain":[57],"benefits":[58],"convergence":[60,88],"speed":[61],"resource":[63,128],"utilization.":[64,129],"navigate":[67],"double":[69],"bind":[70],"1)":[72],"efficiently":[74],"by":[75,85],"a":[77],"subset":[78],"parameters,":[80],"2)":[82],"effectively":[84],"ensuring":[86],"quick":[87],"high":[90],"metric":[91],"scores.":[92],"Our":[93],"results":[94],"on":[95,101,116],"DistilBERT":[96],"demonstrate":[97],"learners":[99,120],"perform":[100],"par":[102],"with":[103],"or":[104],"surpass":[105],"baselines.":[107],"Learners":[108],"train":[109],"7x":[110],"fewer":[111],"parameters":[112],"than":[113],"state-of-the-art":[114],"GLUE.":[117],"On":[118],"CoLA,":[119],"fine-tune":[121],"20%":[122],"faster,":[123],"have":[125],"significantly":[126],"lower":[127]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4299494398","counts_by_year":[],"updated_date":"2025-01-22T01:02:27.863413","created_date":"2022-10-02"}