{"id":"https://openalex.org/W4396718180","doi":"https://doi.org/10.48550/arxiv.2405.02774","title":"Get more for less: Principled Data Selection for Warming Up Fine-Tuning\n in LLMs","display_name":"Get more for less: Principled Data Selection for Warming Up Fine-Tuning\n in LLMs","publication_year":2024,"publication_date":"2024-05-04","ids":{"openalex":"https://openalex.org/W4396718180","doi":"https://doi.org/10.48550/arxiv.2405.02774"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.02774","pdf_url":"https://arxiv.org/pdf/2405.02774","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2405.02774","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5085966366","display_name":"Feiyang Kang","orcid":"https://orcid.org/0009-0001-8390-6662"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Kang, Feiyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5082453482","display_name":"Hoang Anh Just","orcid":"https://orcid.org/0009-0002-6094-2473"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Just, Hoang Anh","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5061976148","display_name":"Yifan Sun","orcid":"https://orcid.org/0000-0002-4235-331X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sun, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5096885936","display_name":"Himanshu Jahagirdar","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jahagirdar, Himanshu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101948008","display_name":"Yuanzhi Zhang","orcid":"https://orcid.org/0000-0002-8591-675X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Yuanzhi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109713676","display_name":"Rongxing Du","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Du, Rongxing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019965945","display_name":"Anit Kumar Sahu","orcid":"https://orcid.org/0000-0002-4083-0418"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sahu, Anit Kumar","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5032275274","display_name":"Ruoxi Jia","orcid":"https://orcid.org/0000-0001-9662-9556"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Jia, Ruoxi","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.4841,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.4841,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12535","display_name":"Machine Learning and Data Classification","score":0.4324,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12282","display_name":"Mineral Processing and Grinding","score":0.4201,"subfield":{"id":"https://openalex.org/subfields/2210","display_name":"Mechanical Engineering"},"field":{"id":"https://openalex.org/fields/22","display_name":"Engineering"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.63386077},{"id":"https://openalex.org/C115343472","wikidata":"https://www.wikidata.org/wiki/Q7942","display_name":"Global warming","level":3,"score":0.46476942},{"id":"https://openalex.org/C132651083","wikidata":"https://www.wikidata.org/wiki/Q7942","display_name":"Climate change","level":2,"score":0.38606724},{"id":"https://openalex.org/C17744445","wikidata":"https://www.wikidata.org/wiki/Q36442","display_name":"Political science","level":0,"score":0.35539967},{"id":"https://openalex.org/C175605778","wikidata":"https://www.wikidata.org/wiki/Q3299701","display_name":"Natural resource economics","level":1,"score":0.34622866},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.32798123},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.32138473},{"id":"https://openalex.org/C15744967","wikidata":"https://www.wikidata.org/wiki/Q9418","display_name":"Psychology","level":0,"score":0.32093},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.18174276},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.16048104},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.15193516}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.02774","pdf_url":"https://arxiv.org/pdf/2405.02774","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2405.02774","pdf_url":"https://arxiv.org/pdf/2405.02774","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W4205762803","https://openalex.org/W4200393486","https://openalex.org/W3194840763","https://openalex.org/W3124384296","https://openalex.org/W3122646731","https://openalex.org/W2790595990","https://openalex.org/W2118699598","https://openalex.org/W2033889603","https://openalex.org/W1540025019"],"abstract_inverted_index":{"This":[0],"work":[1],"focuses":[2],"on":[3,98],"leveraging":[4],"and":[5],"selecting":[6],"from":[7,84,103],"vast,":[8],"unlabeled,":[9],"open":[10],"data":[11,28,39,61,67,112],"to":[12,21,59,110,119,155,176,217],"pre-fine-tune":[13],"a":[14,82,99,144,181],"pre-trained":[15,97],"language":[16,60],"model.":[17],"The":[18],"goal":[19],"is":[20,109,169,187],"minimize":[22],"the":[23,71,92,115,120,125,138,219],"need":[24],"for":[25,29,45,51,130,198,221],"costly":[26],"domain-specific":[27],"subsequent":[30],"fine-tuning":[31,131,194],"while":[32],"achieving":[33],"desired":[34],"performance":[35,200],"levels.":[36],"While":[37,74,193],"many":[38],"selection":[40,163],"algorithms":[41],"have":[42],"been":[43,96],"designed":[44],"small-scale":[46],"applications,":[47],"rendering":[48],"them":[49],"unsuitable":[50],"our":[52,106,141,166],"context,":[53],"some":[54],"emerging":[55],"methods":[56],"do":[57],"cater":[58],"scales.":[62],"However,":[63],"they":[64],"often":[65,207],"prioritize":[66],"that":[68,113,158],"aligns":[69],"with":[70,152,212],"target":[72,121],"distribution.":[73,101,122],"this":[75,128,213],"strategy":[76],"may":[77],"be":[78],"effective":[79],"when":[80,91],"training":[81],"model":[83,93],"scratch,":[85],"it":[86,159],"can":[87],"yield":[88],"limited":[89],"results":[90],"has":[94],"already":[95],"different":[100],"Differing":[102],"prior":[104],"work,":[105,214],"key":[107],"idea":[108],"select":[111],"nudges":[114],"pre-training":[116],"distribution":[117],"closer":[118],"We":[123,136],"show":[124],"optimality":[126],"of":[127,140,147,178],"approach":[129],"tasks":[132,148],"under":[133],"certain":[134],"conditions.":[135],"demonstrate":[137],"efficacy":[139],"methodology":[142],"across":[143,201],"diverse":[145,202],"array":[146],"(NLU,":[149],"NLG,":[150],"zero-shot)":[151],"models":[153],"up":[154],"2.7B,":[156],"showing":[157],"consistently":[160],"surpasses":[161],"other":[162],"methods.":[164],"Moreover,":[165],"proposed":[167],"method":[168],"significantly":[170],"faster":[171],"than":[172],"existing":[173],"techniques,":[174],"scaling":[175],"millions":[177],"samples":[179],"within":[180],"single":[182],"GPU":[183],"hour.":[184],"Our":[185],"code":[186],"open-sourced":[188],"(Code":[189],"repository:":[190],"https://anonymous.4open.science/r/DV4LLM-D761/":[191],").":[192],"offers":[195],"significant":[196],"potential":[197],"enhancing":[199],"tasks,":[203],"its":[204,209,225],"associated":[205],"costs":[206],"limit":[208],"widespread":[210],"adoption;":[211],"we":[215],"hope":[216],"lay":[218],"groundwork":[220],"cost-effective":[222],"fine-tuning,":[223],"making":[224],"benefits":[226],"more":[227],"accessible.":[228]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4396718180","counts_by_year":[],"updated_date":"2025-04-22T20:08:26.030426","created_date":"2024-05-08"}