{"id":"https://openalex.org/W4399911888","doi":"https://doi.org/10.48550/arxiv.2406.14473","title":"Data-Centric AI in the Age of Large Language Models","display_name":"Data-Centric AI in the Age of Large Language Models","publication_year":2024,"publication_date":"2024-06-20","ids":{"openalex":"https://openalex.org/W4399911888","doi":"https://doi.org/10.48550/arxiv.2406.14473"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.14473","pdf_url":"https://arxiv.org/pdf/2406.14473","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2406.14473","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101059157","display_name":"Xinyi Xu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xu, Xinyi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101545321","display_name":"Zhaoxuan Wu","orcid":"https://orcid.org/0009-0002-5659-6387"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Zhaoxuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5028955329","display_name":"Rui Qiao","orcid":"https://orcid.org/0000-0002-6719-4490"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Qiao, Rui","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101384234","display_name":"Arun Verma","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Verma, Arun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113007586","display_name":"Yao Shu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shu, Yao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102693032","display_name":"Jingtan Wang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Jingtan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5030809586","display_name":"Xinyuan Niu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Niu, Xinyuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101384235","display_name":"Zhenfeng He","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"He, Zhenfeng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5113261128","display_name":"Jiangwei Chen","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chen, Jiangwei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102944236","display_name":"Zijian Zhou","orcid":"https://orcid.org/0000-0003-3315-3962"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhou, Zijian","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5093052666","display_name":"Gregory Kang Ruey Lau","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lau, Gregory Kang Ruey","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041179168","display_name":"Hieu Dao","orcid":"https://orcid.org/0000-0003-2940-807X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dao, Hieu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5099380857","display_name":"Lucas Agussurja","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Agussurja, Lucas","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085515319","display_name":"Rachael Hwee Ling Sim","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sim, Rachael Hwee Ling","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101211925","display_name":"Xiaoqiang Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Xiaoqiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5111236271","display_name":"Wenyang Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Wenyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5064034052","display_name":"Zhongxiang Dai","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Dai, Zhongxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079904764","display_name":"Pang Wei Koh","orcid":"https://orcid.org/0000-0003-4330-6969"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Koh, Pang Wei","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5030304400","display_name":"Bryan Kian Hsiang Low","orcid":"https://orcid.org/0000-0003-2808-451X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Low, Bryan Kian Hsiang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":78},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6639,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.6639,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.6488,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.4368177},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.4022761}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.14473","pdf_url":"https://arxiv.org/pdf/2406.14473","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2406.14473","pdf_url":"https://arxiv.org/pdf/2406.14473","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4396701345","https://openalex.org/W4396696052","https://openalex.org/W4395014643","https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2382290278","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2001405890"],"abstract_inverted_index":{"This":[0],"position":[1],"paper":[2],"proposes":[3],"a":[4,102,109],"data-centric":[5,62,112],"viewpoint":[6],"of":[7,40,81,111,120],"AI":[8,149],"research,":[9],"focusing":[10],"on":[11,92],"large":[12],"language":[13],"models":[14],"(LLMs).":[15],"We":[16,53],"start":[17],"by":[18],"making":[19],"the":[20,28,50,79,89,93,99,116],"key":[21],"observation":[22],"that":[23],"data":[24,65,67,121,132],"is":[25],"instrumental":[26],"in":[27,148],"developmental":[29],"(e.g.,":[30,37],"pretraining":[31],"and":[32,34,42,64,71,87,118,135,139,146,150],"fine-tuning)":[33],"inferential":[35],"stages":[36],"in-context":[38],"learning)":[39],"LLMs,":[41],"yet":[43],"it":[44],"receives":[45],"disproportionally":[46],"low":[47],"attention":[48],"from":[49],"research":[51,85,94,137],"community.":[52],"identify":[54],"four":[55],"specific":[56],"scenarios":[57],"centered":[58],"around":[59],"data,":[60,82],"covering":[61],"benchmarks":[63,113,125],"curation,":[66],"attribution,":[68],"knowledge":[69],"transfer,":[70],"inference":[72],"contextualization.":[73],"In":[74],"each":[75],"scenario,":[76],"we":[77,106],"underscore":[78],"importance":[80],"highlight":[83],"promising":[84],"directions,":[86],"articulate":[88],"potential":[90],"impacts":[91],"community":[95],"and,":[96],"where":[97],"applicable,":[98],"society":[100],"as":[101],"whole.":[103],"For":[104],"instance,":[105],"advocate":[107],"for":[108,122],"suite":[110],"tailored":[114],"to":[115,129],"scale":[117],"complexity":[119],"LLMs.":[123],"These":[124],"can":[126,142],"be":[127],"used":[128],"develop":[130],"new":[131],"curation":[133],"methods":[134],"document":[136],"efforts":[138],"results,":[140],"which":[141],"help":[143],"promote":[144],"openness":[145],"transparency":[147],"LLM":[151],"research.":[152]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4399911888","counts_by_year":[],"updated_date":"2025-04-05T05:40:04.941535","created_date":"2024-06-22"}