{"id":"https://openalex.org/W4397028110","doi":"https://doi.org/10.48550/arxiv.2405.09597","title":"When AI Eats Itself: On the Caveats of Data Pollution in the Era of\n Generative AI","display_name":"When AI Eats Itself: On the Caveats of Data Pollution in the Era of\n Generative AI","publication_year":2024,"publication_date":"2024-05-15","ids":{"openalex":"https://openalex.org/W4397028110","doi":"https://doi.org/10.48550/arxiv.2405.09597"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.09597","pdf_url":"http://arxiv.org/pdf/2405.09597","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2405.09597","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5047946782","display_name":"Xiaodan Xing","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xing, Xiaodan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5114162229","display_name":"Fadong Shi","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Fadong","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5069163726","display_name":"Jiahao Huang","orcid":"https://orcid.org/0000-0001-9326-5320"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Jiahao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5039464012","display_name":"Yinzhe Wu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Yinzhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100645654","display_name":"Nan Yang","orcid":"https://orcid.org/0000-0002-7373-7906"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Nan, Yang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100394044","display_name":"Sheng Zhang","orcid":"https://orcid.org/0000-0003-1034-6622"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Sheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5085166013","display_name":"Yingying Fang","orcid":"https://orcid.org/0000-0001-6334-8635"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fang, Yingying","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102392400","display_name":"Mike Roberts","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Roberts, Mike","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5033880300","display_name":"Carola\u2010Bibiane Sch\u00f6nlieb","orcid":"https://orcid.org/0000-0003-0099-6306"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sch\u00f6nlieb, Carola-Bibiane","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5017326471","display_name":"Javier Del Ser","orcid":"https://orcid.org/0000-0002-1260-9775"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Del Ser, Javier","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5074053036","display_name":"Guang Yang","orcid":"https://orcid.org/0000-0001-6845-5317"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Guang","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":84},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.6196,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T10883","display_name":"Ethics and Social Impacts of AI","score":0.6196,"subfield":{"id":"https://openalex.org/subfields/3311","display_name":"Safety Research"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C39890363","wikidata":"https://www.wikidata.org/wiki/Q36108","display_name":"Generative grammar","level":2,"score":0.67879224},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.41996294},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.27356339}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.09597","pdf_url":"http://arxiv.org/pdf/2405.09597","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2405.09597","pdf_url":"http://arxiv.org/pdf/2405.09597","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4390718435","https://openalex.org/W4390549206","https://openalex.org/W4383031710","https://openalex.org/W4379540039","https://openalex.org/W4237784285","https://openalex.org/W3211753092","https://openalex.org/W3137171911","https://openalex.org/W2386000789","https://openalex.org/W2380075625","https://openalex.org/W2374712251"],"abstract_inverted_index":{"Generative":[0],"artificial":[1],"intelligence":[2],"(AI)":[3],"technologies":[4,255],"and":[5,20,32,85,93,139,214,217,244],"large":[6,31,260],"models":[7,26,47],"are":[8],"producing":[9],"realistic":[10],"outputs":[11],"across":[12],"various":[13],"domains,":[14],"such":[15],"as":[16,49],"images,":[17],"text,":[18],"speech,":[19],"music.":[21],"Creating":[22],"these":[23,222],"advanced":[24],"generative":[25,123,146,181,209,253],"requires":[27],"significant":[28,167],"resources,":[29],"particularly":[30,183],"high-quality":[33],"datasets.":[34],"To":[35,192],"minimize":[36],"training":[37,52,208],"expenses,":[38],"many":[39],"algorithm":[40],"developers":[41],"use":[42,69,179,243],"data":[43,58,74,87,98,178,205],"created":[44],"by":[45],"the":[46,68,79,102,160,170,174,187,200,249,257],"themselves":[48],"a":[50,64,120,166,229,238],"cost-effective":[51],"solution.":[53],"However,":[54],"not":[55],"all":[56],"synthetic":[57,73,86,97,115,177,204,233],"effectively":[59],"improve":[60],"model":[61,137],"performance,":[62],"necessitating":[63],"strategic":[65],"balance":[66],"in":[67,169,180,184,256],"of":[70,83,96,104,176,186,189,202,232,252,259],"real":[71,84],"versus":[72],"to":[75,101,158,220,227,241],"optimize":[76],"outcomes.":[77],"Currently,":[78],"previously":[80],"well-controlled":[81],"integration":[82],"is":[88,165,226],"becoming":[89],"uncontrollable.":[90],"The":[91,224],"widespread":[92],"unregulated":[94],"dissemination":[95],"online":[99],"leads":[100],"contamination":[103],"datasets":[105],"traditionally":[106],"compiled":[107],"through":[108],"web":[109],"scraping,":[110],"now":[111],"mixed":[112],"with":[113],"unlabeled":[114],"data.":[116],"This":[117],"trend":[118],"portends":[119],"future":[121],"where":[122],"AI":[124,147,210,254],"systems":[125],"may":[126],"increasingly":[127],"rely":[128],"blindly":[129,206],"on":[130,207,211],"consuming":[131],"self-generated":[132],"data,":[133],"raising":[134],"concerns":[135],"about":[136],"performance":[138],"ethical":[140],"issues.":[141],"What":[142,153],"will":[143],"happen":[144],"if":[145],"continuously":[148],"consumes":[149],"itself":[150],"without":[151],"discernment?":[152],"measures":[154],"can":[155],"we":[156],"take":[157],"mitigate":[159,221],"potential":[161],"adverse":[162],"effects?":[163],"There":[164],"gap":[168],"scientific":[171],"literature":[172],"regarding":[173],"impact":[175],"AI,":[182],"terms":[185],"fusion":[188],"multimodal":[190],"information.":[191],"address":[193],"this":[194,197],"research":[195],"gap,":[196],"review":[198],"investigates":[199],"consequences":[201],"integrating":[203],"both":[212],"image":[213],"text":[215],"modalities":[216],"explores":[218],"strategies":[219],"effects.":[223],"goal":[225],"offer":[228],"comprehensive":[230],"view":[231],"data's":[234],"role,":[235],"advocating":[236],"for":[237],"balanced":[239],"approach":[240],"its":[242],"exploring":[245],"practices":[246],"that":[247],"promote":[248],"sustainable":[250],"development":[251],"era":[258],"models.":[261]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4397028110","counts_by_year":[],"updated_date":"2025-01-01T19:33:47.036637","created_date":"2024-05-18"}