{"id":"https://openalex.org/W2056085726","doi":"https://doi.org/10.1109/soli.2012.6273510","title":"Managing data quality by identifying the noisiest data samples","display_name":"Managing data quality by identifying the noisiest data samples","publication_year":2012,"publication_date":"2012-07-01","ids":{"openalex":"https://openalex.org/W2056085726","doi":"https://doi.org/10.1109/soli.2012.6273510","mag":"2056085726"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/soli.2012.6273510","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5058758292","display_name":"Kalika Prasad","orcid":"https://orcid.org/0000-0001-8666-856X"},"institutions":[{"id":"https://openalex.org/I4210103279","display_name":"IBM Research - India","ror":"https://ror.org/014wt7r80","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210103279","https://openalex.org/I4210114115"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"K. Hima Prasad","raw_affiliation_strings":["IBM Research-India, New Delhi, India"],"affiliations":[{"raw_affiliation_string":"IBM Research-India, New Delhi, India","institution_ids":["https://openalex.org/I4210103279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5041254552","display_name":"Snigdha Chaturvedi","orcid":null},"institutions":[{"id":"https://openalex.org/I4210103279","display_name":"IBM Research - India","ror":"https://ror.org/014wt7r80","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210103279","https://openalex.org/I4210114115"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Snigdha Chaturvedi","raw_affiliation_strings":["IBM Research-India, New Delhi, India"],"affiliations":[{"raw_affiliation_string":"IBM Research-India, New Delhi, India","institution_ids":["https://openalex.org/I4210103279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5004954473","display_name":"Tanveer A. Faruquie","orcid":"https://orcid.org/0009-0008-9474-7928"},"institutions":[{"id":"https://openalex.org/I4210103279","display_name":"IBM Research - India","ror":"https://ror.org/014wt7r80","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210103279","https://openalex.org/I4210114115"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Tanveer A. Faruquie","raw_affiliation_strings":["IBM Research-India, New Delhi, India"],"affiliations":[{"raw_affiliation_string":"IBM Research-India, New Delhi, India","institution_ids":["https://openalex.org/I4210103279"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5110629023","display_name":"L. Venkata Subramaniam","orcid":null},"institutions":[{"id":"https://openalex.org/I4210103279","display_name":"IBM Research - India","ror":"https://ror.org/014wt7r80","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210103279","https://openalex.org/I4210114115"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"L. Venkata Subramaniam","raw_affiliation_strings":["IBM Research-India, New Delhi, India"],"affiliations":[{"raw_affiliation_string":"IBM Research-India, New Delhi, India","institution_ids":["https://openalex.org/I4210103279"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5047987914","display_name":"Mukesh Mohania","orcid":"https://orcid.org/0000-0003-4429-1412"},"institutions":[{"id":"https://openalex.org/I4210103279","display_name":"IBM Research - India","ror":"https://ror.org/014wt7r80","country_code":"IN","type":"facility","lineage":["https://openalex.org/I1341412227","https://openalex.org/I4210103279","https://openalex.org/I4210114115"]}],"countries":["IN"],"is_corresponding":false,"raw_author_name":"Mukesh K. Mohania","raw_affiliation_strings":["IBM Research-India, New Delhi, India"],"affiliations":[{"raw_affiliation_string":"IBM Research-India, New Delhi, India","institution_ids":["https://openalex.org/I4210103279"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"fulltext_origin":"ngrams","cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":64},"biblio":{"volume":null,"issue":null,"first_page":"90","last_page":"95"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T10538","display_name":"Data Mining Algorithms and Applications","score":0.9985,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11652","display_name":"Imbalanced Data Classification Techniques","score":0.9942,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/data-cleansing","display_name":"Data cleansing","score":0.85506827},{"id":"https://openalex.org/keywords/data-transformation","display_name":"Data transformation","score":0.556417},{"id":"https://openalex.org/keywords/noisy-data","display_name":"Noisy data","score":0.47183898}],"concepts":[{"id":"https://openalex.org/C42199009","wikidata":"https://www.wikidata.org/wiki/Q1172378","display_name":"Data cleansing","level":4,"score":0.85506827},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.84215784},{"id":"https://openalex.org/C2780451532","wikidata":"https://www.wikidata.org/wiki/Q759676","display_name":"Task (project management)","level":2,"score":0.6508447},{"id":"https://openalex.org/C124101348","wikidata":"https://www.wikidata.org/wiki/Q172491","display_name":"Data mining","level":1,"score":0.6424294},{"id":"https://openalex.org/C24756922","wikidata":"https://www.wikidata.org/wiki/Q1757694","display_name":"Data quality","level":3,"score":0.61262023},{"id":"https://openalex.org/C98045186","wikidata":"https://www.wikidata.org/wiki/Q205663","display_name":"Process (computing)","level":2,"score":0.5885434},{"id":"https://openalex.org/C2779530757","wikidata":"https://www.wikidata.org/wiki/Q1207505","display_name":"Quality (philosophy)","level":2,"score":0.5606664},{"id":"https://openalex.org/C150670458","wikidata":"https://www.wikidata.org/wiki/Q4272815","display_name":"Data transformation","level":3,"score":0.556417},{"id":"https://openalex.org/C2781170535","wikidata":"https://www.wikidata.org/wiki/Q30587856","display_name":"Noisy data","level":2,"score":0.47183898},{"id":"https://openalex.org/C204241405","wikidata":"https://www.wikidata.org/wiki/Q461499","display_name":"Transformation (genetics)","level":3,"score":0.4567872},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3193216},{"id":"https://openalex.org/C135572916","wikidata":"https://www.wikidata.org/wiki/Q193351","display_name":"Data warehouse","level":2,"score":0.14115351},{"id":"https://openalex.org/C176217482","wikidata":"https://www.wikidata.org/wiki/Q860554","display_name":"Metric (unit)","level":2,"score":0.0948841},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C55493867","wikidata":"https://www.wikidata.org/wiki/Q7094","display_name":"Biochemistry","level":1,"score":0.0},{"id":"https://openalex.org/C185592680","wikidata":"https://www.wikidata.org/wiki/Q2329","display_name":"Chemistry","level":0,"score":0.0},{"id":"https://openalex.org/C187736073","wikidata":"https://www.wikidata.org/wiki/Q2920921","display_name":"Management","level":1,"score":0.0},{"id":"https://openalex.org/C111472728","wikidata":"https://www.wikidata.org/wiki/Q9471","display_name":"Epistemology","level":1,"score":0.0},{"id":"https://openalex.org/C104317684","wikidata":"https://www.wikidata.org/wiki/Q7187","display_name":"Gene","level":2,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/soli.2012.6273510","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/1","score":0.61,"display_name":"No poverty"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":6,"referenced_works":["https://openalex.org/W1570448133","https://openalex.org/W1964357740","https://openalex.org/W1982723649","https://openalex.org/W1996956037","https://openalex.org/W2073251771","https://openalex.org/W2158188757"],"related_works":["https://openalex.org/W962911587","https://openalex.org/W4255072332","https://openalex.org/W4200551113","https://openalex.org/W3202554610","https://openalex.org/W3126834064","https://openalex.org/W3110671107","https://openalex.org/W3021414116","https://openalex.org/W2353586736","https://openalex.org/W2270762093","https://openalex.org/W2028861106"],"abstract_inverted_index":{"Enterprise":[0],"datasets":[1],"are":[2],"often":[3],"noisy.":[4],"Several":[5],"columns":[6],"can":[7,17],"have":[8],"non-standard,":[9],"erroneous":[10],"or":[11],"missing":[12],"information.":[13],"Poor":[14],"quality":[15,51,105],"data":[16,32,38,50,56,104],"lead":[18],"to":[19,33,59,76,101],"incorrect":[20],"reporting":[21],"and":[22,53],"wrong":[23],"conclusions":[24],"being":[25],"drawn.":[26],"Data":[27],"cleansing":[28,39],"involves":[29,47],"standardizing":[30],"such":[31],"improve":[34],"its":[35],"quality.":[36],"Often":[37],"tasks":[40],"involve":[41],"writing":[42,55],"rules":[43,58],"manually.":[44],"The":[45,117],"step":[46,89,95],"understanding":[48],"the":[49,93,103,120],"issues":[52],"then":[54],"transformation":[57],"correct":[60],"these":[61],"issues.":[62],"This":[63,85],"is":[64,86,123],"a":[65,74,87,114,126],"human":[66],"intensive":[67],"task.":[68],"In":[69],"this":[70],"study":[71],"we":[72,96],"propose":[73],"method":[75,122],"identify":[77],"noisy":[78],"subsets":[79],"of":[80,119],"huge":[81],"unlabelled":[82,108],"textual":[83],"datasets.":[84],"two":[88],"process":[90],"where":[91],"in":[92],"first":[94],"develop":[97],"an":[98,107],"estimation":[99],"tool":[100],"predict":[102],"on":[106,125],"text":[109],"dataset":[110],"as":[111],"produced":[112],"by":[113],"segmentation":[115],"model.":[116],"accuracy":[118],"proposed":[121],"shown":[124],"real":[127],"life":[128],"dataset.":[129]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2056085726","counts_by_year":[],"updated_date":"2024-12-12T23:33:45.160554","created_date":"2016-06-24"}