{"id":"https://openalex.org/W1995433548","doi":"https://doi.org/10.1145/2588555.2612178","title":"Similarity joins for uncertain strings","display_name":"Similarity joins for uncertain strings","publication_year":2014,"publication_date":"2014-06-18","ids":{"openalex":"https://openalex.org/W1995433548","doi":"https://doi.org/10.1145/2588555.2612178","mag":"1995433548"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1145/2588555.2612178","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5110582646","display_name":"Manish Patil","orcid":null},"institutions":[{"id":"https://openalex.org/I121820613","display_name":"Louisiana State University","ror":"https://ror.org/05ect4e57","country_code":"US","type":"education","lineage":["https://openalex.org/I121820613"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Manish Patil","raw_affiliation_strings":["Louisiana State University, Baton Rouge, LA, USA"],"affiliations":[{"raw_affiliation_string":"Louisiana State University, Baton Rouge, LA, USA","institution_ids":["https://openalex.org/I121820613"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5105778361","display_name":"Rahul Shah","orcid":null},"institutions":[{"id":"https://openalex.org/I121820613","display_name":"Louisiana State University","ror":"https://ror.org/05ect4e57","country_code":"US","type":"education","lineage":["https://openalex.org/I121820613"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Rahul Shah","raw_affiliation_strings":["Louisiana State University, Baton Rouge, LA, USA"],"affiliations":[{"raw_affiliation_string":"Louisiana State University, Baton Rouge, LA, USA","institution_ids":["https://openalex.org/I121820613"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.558,"has_fulltext":false,"cited_by_count":3,"citation_normalized_percentile":{"value":0.436114,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":76,"max":79},"biblio":{"volume":null,"issue":null,"first_page":"1471","last_page":"1482"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11719","display_name":"Data Quality and Management","score":0.9999,"subfield":{"id":"https://openalex.org/subfields/1803","display_name":"Management Science and Operations Research"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9933,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9869,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/edit-distance","display_name":"Edit distance","score":0.7483884},{"id":"https://openalex.org/keywords/similarity","display_name":"Similarity (geometry)","score":0.64283633},{"id":"https://openalex.org/keywords/string-kernel","display_name":"String kernel","score":0.49987125},{"id":"https://openalex.org/keywords/join","display_name":"Join (topology)","score":0.4805582},{"id":"https://openalex.org/keywords/pruning","display_name":"Pruning","score":0.4404126}],"concepts":[{"id":"https://openalex.org/C22820288","wikidata":"https://www.wikidata.org/wiki/Q9050568","display_name":"String metric","level":4,"score":0.79808503},{"id":"https://openalex.org/C157486923","wikidata":"https://www.wikidata.org/wiki/Q1376436","display_name":"String (physics)","level":2,"score":0.7776838},{"id":"https://openalex.org/C44359876","wikidata":"https://www.wikidata.org/wiki/Q5338467","display_name":"Edit distance","level":2,"score":0.7483884},{"id":"https://openalex.org/C32610155","wikidata":"https://www.wikidata.org/wiki/Q1798621","display_name":"Approximate string matching","level":3,"score":0.7377994},{"id":"https://openalex.org/C103278499","wikidata":"https://www.wikidata.org/wiki/Q254465","display_name":"Similarity (geometry)","level":3,"score":0.64283633},{"id":"https://openalex.org/C2778692605","wikidata":"https://www.wikidata.org/wiki/Q4041866","display_name":"Joins","level":2,"score":0.63656783},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.63440394},{"id":"https://openalex.org/C7757238","wikidata":"https://www.wikidata.org/wiki/Q374040","display_name":"String searching algorithm","level":3,"score":0.5713915},{"id":"https://openalex.org/C80444323","wikidata":"https://www.wikidata.org/wiki/Q2878974","display_name":"Theoretical computer science","level":1,"score":0.51975316},{"id":"https://openalex.org/C55851704","wikidata":"https://www.wikidata.org/wiki/Q7623983","display_name":"String kernel","level":5,"score":0.49987125},{"id":"https://openalex.org/C2776124973","wikidata":"https://www.wikidata.org/wiki/Q3183033","display_name":"Join (topology)","level":2,"score":0.4805582},{"id":"https://openalex.org/C2779960059","wikidata":"https://www.wikidata.org/wiki/Q7113681","display_name":"Overhead (engineering)","level":2,"score":0.4699141},{"id":"https://openalex.org/C75165309","wikidata":"https://www.wikidata.org/wiki/Q2258979","display_name":"Search engine indexing","level":2,"score":0.4596815},{"id":"https://openalex.org/C108010975","wikidata":"https://www.wikidata.org/wiki/Q500094","display_name":"Pruning","level":2,"score":0.4404126},{"id":"https://openalex.org/C11413529","wikidata":"https://www.wikidata.org/wiki/Q8366","display_name":"Algorithm","level":1,"score":0.3371104},{"id":"https://openalex.org/C68859911","wikidata":"https://www.wikidata.org/wiki/Q1503724","display_name":"Pattern matching","level":2,"score":0.33222115},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.25817382},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.22002235},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.20932409},{"id":"https://openalex.org/C114614502","wikidata":"https://www.wikidata.org/wiki/Q76592","display_name":"Combinatorics","level":1,"score":0.19009203},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.10634509},{"id":"https://openalex.org/C122280245","wikidata":"https://www.wikidata.org/wiki/Q620622","display_name":"Kernel method","level":3,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C160446489","wikidata":"https://www.wikidata.org/wiki/Q7226642","display_name":"Polynomial kernel","level":4,"score":0.0},{"id":"https://openalex.org/C12267149","wikidata":"https://www.wikidata.org/wiki/Q282453","display_name":"Support vector machine","level":2,"score":0.0},{"id":"https://openalex.org/C6557445","wikidata":"https://www.wikidata.org/wiki/Q173113","display_name":"Agronomy","level":1,"score":0.0},{"id":"https://openalex.org/C37914503","wikidata":"https://www.wikidata.org/wiki/Q156495","display_name":"Mathematical physics","level":1,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1145/2588555.2612178","pdf_url":null,"source":null,"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[],"grants":[{"funder":"https://openalex.org/F4320337387","funder_display_name":"Division of Computing and Communication Foundations","award_id":"CCF-1218904"},{"funder":"https://openalex.org/F4320337387","funder_display_name":"Division of Computing and Communication Foundations","award_id":"CCF-1017623"}],"datasets":[],"versions":[],"referenced_works_count":20,"referenced_works":["https://openalex.org/W1486776102","https://openalex.org/W1574479362","https://openalex.org/W1754752210","https://openalex.org/W1855864163","https://openalex.org/W1973001156","https://openalex.org/W2007682403","https://openalex.org/W2036435651","https://openalex.org/W2054693333","https://openalex.org/W2096598900","https://openalex.org/W2100355846","https://openalex.org/W2103014446","https://openalex.org/W2121516976","https://openalex.org/W2144562386","https://openalex.org/W2150916025","https://openalex.org/W2158779275","https://openalex.org/W2161936973","https://openalex.org/W2167331726","https://openalex.org/W2188600765","https://openalex.org/W3162942780","https://openalex.org/W58164167"],"related_works":["https://openalex.org/W52396946","https://openalex.org/W2950268498","https://openalex.org/W2405436873","https://openalex.org/W2399644331","https://openalex.org/W2187092961","https://openalex.org/W2102443632","https://openalex.org/W2074064717","https://openalex.org/W1995303482","https://openalex.org/W1824878879","https://openalex.org/W1505906253"],"abstract_inverted_index":{"A":[0],"string":[1,7,12,95,145,160,166,240,266],"similarity":[2,60,155],"join":[3,61,132],"finds":[4],"all":[5,136],"similar":[6,142],"pairs":[8,96],"between":[9,98,114],"two":[10,80],"input":[11,88],"collections.":[13],"It":[14],"is":[15,92,122,184],"an":[16,223],"essential":[17],"operation":[18],"in":[19,50,138,147],"many":[20,37,175,260],"applications,":[21],"such":[22,100,154],"as":[23,74,167],"data":[24],"integration":[25],"and":[26,28,87,116,200,226],"cleaning,":[27],"has":[29],"been":[30],"extensively":[31],"studied":[32],"for":[33,58,152,212],"deterministic":[34,165],"strings.":[35,216],"Increasingly,":[36],"applications":[38],"have":[39],"to":[40,93,143,180,253],"deal":[41],"with":[42,47],"imprecise":[43],"strings":[44,46,65,84,137],"or":[45],"fuzzy":[48],"information":[49],"them.":[51],"This":[52],"work":[53],"presents":[54],"the":[55,71,75,108,111,131,215,229,235,255,264,271,274],"first":[56],"solution":[57],"answering":[59,153],"queries":[62,157],"over":[63],"uncertain":[64,83,159],"that":[66,101,140,197,242],"implements":[67],"possible-world":[68],"semantics,":[69],"using":[70,246],"edit":[72,112],"distance":[73,113],"measure":[76],"of":[77,82,110,178,214,238,258,263,273],"similarity.":[78],"Given":[79],"collections":[81,99],"R,":[85],"S,":[86],"(k,\u03c4),":[89],"our":[90],"task":[91],"find":[94],"(R,S)":[97],"$Pr(ed(R,S)":[102],"\u2264":[103,206],"k)":[104,207],">":[105],"\u03c4":[106],"i.e.,":[107],"probability":[109,125],"R":[115,146],"S":[117,139],"being":[118],"at":[119],"most":[120],"k":[121],"more":[123],"than":[124],"threshold":[126],"\u03c4.":[127],"We":[128,192,217],"can":[129],"address":[130],"problem":[133],"by":[134,245,277],"obtaining":[135],"are":[141],"each":[144],"R.":[148],"However,":[149],"existing":[150],"solutions":[151,171],"search":[156],"on":[158,204],"databases":[161],"only":[162,186],"support":[163],"a":[164,239,247],"input.":[168],"Exploiting":[169],"these":[170,220],"would":[172],"require":[173],"exponentially":[174,259],"possible":[176,210,261],"worlds":[177,211],"$R$":[179],"be":[181],"considered,":[182],"which":[183,250],"not":[185],"ineffective":[187],"but":[188],"also":[189],"prohibitively":[190],"expensive.":[191],"propose":[193],"various":[194],"filtering":[195,230],"techniques":[196,221],"give":[198],"upper":[199],"(or)":[201],"lower":[202],"bound":[203],"Pr(ed(R,S)":[205],"without":[208],"instantiating":[209],"either":[213],"then":[218],"incorporate":[219],"into":[222],"indexing":[224],"scheme":[225],"significantly":[227],"reduce":[228],"overhead.":[231],"Further,":[232],"we":[233,269],"alleviate":[234],"verification":[236,256],"cost":[237,257],"pair":[241],"survives":[243],"pruning":[244],"trie":[248],"structure":[249],"allows":[251],"us":[252],"overlap":[254],"instances":[262],"candidate":[265],"pair.":[267],"Finally,":[268],"evaluate":[270],"effectiveness":[272],"proposed":[275],"approach":[276],"thorough":[278],"practical":[279],"experimentation.":[280]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W1995433548","counts_by_year":[{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1}],"updated_date":"2025-01-15T06:08:29.806270","created_date":"2016-06-24"}