{"id":"https://openalex.org/W2125121047","doi":"https://doi.org/10.1145/1344411.1344412","title":"Classification-aware hidden-web text database selection","display_name":"Classification-aware hidden-web text database selection","publication_year":2008,"publication_date":"2008-03-01","ids":{"openalex":"https://openalex.org/W2125121047","doi":"https://doi.org/10.1145/1344411.1344412","mag":"2125121047"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1145/1344411.1344412","pdf_url":null,"source":{"id":"https://openalex.org/S87067389","display_name":"ACM transactions on office information systems","issn_l":"0734-2047","issn":["0734-2047","1558-1152"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://archive.nyu.edu/bitstream/2451/27824/4/CeDER-PP-2008-07.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5010731709","display_name":"Panagiotis G. Ipeirotis","orcid":"https://orcid.org/0000-0002-2966-7402"},"institutions":[{"id":"https://openalex.org/I57206974","display_name":"New York University","ror":"https://ror.org/0190ak572","country_code":"US","type":"education","lineage":["https://openalex.org/I57206974"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Panagiotis G. Ipeirotis","raw_affiliation_strings":["New York University, New York, NY"],"affiliations":[{"raw_affiliation_string":"New York University, New York, NY","institution_ids":["https://openalex.org/I57206974"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5080063580","display_name":"Luis Gravano","orcid":null},"institutions":[{"id":"https://openalex.org/I78577930","display_name":"Columbia University","ror":"https://ror.org/00hj8s172","country_code":"US","type":"education","lineage":["https://openalex.org/I78577930"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Luis Gravano","raw_affiliation_strings":["Columbia University, New York, NY;"],"affiliations":[{"raw_affiliation_string":"Columbia University, New York, NY;","institution_ids":["https://openalex.org/I78577930"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":8.352,"has_fulltext":false,"cited_by_count":17,"citation_normalized_percentile":{"value":0.951667,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":87,"max":88},"biblio":{"volume":"26","issue":"2","first_page":"1","last_page":"66"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9997,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12016","display_name":"Web Data Mining and Analysis","score":0.9997,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10317","display_name":"Advanced Database Systems and Queries","score":0.9972,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11106","display_name":"Data Management and Algorithms","score":0.9971,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/zipfs-law","display_name":"Zipf's law","score":0.51687574}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8541876},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.7061358},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6045439},{"id":"https://openalex.org/C81917197","wikidata":"https://www.wikidata.org/wiki/Q628760","display_name":"Selection (genetic algorithm)","level":2,"score":0.59203005},{"id":"https://openalex.org/C125932096","wikidata":"https://www.wikidata.org/wiki/Q205472","display_name":"Zipf's law","level":2,"score":0.51687574},{"id":"https://openalex.org/C2780801425","wikidata":"https://www.wikidata.org/wiki/Q5164392","display_name":"Construct (python library)","level":2,"score":0.44419673},{"id":"https://openalex.org/C54239708","wikidata":"https://www.wikidata.org/wiki/Q1329910","display_name":"View","level":3,"score":0.4253698},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.41588682},{"id":"https://openalex.org/C148840519","wikidata":"https://www.wikidata.org/wiki/Q1049878","display_name":"Database design","level":2,"score":0.35987127},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.18497634},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.0},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1145/1344411.1344412","pdf_url":null,"source":{"id":"https://openalex.org/S87067389","display_name":"ACM transactions on office information systems","issn_l":"0734-2047","issn":["0734-2047","1558-1152"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310319798","host_organization_name":"Association for Computing Machinery","host_organization_lineage":["https://openalex.org/P4310319798"],"host_organization_lineage_names":["Association for Computing Machinery"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://archive.nyu.edu/handle/2451/27824","pdf_url":"http://archive.nyu.edu/bitstream/2451/27824/4/CeDER-PP-2008-07.pdf","source":{"id":"https://openalex.org/S4306401258","display_name":"The Faculty Digital Archive (New York University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I57206974","host_organization_name":"New York University","host_organization_lineage":["https://openalex.org/I57206974"],"host_organization_lineage_names":["New York University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://archive.nyu.edu/handle/2451/14759","pdf_url":"http://archive.nyu.edu/bitstream/2451/14759/4/CeDER-06-04.pdf","source":{"id":"https://openalex.org/S4306401258","display_name":"The Faculty Digital Archive (New York University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I57206974","host_organization_name":"New York University","host_organization_lineage":["https://openalex.org/I57206974"],"host_organization_lineage_names":["New York University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://archive.nyu.edu/handle/2451/27824","pdf_url":"http://archive.nyu.edu/bitstream/2451/27824/4/CeDER-PP-2008-07.pdf","source":{"id":"https://openalex.org/S4306401258","display_name":"The Faculty Digital Archive (New York University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I57206974","host_organization_name":"New York University","host_organization_lineage":["https://openalex.org/I57206974"],"host_organization_lineage_names":["New York University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.5,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, justice, and strong institutions"}],"grants":[{"funder":"https://openalex.org/F4320337389","funder_display_name":"Division of Information and Intelligent Systems","award_id":"IIS-97-33880IIS-98-17434IIS-0643846"}],"datasets":[],"versions":[],"referenced_works_count":91,"referenced_works":["https://openalex.org/W1480376833","https://openalex.org/W1489384387","https://openalex.org/W1490760466","https://openalex.org/W1500117362","https://openalex.org/W1504694836","https://openalex.org/W1506281249","https://openalex.org/W1508165687","https://openalex.org/W1517178556","https://openalex.org/W1523949738","https://openalex.org/W1529740232","https://openalex.org/W1539477445","https://openalex.org/W1541635151","https://openalex.org/W1585303489","https://openalex.org/W1587932711","https://openalex.org/W1862957262","https://openalex.org/W1904228841","https://openalex.org/W1956559956","https://openalex.org/W1964653195","https://openalex.org/W1965442115","https://openalex.org/W1972594981","https://openalex.org/W1974147361","https://openalex.org/W1975051470","https://openalex.org/W1980094776","https://openalex.org/W1986828474","https://openalex.org/W1990388042","https://openalex.org/W1992053235","https://openalex.org/W2002682102","https://openalex.org/W2009018050","https://openalex.org/W2012426233","https://openalex.org/W2016892599","https://openalex.org/W2019641348","https://openalex.org/W2021986193","https://openalex.org/W2023657004","https://openalex.org/W2025016813","https://openalex.org/W2034739942","https://openalex.org/W2035127985","https://openalex.org/W2037320173","https://openalex.org/W2038526807","https://openalex.org/W2038721957","https://openalex.org/W2049633694","https://openalex.org/W2058374187","https://openalex.org/W2059344928","https://openalex.org/W2061973112","https://openalex.org/W2077457270","https://openalex.org/W2078206416","https://openalex.org/W2079229534","https://openalex.org/W2079656678","https://openalex.org/W2086253379","https://openalex.org/W2090805977","https://openalex.org/W2091777463","https://openalex.org/W2093521110","https://openalex.org/W2094930182","https://openalex.org/W2096891167","https://openalex.org/W2097359597","https://openalex.org/W2103931177","https://openalex.org/W2109895155","https://openalex.org/W2110572720","https://openalex.org/W2112276485","https://openalex.org/W2112492518","https://openalex.org/W2115054880","https://openalex.org/W2116341550","https://openalex.org/W2124918383","https://openalex.org/W2125055259","https://openalex.org/W2125725207","https://openalex.org/W2125969310","https://openalex.org/W2128166836","https://openalex.org/W2131006463","https://openalex.org/W2136542423","https://openalex.org/W2136590255","https://openalex.org/W2137845970","https://openalex.org/W2141527751","https://openalex.org/W2149684865","https://openalex.org/W2152766222","https://openalex.org/W2160080653","https://openalex.org/W2162746367","https://openalex.org/W2169044456","https://openalex.org/W2170188121","https://openalex.org/W2170205495","https://openalex.org/W2301435905","https://openalex.org/W2560674852","https://openalex.org/W2562782","https://openalex.org/W2799061466","https://openalex.org/W2800860906","https://openalex.org/W2912306800","https://openalex.org/W36509427","https://openalex.org/W4239696231","https://openalex.org/W4240913316","https://openalex.org/W4246154067","https://openalex.org/W4247346926","https://openalex.org/W4255459561","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W584886505","https://openalex.org/W4240226448","https://openalex.org/W279222092","https://openalex.org/W2188578519","https://openalex.org/W2140379930","https://openalex.org/W2079992169","https://openalex.org/W1977805686","https://openalex.org/W1568401719","https://openalex.org/W1543230276","https://openalex.org/W1502610062"],"abstract_inverted_index":{"Many":[0],"valuable":[1],"text":[2,26,82],"databases":[3,27,48,83,107,222,232,274,339],"on":[4,64],"the":[5,39,50,68,73,106,119,133,137,142,150,153,156,192,205,211,217,260,269,272,276,292,299,347,357],"web":[6,338],"have":[7,225,243],"noncrawlable":[8],"contents":[9],"that":[10,117,129,148,163,219,249,346,356],"are":[11,17,49,130],"\u201chidden\u201d":[12],"behind":[13],"search":[14,370],"interfaces.":[15],"Metasearchers":[16],"helpful":[18],"tools":[19],"for":[20,53,96,164,197,263,287,324],"searching":[21],"over":[22,330],"multiple":[23],"such":[24,88],"\u201chidden-web\u201d":[25],"at":[28],"once":[29],"through":[30],"a":[31,54,112,123,234,264,284,325,331],"unified":[32],"query":[33,270],"interface.":[34],"An":[35],"important":[36],"step":[37],"in":[38,122,155,184,275,291],"metasearching":[40],"process":[41],"is":[42,141,322],"database":[43,60,69,74,124,193,212,246,300,312,358,365],"selection,":[44],"or":[45],"determining":[46],"which":[47],"most":[51],"relevant":[52,364],"given":[55],"user":[56],"query.":[57,326],"The":[58,253,279],"state-of-the-art":[59],"selection":[61,194,213,247,313,359,366],"techniques":[62],"rely":[63],"statistical":[65,285],"summaries":[66,100,147,170,188,302,354],"of":[67,132,136,152,294,333],"contents,":[70],"generally":[71],"including":[72,335],"vocabulary":[75],"and":[76,125,209,258,266,355,368],"associated":[77],"word":[78],"frequencies.":[79],"Unfortunately,":[80,158],"hidden-web":[81],"typically":[84],"do":[85],"not":[86],"export":[87],"summaries,":[89],"so":[90,228],"previous":[91],"research":[92],"has":[93],"developed":[94,244],"algorithms":[95,248,314,360],"constructing":[97],"approximate":[98],"content":[99,146,169,187,301,353],"from":[101,105,172,231],"document":[102,175,207],"samples":[103,176,208,229],"extracted":[104,230],"via":[108],"querying.":[109],"We":[110,242,306],"present":[111],"novel":[113],"\u201cfocused-probing\u201d":[114],"sampling":[115,349],"algorithm":[116,140,255,281],"detects":[118],"topics":[120],"covered":[121],"adaptively":[126,316],"extracts":[127],"documents":[128],"representative":[131],"topic":[134],"coverage":[135],"database.":[138,157],"Our":[139],"first":[143,254],"to":[144,179,224,271,297,309,315],"construct":[145],"include":[149],"frequencies":[151],"words":[154],"Zipf's":[159],"law":[160],"practically":[161],"guarantees":[162],"any":[165],"relatively":[166],"large":[167],"database,":[168],"built":[171],"moderately":[173],"sized":[174],"will":[177],"fail":[178],"cover":[180],"many":[181],"low-frequency":[182],"words;":[183],"turn,":[185],"incomplete":[186],"might":[189],"negatively":[190],"affect":[191],"process,":[195],"especially":[196],"short":[198],"queries":[199],"with":[200,233,303],"infrequent":[201],"words.":[202,305],"To":[203],"enhance":[204,298],"sparse":[206,295],"improve":[210],"decisions,":[214],"we":[215],"exploit":[216,250],"fact":[218],"topically":[220],"similar":[221,226,235],"tend":[223],"vocabularies,":[227],"topical":[236],"focus":[237],"can":[238],"complement":[239],"each":[240],"other.":[241],"two":[245],"this":[251],"observation.":[252],"proceeds":[256],"hierarchically":[257],"selects":[259],"best":[261],"categories":[262],"query,":[265],"then":[267],"sends":[268],"appropriate":[273],"chosen":[277],"categories.":[278],"second":[280],"uses":[282],"\u201cshrinkage,\u201d":[283],"technique":[286],"improving":[288],"parameter":[289],"estimation":[290],"face":[293],"data,":[296,344],"category-specific":[304],"describe":[307],"how":[308],"modify":[310],"existing":[311,373],"decide":[317],"(at":[318],"runtime)":[319],"whether":[320],"shrinkage":[321],"beneficial":[323],"A":[327],"thorough":[328],"evaluation":[329],"variety":[332],"databases,":[334],"315":[336],"real":[337],"as":[340,342],"well":[341],"TREC":[343],"suggests":[345],"proposed":[348],"methods":[350],"generate":[351],"high-quality":[352],"produce":[361],"significantly":[362],"more":[363],"decisions":[367],"overall":[369],"results":[371],"than":[372],"algorithms.":[374]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2125121047","counts_by_year":[{"year":2020,"cited_by_count":1},{"year":2017,"cited_by_count":2},{"year":2015,"cited_by_count":2},{"year":2014,"cited_by_count":1},{"year":2013,"cited_by_count":1},{"year":2012,"cited_by_count":2}],"updated_date":"2024-12-07T20:42:54.489963","created_date":"2016-06-24"}