{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,8,11]],"date-time":"2022-08-11T20:51:44Z","timestamp":1660251104512},"reference-count":57,"publisher":"Elsevier BV","issue":"2","license":[{"start":{"date-parts":[[2017,3,1]],"date-time":"2017-03-01T00:00:00Z","timestamp":1488326400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2017,12,6]],"date-time":"2017-12-06T00:00:00Z","timestamp":1512518400000},"content-version":"am","delay-in-days":280,"URL":"http:\/\/www.elsevier.com\/open-access\/userlicense\/1.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["IIS-08-11038"],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000201","name":"DoI","doi-asserted-by":"publisher","award":["D11PC20153"],"id":[{"id":"10.13039\/100000201","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Processing & Management"],"published-print":{"date-parts":[[2017,3]]},"DOI":"10.1016\/j.ipm.2016.11.006","type":"journal-article","created":{"date-parts":[[2016,12,6]],"date-time":"2016-12-06T03:49:11Z","timestamp":1480996151000},"page":"309-331","update-policy":"http:\/\/dx.doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":7,"title":["Sampling strategies for information extraction over the deep web"],"prefix":"10.1016","volume":"53","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-4410-0682","authenticated-orcid":false,"given":"Pablo","family":"Barrio","sequence":"first","affiliation":[]},{"given":"Luis","family":"Gravano","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.ipm.2016.11.006_bib0001","series-title":"Proceedings of the fourteenth ACM international conference on information and knowledge management (CIKM \u201905)","first-page":"413","article-title":"Predicting accuracy of extracting information from unstructured text collections","author":"Agichtein","year":"2005"},{"key":"10.1016\/j.ipm.2016.11.006_bib0002","series-title":"Proceedings of the nineteenth international conference on data engineering (ICDE \u201903)","first-page":"113","article-title":"Querying text databases for efficient information extraction","author":"Agichtein","year":"2003"},{"key":"10.1016\/j.ipm.2016.11.006_bib0003","series-title":"Proceedings of the sixth international workshop on the web and databases (webDB \u201903)","first-page":"87","article-title":"Modeling query-based access to text databases","author":"Agichtein","year":"2003"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0004","first-page":"133","article-title":"Siphoning hidden-web data through keyword-based interfaces","volume":"1","author":"Barbosa","year":"2010","journal-title":"Journal on Information and Data Management"},{"key":"10.1016\/j.ipm.2016.11.006_bib0005","series-title":"Proceedings of the twenty-fourth ACM international conference on information and knowledge management (CIKM \u201915)","first-page":"153","article-title":"Ranking deep web text collections for scalable information extraction","author":"Barrio","year":"2015"},{"key":"10.1016\/j.ipm.2016.11.006_bib0006","series-title":"Proceedings of the 2014\u00a0ACM joint conference on digital libraries (JCDL \u201914)","first-page":"455","article-title":"REEL: A relation extraction learning framework","author":"Barrio","year":"2014"},{"key":"10.1016\/j.ipm.2016.11.006_bib0007","series-title":"Proceedings of the 2015 international conference on extending database technology (EDBT \u201915)","first-page":"241","article-title":"Learning to rank adaptively for scalable information extraction","author":"Barrio","year":"2015"},{"issue":"5","key":"10.1016\/j.ipm.2016.11.006_bib0008","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1411509.1411514","article-title":"Random sampling from a search engine\u2019s index","volume":"55","author":"Bar-Yossef","year":"2008","journal-title":"Journal of the ACM"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0009","doi-asserted-by":"crossref","DOI":"10.3998\/3336451.0007.104","article-title":"The deep web: Surfacing, hidden value","volume":"7","author":"Bergman","year":"2001","journal-title":"Journal of Electronic Publishing"},{"key":"10.1016\/j.ipm.2016.11.006_bib0010","series-title":"Proceedings of the fourteenth international workshop on the web and databases (webDB \u201911)","article-title":"Factcrawl: A fact retrieval framework for full-text indices","author":"Boden","year":"2011"},{"issue":"2","key":"10.1016\/j.ipm.2016.11.006_bib0011","doi-asserted-by":"crossref","first-page":"89","DOI":"10.1007\/s13222-012-0088-4","article-title":"Fact-aware document retrieval for information extraction","volume":"12","author":"Boden","year":"2012","journal-title":"Datenbank-Spektrum"},{"key":"10.1016\/j.ipm.2016.11.006_bib0012","series-title":"Proceedings of the nineteenth international conference on neural information processing systems (NIPS \u201905)","first-page":"171","article-title":"Subsequence kernels for relation extraction","author":"Bunescu","year":"2005"},{"issue":"2","key":"10.1016\/j.ipm.2016.11.006_bib0013","doi-asserted-by":"crossref","first-page":"97","DOI":"10.1145\/382979.383040","article-title":"Query-based sampling of text databases","volume":"19","author":"Callan","year":"2001","journal-title":"ACM Transactions on Information Systems"},{"issue":"11\u201316","key":"10.1016\/j.ipm.2016.11.006_bib0014","doi-asserted-by":"crossref","first-page":"1623","DOI":"10.1016\/S1389-1286(99)00052-3","article-title":"Focused crawling: A new approach to topic-specific web resource discovery","volume":"31","author":"Chakrabarti","year":"1999","journal-title":"Computer Networks: The International Journal of Computer and Telecommunications Networking"},{"key":"10.1016\/j.ipm.2016.11.006_bib0015","series-title":"Proceedings of the tenth IEEE international conference on data mining (ICDM \u201910)","first-page":"773","article-title":"Location and scatter matching for dataset shift in text mining","author":"Chen","year":"2010"},{"key":"10.1016\/j.ipm.2016.11.006_bib0016","series-title":"Proceedings of the twelfth international conference on machine learning (ICML \u201995)","first-page":"115","article-title":"Fast effective rule induction","author":"Cohen","year":"1995"},{"key":"10.1016\/j.ipm.2016.11.006_bib0017","series-title":"Proceedings of the 26th international conference on very large data bases (VLDB \u201900)","first-page":"527","article-title":"Focused crawling using context graphs","author":"Diligenti","year":"2000"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0018","first-page":"61","article-title":"Accurate methods for the statistics of surprise and coincidence","volume":"19","author":"Dunning","year":"1993","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.ipm.2016.11.006_bib0019","series-title":"Proceedings of the fourth ACM international conference on web search and data mining (WSDM \u201911)","first-page":"825","article-title":"Searching patterns for relation extraction over the web: Rediscovering the pattern-relation duality","author":"Fang","year":"2011"},{"key":"10.1016\/j.ipm.2016.11.006_bib0020","first-page":"1289","article-title":"An extensive empirical study of feature selection metrics for text classification","volume":"3","author":"Forman","year":"2003","journal-title":"The Journal of Machine Learning Research"},{"key":"10.1016\/j.ipm.2016.11.006_bib0021","series-title":"Proceedings of the eleventh conference of the European chapter of the association for computational linguistics (EACL \u201906)","first-page":"3","article-title":"Exploiting shallow linguistic information for relation extraction from biomedical literature","author":"Giuliano","year":"2006"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0022","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/635484.635485","article-title":"QPRober: A system for automatic classification of hidden-web databases","volume":"21","author":"Gravano","year":"2003","journal-title":"ACM Transactions on Information Systems"},{"issue":"3","key":"10.1016\/j.ipm.2016.11.006_bib0023","doi-asserted-by":"crossref","first-page":"111","DOI":"10.14445\/22312803\/IJCTT-V12P122","article-title":"A comparative study of hidden web crawlers","volume":"12","author":"Gupta","year":"2014","journal-title":"International Journal of Computer Trends and Technology"},{"key":"10.1016\/j.ipm.2016.11.006_bib0024","series-title":"Proceedings of the sixth ACM international conference on web search and data mining (WSDM \u201913)","first-page":"355","article-title":"Crawling deep web entity pages","author":"He","year":"2013"},{"issue":"4","key":"10.1016\/j.ipm.2016.11.006_bib0025","doi-asserted-by":"crossref","first-page":"2","DOI":"10.1145\/1292609.1292611","article-title":"Towards a query optimizer for text-centric tasks","volume":"32","author":"Ipeirotis","year":"2007","journal-title":"ACM Transactions on Database Systems"},{"key":"10.1016\/j.ipm.2016.11.006_bib0026","series-title":"Proceedings of the 2008\u00a0IEEE 24th international conference on data engineering (ICDE \u201908)","first-page":"636","article-title":"Optimizing SQL queries over text databases","author":"Jain","year":"2008"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0027","doi-asserted-by":"crossref","first-page":"5:1","DOI":"10.1145\/1508857.1508862","article-title":"A quality-aware optimizer for information extraction","volume":"34","author":"Jain","year":"2009","journal-title":"ACM Transactions on Database Systems"},{"key":"10.1016\/j.ipm.2016.11.006_bib0028","series-title":"Proceedings of the 2009\u00a0IEEE international conference on data engineering (ICDE \u201909)","first-page":"186","article-title":"Join optimization of information extraction output: quality matters!","author":"Jain","year":"2009"},{"key":"10.1016\/j.ipm.2016.11.006_bib0029","series-title":"Proceedings of the 2009\u00a0IEEE international conference on data engineering (ICDE \u201909)","first-page":"616","article-title":"Exploring a few good tuples from text databases","author":"Jain","year":"2009"},{"key":"10.1016\/j.ipm.2016.11.006_bib0030","series-title":"Advances in kernel methods: Support vector machines","article-title":"Making large-scale support vector machine learning practical","author":"Joachims","year":"1998"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0031","doi-asserted-by":"crossref","first-page":"24","DOI":"10.1145\/2783888.2783898","article-title":"Automatic filling of hidden web forms: A survey","volume":"44","author":"Kantorski","year":"2015","journal-title":"SIGMOD Record"},{"key":"10.1016\/j.ipm.2016.11.006_bib0032","unstructured":"Khelghati, M. (2016). Deep web content monitoring (Ph.D. thesis). University of Twente, Enschede, The Netherlands."},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0033","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1214\/aoms\/1177729694","article-title":"On information and sufficiency","volume":"22","author":"Kullback","year":"1951","journal-title":"The Annals of Mathematical Statistics"},{"issue":"4","key":"10.1016\/j.ipm.2016.11.006_bib0034","doi-asserted-by":"crossref","first-page":"605","DOI":"10.1007\/s11280-015-0349-x","article-title":"Focused crawling for the hidden web","volume":"19","author":"Liakos","year":"2016","journal-title":"World Wide Web"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0035","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1007\/s10791-009-9107-y","article-title":"Estimating deep web data source size by capture\u2014recapture method","volume":"13","author":"Lu","year":"2010","journal-title":"Information Retrieval"},{"key":"10.1016\/j.ipm.2016.11.006_bib0036","series-title":"Proceedings of the seventeenth international conference on machine learning (ICML \u201900)","first-page":"591","article-title":"Maximum entropy Markov models for information extraction and segmentation","author":"McCallum","year":"2000"},{"key":"10.1016\/j.ipm.2016.11.006_bib0037","series-title":"Proceedings of the ninth conference on computational natural language learning (coNLL \u201905)","first-page":"188","article-title":"Early results for named entity recognition with conditional random fields, feature induction and web-enhanced lexicons","author":"McCallum","year":"2003"},{"issue":"4","key":"10.1016\/j.ipm.2016.11.006_bib0038","doi-asserted-by":"crossref","first-page":"378","DOI":"10.1145\/1031114.1031117","article-title":"Topical web crawlers: Evaluating adaptive algorithms","volume":"4","author":"Menczer","year":"2004","journal-title":"ACM Transactions on Internet Technology"},{"key":"10.1016\/j.ipm.2016.11.006_bib0039","series-title":"Proceedings of the twenty-seventh ACM international conference on research and development in information retrieval (SIGIR \u201904)","first-page":"234","article-title":"Feature selection using linear classifier weights: interaction with classification models","author":"Mladenic","year":"2004"},{"key":"10.1016\/j.ipm.2016.11.006_bib0040","series-title":"Proceedings of the 2005\u00a0ACM joint conference on digital libraries (JCDL \u201905)","first-page":"100","article-title":"Downloading textual hidden web content through keyword queries","author":"Ntoulas","year":"2005"},{"issue":"3","key":"10.1016\/j.ipm.2016.11.006_bib0041","doi-asserted-by":"crossref","first-page":"175","DOI":"10.1561\/1500000017","article-title":"Web crawling","volume":"4","author":"Olston","year":"2010","journal-title":"Foundations and Trends in Information Retrieval"},{"issue":"1","key":"10.1016\/j.ipm.2016.11.006_bib0042","doi-asserted-by":"crossref","first-page":"107","DOI":"10.1109\/TKDE.2006.12","article-title":"Link contexts in classifier-guided topical crawlers","volume":"18","author":"Pant","year":"2006","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"issue":"302","key":"10.1016\/j.ipm.2016.11.006_bib0043","first-page":"157","article-title":"On the criterion that a given system of deviations from the probable in the case of a correlated system of variables is such that can be reasonably supposed to have arisen from random sampling","volume":"50","author":"Pearson","year":"1900","journal-title":"Philosophical Magazine"},{"key":"10.1016\/j.ipm.2016.11.006_bib0044","series-title":"Proceedings of the twenty-seventh international conference on very large databases (VLDB \u201901)","first-page":"129","article-title":"Crawling the hidden web","author":"Raghavan","year":"2001"},{"key":"10.1016\/j.ipm.2016.11.006_bib0045","series-title":"Model assisted survey sampling","author":"S\u00e4rndal","year":"1992"},{"issue":"2","key":"10.1016\/j.ipm.2016.11.006_sbref0045","doi-asserted-by":"crossref","first-page":"303","DOI":"10.1007\/s10115-009-0266-3","article-title":"Xcrawl: A high-recall crawling method for web mining","volume":"25","author":"Shchekotykhin","year":"2010","journal-title":"Knowledge and Information Systems"},{"issue":"2","key":"10.1016\/j.ipm.2016.11.006_bib0047","first-page":"282","article-title":"The invisible web: Uncovering sources search engines can\u2019t see","volume":"52","author":"Sherman","year":"2003","journal-title":"Library Trends"},{"issue":"13","key":"10.1016\/j.ipm.2016.11.006_bib0048","doi-asserted-by":"crossref","first-page":"1462","DOI":"10.14778\/2536258.2536259","article-title":"When speed has a price: Fast information extraction using approximate algorithms","volume":"6","author":"Sim\u00f5es","year":"2013","journal-title":"Proceedings of the VLDB Endowment"},{"key":"10.1016\/j.ipm.2016.11.006_bib0049","unstructured":"Tirado, J. M., Serban, O., Guo, Q., & Yoneki, E. (2016). Web data knowledge extraction. Technical report UCAM-CL-TR-881, University of Cambridge, Computer Laboratory. URL http:\/\/www.cl.cam.ac.uk\/techreports\/UCAM-CL-TR-881.pdf."},{"key":"10.1016\/j.ipm.2016.11.006_bib0050","series-title":"Proceedings of the seventeenth ACM international conference on information and knowledge management (CIKM \u201908)","first-page":"1361","article-title":"Siphon++: A hidden-web crawler for keyword-based interfaces","author":"Vieira","year":"2008"},{"key":"10.1016\/j.ipm.2016.11.006_bib0051","series-title":"Proceedings of the sixteenth international conference on web information systems engineering (WISE \u201915)","first-page":"384","article-title":"Crawling ranked deep web data sources","author":"Wang","year":"2015"},{"key":"10.1016\/j.ipm.2016.11.006_bib0052","series-title":"Proceedings of the 2014\u00a0IEEE\/ACM international conference on advances in social networks analysis and mining (ASONAM \u201914)","first-page":"712","article-title":"Estimating the size of hidden data sources by queries","author":"Wang","year":"2014"},{"issue":"3","key":"10.1016\/j.ipm.2016.11.006_bib0053","doi-asserted-by":"crossref","first-page":"203","DOI":"10.1007\/s10791-013-9230-7","article-title":"Discover hidden web properties by random walk on bipartite graph","volume":"17","author":"Wang","year":"2014","journal-title":"Information Retrieval"},{"issue":"2","key":"10.1016\/j.ipm.2016.11.006_bib0054","doi-asserted-by":"crossref","first-page":"217","DOI":"10.2307\/2983604","article-title":"Contingency tables involving small numbers and the chi-square test","volume":"1","author":"Yates","year":"1934","journal-title":"Supplement to the Journal of the Royal Statistical Society"},{"key":"10.1016\/j.ipm.2016.11.006_bib0055","series-title":"Proceedings of the 2011\u00a0ACM international conference on management of data (SIGMOD \u201911)","first-page":"793","article-title":"Mining a search engine\u2019s corpus: Efficient yet unbiased sampling and aggregate estimation","author":"Zhang","year":"2011"},{"key":"10.1016\/j.ipm.2016.11.006_bib0056","series-title":"Proceedings of the twenty-second ACM international conference on information and knowledge management (CIKM \u201913)","first-page":"29","article-title":"Mining a search engine\u2019s corpus without a query pool","author":"Zhang","year":"2013"},{"key":"10.1016\/j.ipm.2016.11.006_bib0057","unstructured":"Zillman, M. P. (2008). Deep web research 2008. http:\/\/www.llrx.com\/2007\/11\/deep-web-research-2008\/ [Online; accessed 06 Nov 2016]."}],"container-title":["Information Processing & Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457316306318?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0306457316306318?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2021,4,17]],"date-time":"2021-04-17T20:06:39Z","timestamp":1618689999000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0306457316306318"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,3]]},"references-count":57,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2017,3]]}},"alternative-id":["S0306457316306318"],"URL":"https:\/\/doi.org\/10.1016\/j.ipm.2016.11.006","relation":{},"ISSN":["0306-4573"],"issn-type":[{"value":"0306-4573","type":"print"}],"subject":[],"published":{"date-parts":[[2017,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Sampling strategies for information extraction over the deep web","name":"articletitle","label":"Article Title"},{"value":"Information Processing & Management","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.ipm.2016.11.006","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2016 Elsevier Ltd. All rights reserved.","name":"copyright","label":"Copyright"}]}}