{"id":"https://openalex.org/W4393518873","doi":"https://doi.org/10.5281/zenodo.3546193","title":"Webis Wikipedia Text Reuse Corpus 2018 (Webis-Wikipedia-Text-Reuse-18)","display_name":"Webis Wikipedia Text Reuse Corpus 2018 (Webis-Wikipedia-Text-Reuse-18)","publication_year":2018,"publication_date":"2018-07-05","ids":{"openalex":"https://openalex.org/W4393518873","doi":"https://doi.org/10.5281/zenodo.3546193"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://zenodo.org/record/3546193","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":["European Organization for Nuclear Research"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},"type":"dataset","type_crossref":"dataset","indexed_in":["datacite"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://zenodo.org/record/3546193","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5025513989","display_name":"Milad Alshomary","orcid":"https://orcid.org/0000-0001-6142-9124"},"institutions":[{"id":"https://openalex.org/I206945453","display_name":"Paderborn University","ror":"https://ror.org/058kzsd48","country_code":"DE","type":"funder","lineage":["https://openalex.org/I206945453"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Milad Alshomary","raw_affiliation_strings":["Universit\u00e4t Paderborn"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Paderborn","institution_ids":["https://openalex.org/I206945453"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037682495","display_name":"Michael V\u00f6lske","orcid":"https://orcid.org/0000-0002-9283-6846"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"funder","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Michael V\u00f6lske","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014375244","display_name":"Henning Wachsmuth","orcid":"https://orcid.org/0000-0003-2792-621X"},"institutions":[{"id":"https://openalex.org/I206945453","display_name":"Paderborn University","ror":"https://ror.org/058kzsd48","country_code":"DE","type":"funder","lineage":["https://openalex.org/I206945453"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Henning Wachsmuth","raw_affiliation_strings":["Universit\u00e4t Paderborn"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Paderborn","institution_ids":["https://openalex.org/I206945453"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5027915931","display_name":"Benno Stein","orcid":"https://orcid.org/0000-0001-9033-2217"},"institutions":[{"id":"https://openalex.org/I51441396","display_name":"Bauhaus-Universit\u00e4t Weimar","ror":"https://ror.org/033bb5z47","country_code":"DE","type":"funder","lineage":["https://openalex.org/I51441396"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Benno Stein","raw_affiliation_strings":["Bauhaus-Universit\u00e4t Weimar"],"affiliations":[{"raw_affiliation_string":"Bauhaus-Universit\u00e4t Weimar","institution_ids":["https://openalex.org/I51441396"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014322854","display_name":"Matthias Hagen","orcid":"https://orcid.org/0000-0002-9733-2890"},"institutions":[{"id":"https://openalex.org/I68956291","display_name":"Martin Luther University Halle-Wittenberg","ror":"https://ror.org/05gqaka33","country_code":"DE","type":"education","lineage":["https://openalex.org/I68956291"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Matthias Hagen","raw_affiliation_strings":["Martin-Luther-Universit\u00e4t Halle-Wittenberg"],"affiliations":[{"raw_affiliation_string":"Martin-Luther-Universit\u00e4t Halle-Wittenberg","institution_ids":["https://openalex.org/I68956291"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5083712311","display_name":"Martin Potthast","orcid":"https://orcid.org/0000-0003-2451-0665"},"institutions":[{"id":"https://openalex.org/I926574661","display_name":"Leipzig University","ror":"https://ror.org/03s7gtk40","country_code":"DE","type":"funder","lineage":["https://openalex.org/I926574661"]}],"countries":["DE"],"is_corresponding":false,"raw_author_name":"Martin Potthast","raw_affiliation_strings":["Universit\u00e4t Leipzig"],"affiliations":[{"raw_affiliation_string":"Universit\u00e4t Leipzig","institution_ids":["https://openalex.org/I926574661"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":4,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":{"min":0,"max":62},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9694,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T12478","display_name":"Wikis in Education and Collaboration","score":0.9694,"subfield":{"id":"https://openalex.org/subfields/3315","display_name":"Communication"},"field":{"id":"https://openalex.org/fields/33","display_name":"Social Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C206588197","wikidata":"https://www.wikidata.org/wiki/Q846574","display_name":"Reuse","level":2,"score":0.8669921},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.661205},{"id":"https://openalex.org/C23123220","wikidata":"https://www.wikidata.org/wiki/Q816826","display_name":"Information retrieval","level":1,"score":0.6206739},{"id":"https://openalex.org/C136764020","wikidata":"https://www.wikidata.org/wiki/Q466","display_name":"World Wide Web","level":1,"score":0.44306406},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.42422867},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.13997734},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://zenodo.org/record/3546193","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":["European Organization for Nuclear Research"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.5281/zenodo.3546193","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://zenodo.org/record/3546193","pdf_url":null,"source":{"id":"https://openalex.org/S4306400562","display_name":"Zenodo (CERN European Organization for Nuclear Research)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I67311998","host_organization_name":"European Organization for Nuclear Research","host_organization_lineage":["https://openalex.org/I67311998"],"host_organization_lineage_names":["European Organization for Nuclear Research"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2478288626","https://openalex.org/W2390279801","https://openalex.org/W2384475851","https://openalex.org/W2382290278","https://openalex.org/W2376932109","https://openalex.org/W2358668433","https://openalex.org/W2353602216","https://openalex.org/W2001405890","https://openalex.org/W2000444236"],"abstract_inverted_index":{"The":[0,26,145],"Wikipedia":[1,14,18,36,167,183],"Text":[2],"Reuse":[3],"Corpus":[4],"2018":[5,156],"(Webis-Wikipedia-Text-Reuse-18)":[6],"containing":[7],"text":[8,52,79,122,162,175],"reuse":[9,53,80,123,163,176],"cases":[10],"extracted":[11,148],"from":[12],"within":[13],"and":[15,19,45,115,180,184],"in":[16,149],"between":[17],"a":[20,35,39,51,56,78,83,105,109,121,126],"sample":[21],"of":[22,42,59,86,112,129],"the":[23,150,161,185],"Common":[24],"Crawl.":[25],"corpus":[27],"has":[28],"following":[29],"structure:":[30],"wikipedia.jsonl.bz2:":[31],"Each":[32,48,75,102,118],"line,":[33,49,76,103,119],"representing":[34,50,77,104,120],"article,":[37],"contains":[38,55,82,108,125],"json":[40,57,84,110,127],"array":[41,58,85,128],"article_id,":[43],"article_title,":[44],"article_body":[46],"within-wikipedia-tr-01.jsonl.bz2:":[47],"case,":[54,81,124],"s_id":[60,87,130],"(source":[61,69,88,96],"article":[62,66,89,93,132],"id),":[63,67,90,94,133,137],"t_id":[64,91],"(target":[65,72,92,99],"s_text":[68,95,138],"text),":[70,97,140],"t_text":[71,98],"text)":[73,100],"within-wikipedia-tr-02.jsonl.bz2:":[74],"preprocessed-web-sample.jsonl.xz:":[101],"web":[106],"page,":[107],"object":[111],"d_id,":[113],"d_url,":[114],"content":[116],"without-wikipedia-tr.jsonl.bz2:":[117],"(Wikipedia":[131],"d_id":[134],"(web":[135,142],"page":[136,143],"(article":[139],"d_content":[141],"content)":[144],"datasets":[146],"were":[147],"work":[151],"by":[152],"Alshomary":[153],"et":[154],"al.":[155],"that":[157],"aimed":[158],"to":[159,166],"study":[160],"phenomena":[164],"related":[165],"at":[168],"scale.":[169],"A":[170],"pipeline":[171],"for":[172],"large":[173],"scale":[174],"extraction":[177],"was":[178],"developed":[179],"used":[181],"on":[182],"CommonCrawl.":[186]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4393518873","counts_by_year":[],"updated_date":"2025-01-26T05:55:41.782647","created_date":"2024-04-03"}