{"id":"https://openalex.org/W4386044703","doi":"https://doi.org/10.48550/arxiv.2308.09004","title":"Towards Lightweight Data Integration using Multi-workflow Provenance and Data Observability","display_name":"Towards Lightweight Data Integration using Multi-workflow Provenance and Data Observability","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4386044703","doi":"https://doi.org/10.48550/arxiv.2308.09004"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.09004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"journal-article","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2308.09004","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5042209170","display_name":"Renan P. Souza","orcid":"https://orcid.org/0000-0002-9479-4432"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Souza, Renan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079278437","display_name":"Tyler J. Skluzacek","orcid":"https://orcid.org/0000-0003-2242-4931"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Skluzacek, Tyler J.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5034261249","display_name":"Sean Wilkinson","orcid":"https://orcid.org/0000-0002-1443-7479"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wilkinson, Sean R.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5013879711","display_name":"Maxim Ziatdinov","orcid":"https://orcid.org/0000-0003-2570-4592"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ziatdinov, Maxim","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5072339196","display_name":"Rafael Ferreira da Silva","orcid":"https://orcid.org/0000-0002-1720-0928"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"da Silva, Rafael Ferreira","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":67},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},"topics":[{"id":"https://openalex.org/T11986","display_name":"Scientific Computing and Data Management","score":0.9993,"subfield":{"id":"https://openalex.org/subfields/1802","display_name":"Information Systems and Management"},"field":{"id":"https://openalex.org/fields/18","display_name":"Decision Sciences"},"domain":{"id":"https://openalex.org/domains/2","display_name":"Social Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9716,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10715","display_name":"Distributed and Parallel Computing Systems","score":0.969,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/observability","display_name":"Observability","score":0.7525083}],"concepts":[{"id":"https://openalex.org/C177212765","wikidata":"https://www.wikidata.org/wiki/Q627335","display_name":"Workflow","level":2,"score":0.7960949},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.76253843},{"id":"https://openalex.org/C36299963","wikidata":"https://www.wikidata.org/wiki/Q1369844","display_name":"Observability","level":2,"score":0.7525083},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.55918825},{"id":"https://openalex.org/C79974875","wikidata":"https://www.wikidata.org/wiki/Q483639","display_name":"Cloud computing","level":2,"score":0.55642474},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.55015755},{"id":"https://openalex.org/C115903868","wikidata":"https://www.wikidata.org/wiki/Q80993","display_name":"Software engineering","level":1,"score":0.37530863},{"id":"https://openalex.org/C2522767166","wikidata":"https://www.wikidata.org/wiki/Q2374463","display_name":"Data science","level":1,"score":0.36171237},{"id":"https://openalex.org/C77088390","wikidata":"https://www.wikidata.org/wiki/Q8513","display_name":"Database","level":1,"score":0.35179675},{"id":"https://openalex.org/C201995342","wikidata":"https://www.wikidata.org/wiki/Q682496","display_name":"Systems engineering","level":1,"score":0.3245991},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.21314791},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.10187867},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.0},{"id":"https://openalex.org/C28826006","wikidata":"https://www.wikidata.org/wiki/Q33521","display_name":"Applied mathematics","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.09004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2308.09004","pdf_url":"http://arxiv.org/pdf/2308.09004","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2308.09004","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2308.09004","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":false},"sustainable_development_goals":[{"score":0.52,"display_name":"Industry, innovation and infrastructure","id":"https://metadata.un.org/sdg/9"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4312300846","https://openalex.org/W4206221578","https://openalex.org/W3157641275","https://openalex.org/W3029572990","https://openalex.org/W2967463586","https://openalex.org/W2765830098","https://openalex.org/W2615757685","https://openalex.org/W2517338020","https://openalex.org/W2046459260","https://openalex.org/W1971989957"],"abstract_inverted_index":{"Modern":[0],"large-scale":[1],"scientific":[2,29],"discovery":[3],"requires":[4],"multidisciplinary":[5],"collaboration":[6],"across":[7],"diverse":[8],"computing":[9],"facilities,":[10],"including":[11],"High":[12],"Performance":[13],"Computing":[14],"(HPC)":[15],"machines":[16],"and":[17,44,64,75,94,101,121,146],"the":[18,33,48,109,112,185],"Edge-to-Cloud":[19],"continuum.":[20],"Integrated":[21,86],"data":[22,70,91,123,143],"analysis":[23,141],"plays":[24],"a":[25,127,149],"crucial":[26],"role":[27],"in":[28,32,111,148,169],"discovery,":[30],"especially":[31],"current":[34],"AI":[35,40],"era,":[36],"by":[37],"enabling":[38],"Responsible":[39],"development,":[41],"FAIR,":[42],"Reproducibility,":[43],"User":[45],"Steering.":[46],"However,":[47],"heterogeneous":[49],"nature":[50],"of":[51],"science":[52,158],"poses":[53],"challenges":[54],"such":[55],"as":[56],"dealing":[57],"with":[58,164],"multiple":[59,162],"supporting":[60],"tools,":[61],"cross-facility":[62],"environments,":[63],"efficient":[65],"HPC":[66],"execution.":[67],"Building":[68],"on":[69,161,180,184],"observability,":[71,106],"adapter":[72],"system":[73],"design,":[74],"provenance,":[76,120],"we":[77],"propose":[78],"MIDA:":[79],"an":[80],"approach":[81],"for":[82,97,131,156],"lightweight":[83],"runtime":[84,125],"Multi-workflow":[85],"Data":[87],"Analysis.":[88],"MIDA":[89],"defines":[90],"observability":[92],"strategies":[93],"adaptability":[95],"methods":[96],"various":[98],"parallel":[99],"systems":[100],"machine":[102],"learning":[103,153],"tools.":[104],"With":[105],"it":[107],"intercepts":[108],"dataflows":[110],"background":[113],"without":[114],"requiring":[115],"instrumentation":[116],"while":[117],"integrating":[118,142],"domain,":[119],"telemetry":[122],"at":[124],"into":[126],"unified":[128],"database":[129],"ready":[130],"user":[132],"steering":[133],"queries.":[134],"We":[135,171],"conduct":[136],"experiments":[137],"showing":[138],"end-to-end":[139],"multi-workflow":[140],"from":[144],"Dask":[145],"MLFlow":[147],"real":[150],"distributed":[151],"deep":[152],"use":[154],"case":[155],"materials":[157],"that":[159],"runs":[160],"environments":[163],"up":[165,176],"to":[166,177],"276":[167],"GPUs":[168],"parallel.":[170],"show":[172],"near-zero":[173],"overhead":[174],"running":[175],"100,000":[178],"tasks":[179],"1,680":[181],"CPU":[182],"cores":[183],"Summit":[186],"supercomputer.":[187]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4386044703","counts_by_year":[],"updated_date":"2025-01-21T07:53:04.693991","created_date":"2023-08-22"}