{"id":"https://openalex.org/W2318549902","doi":"https://doi.org/10.1109/tdsc.2016.2548463","title":"Understanding Practical Tradeoffs in HPC Checkpoint-Scheduling Policies","display_name":"Understanding Practical Tradeoffs in HPC Checkpoint-Scheduling Policies","publication_year":2016,"publication_date":"2016-03-30","ids":{"openalex":"https://openalex.org/W2318549902","doi":"https://doi.org/10.1109/tdsc.2016.2548463","mag":"2318549902"},"language":"en","primary_location":{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/tdsc.2016.2548463","pdf_url":null,"source":{"id":"https://openalex.org/S133795288","display_name":"IEEE Transactions on Dependable and Secure Computing","issn_l":"1545-5971","issn":["1545-5971","1941-0018","2160-9209"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false},"type":"article","type_crossref":"journal-article","indexed_in":["crossref"],"open_access":{"is_oa":false,"oa_status":"closed","oa_url":null,"any_repository_has_fulltext":false},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5041608911","display_name":"Nosayba El-Sayed","orcid":null},"institutions":[{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Nosayba El-Sayed","raw_affiliation_strings":["Department of Computer Science, University of Toronto"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto","institution_ids":["https://openalex.org/I185261750"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5040893798","display_name":"Bianca Schroeder","orcid":"https://orcid.org/0000-0003-3289-1824"},"institutions":[{"id":"https://openalex.org/I185261750","display_name":"University of Toronto","ror":"https://ror.org/03dbr7087","country_code":"CA","type":"education","lineage":["https://openalex.org/I185261750"]}],"countries":["CA"],"is_corresponding":false,"raw_author_name":"Bianca Schroeder","raw_affiliation_strings":["Department of Computer Science, University of Toronto"],"affiliations":[{"raw_affiliation_string":"Department of Computer Science, University of Toronto","institution_ids":["https://openalex.org/I185261750"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":1,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":2.142,"has_fulltext":true,"fulltext_origin":"ngrams","cited_by_count":14,"citation_normalized_percentile":{"value":0.966658,"is_in_top_1_percent":false,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":88,"max":89},"biblio":{"volume":"15","issue":"2","first_page":"336","last_page":"350"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10101","display_name":"Cloud Computing and Resource Management","score":0.9998,"subfield":{"id":"https://openalex.org/subfields/1710","display_name":"Information Systems"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10772","display_name":"Distributed systems and fault tolerance","score":0.9994,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11181","display_name":"Advanced Data Storage Technologies","score":0.9985,"subfield":{"id":"https://openalex.org/subfields/1705","display_name":"Computer Networks and Communications"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.8692095},{"id":"https://openalex.org/C206729178","wikidata":"https://www.wikidata.org/wiki/Q2271896","display_name":"Scheduling (production processes)","level":2,"score":0.67267615},{"id":"https://openalex.org/C165696696","wikidata":"https://www.wikidata.org/wiki/Q11287","display_name":"Exploit","level":2,"score":0.6712246},{"id":"https://openalex.org/C63540848","wikidata":"https://www.wikidata.org/wiki/Q3140932","display_name":"Fault tolerance","level":2,"score":0.6535861},{"id":"https://openalex.org/C120314980","wikidata":"https://www.wikidata.org/wiki/Q180634","display_name":"Distributed computing","level":1,"score":0.60043114},{"id":"https://openalex.org/C2780165032","wikidata":"https://www.wikidata.org/wiki/Q16869822","display_name":"Energy consumption","level":2,"score":0.597401},{"id":"https://openalex.org/C105339364","wikidata":"https://www.wikidata.org/wiki/Q2297740","display_name":"Software deployment","level":2,"score":0.54421127},{"id":"https://openalex.org/C83283714","wikidata":"https://www.wikidata.org/wiki/Q121117","display_name":"Supercomputer","level":2,"score":0.5195306},{"id":"https://openalex.org/C2742236","wikidata":"https://www.wikidata.org/wiki/Q924713","display_name":"Efficient energy use","level":2,"score":0.4857195},{"id":"https://openalex.org/C149635348","wikidata":"https://www.wikidata.org/wiki/Q193040","display_name":"Embedded system","level":1,"score":0.3710505},{"id":"https://openalex.org/C200601418","wikidata":"https://www.wikidata.org/wiki/Q2193887","display_name":"Reliability engineering","level":1,"score":0.33652675},{"id":"https://openalex.org/C173608175","wikidata":"https://www.wikidata.org/wiki/Q232661","display_name":"Parallel computing","level":1,"score":0.30121952},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.22982696},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.07954383},{"id":"https://openalex.org/C18903297","wikidata":"https://www.wikidata.org/wiki/Q7150","display_name":"Ecology","level":1,"score":0.0},{"id":"https://openalex.org/C21547014","wikidata":"https://www.wikidata.org/wiki/Q1423657","display_name":"Operations management","level":1,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.0},{"id":"https://openalex.org/C162324750","wikidata":"https://www.wikidata.org/wiki/Q8134","display_name":"Economics","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":false,"landing_page_url":"https://doi.org/10.1109/tdsc.2016.2548463","pdf_url":null,"source":{"id":"https://openalex.org/S133795288","display_name":"IEEE Transactions on Dependable and Secure Computing","issn_l":"1545-5971","issn":["1545-5971","1941-0018","2160-9209"],"is_oa":false,"is_in_doaj":false,"is_core":true,"host_organization":"https://openalex.org/P4310320439","host_organization_name":"IEEE Computer Society","host_organization_lineage":["https://openalex.org/P4310320439","https://openalex.org/P4310319808"],"host_organization_lineage_names":["IEEE Computer Society","Institute of Electrical and Electronics Engineers"],"type":"journal"},"license":null,"license_id":null,"version":null,"is_accepted":false,"is_published":false}],"best_oa_location":null,"sustainable_development_goals":[{"id":"https://metadata.un.org/sdg/7","score":0.9,"display_name":"Affordable and clean energy"}],"grants":[{"funder":"https://openalex.org/F4320334593","funder_display_name":"Natural Sciences and Engineering Research Council of Canada","award_id":"Discovery Grant"}],"datasets":[],"versions":[],"referenced_works_count":17,"referenced_works":["https://openalex.org/W1558516248","https://openalex.org/W1870609547","https://openalex.org/W1977792483","https://openalex.org/W1991872262","https://openalex.org/W2003569970","https://openalex.org/W2008909725","https://openalex.org/W2033656974","https://openalex.org/W2040509723","https://openalex.org/W2056966287","https://openalex.org/W2060836295","https://openalex.org/W2081235423","https://openalex.org/W2116011221","https://openalex.org/W2133046454","https://openalex.org/W2152023716","https://openalex.org/W2166143798","https://openalex.org/W2200913013","https://openalex.org/W4285719527"],"related_works":["https://openalex.org/W4312814274","https://openalex.org/W4285370786","https://openalex.org/W3207760230","https://openalex.org/W2536018345","https://openalex.org/W2358353312","https://openalex.org/W2353836703","https://openalex.org/W2296488620","https://openalex.org/W17155033","https://openalex.org/W1590307681","https://openalex.org/W1496222301"],"abstract_inverted_index":{"As":[0],"the":[1,87,134,174,180],"scale":[2],"of":[3,47,86,98,167,176],"High-Performance":[4],"Computing":[5],"(HPC)":[6],"clusters":[7],"continues":[8],"to":[9,114,125],"grow,":[10],"their":[11,48],"increasing":[12],"failure":[13,143],"rates":[14],"and":[15,51,71,90,106,132,136,183],"energy":[16,52,89,130],"consumption":[17],"levels":[18],"are":[19,187],"emerging":[20],"as":[21],"serious":[22],"design":[23],"concerns.":[24],"Efficiently":[25],"running":[26],"systems":[27],"at":[28],"such":[29],"large":[30],"scales":[31],"critically":[32],"relies":[33],"on":[34,142,179],"deploying":[35],"effective,":[36],"practical":[37,103],"methods":[38,124,163],"for":[39,129,155,189],"fault":[40,58],"tolerance":[41,59],"while":[42],"having":[43],"a":[44,95,120],"good":[45],"understanding":[46],"respective":[49],"performance":[50],"overheads.":[53],"The":[54],"most":[55],"commonly":[56],"used":[57,113],"method":[60],"is":[61],"checkpoint/restart.":[62],"Checkpoint":[63],"scheduling":[64,128],"policies,":[65],"however,":[66],"have":[67],"been":[68],"traditionally":[69],"optimized":[70],"analysed":[72],"from":[73,145],"one":[74],"angle:":[75],"application":[76],"performance.":[77],"In":[78],"this":[79],"work,":[80],"we":[81],"provide":[82],"an":[83],"extensive":[84],"analysis":[85],"performance,":[88],"I/O":[91,190],"costs":[92],"associated":[93],"with":[94],"wide":[96],"array":[97],"checkpointing":[99,178],"policies.":[100],"We":[101,122,171],"consider":[102],"deployment":[104],"issues":[105],"show":[107,152],"that":[108,164,186],"simple":[109],"formulas":[110],"can":[111],"be":[112],"accurately":[115],"estimate":[116],"wasted":[117],"work":[118],"in":[119],"system.":[121],"propose":[123],"optimize":[126],"checkpoint":[127],"savings":[131],"evaluate":[133],"runtime-optimized":[135],"energy-optimized":[137,177],"policies":[138,185],"using":[139,162],"simulations":[140],"based":[141],"logs":[144],"10":[146],"production":[147],"HPC":[148],"clusters.":[149],"Our":[150],"results":[151],"ample":[153],"room":[154],"achieving":[156],"high":[157],"quality":[158],"energy/performance":[159],"tradeoffs":[160],"when":[161],"exploit":[165],"characteristics":[166],"real":[168],"world":[169],"failures.":[170],"also":[172],"analyze":[173],"impact":[175],"storage":[181],"subsystem":[182],"identify":[184],"optimal":[188],"savings.":[191]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W2318549902","counts_by_year":[{"year":2024,"cited_by_count":2},{"year":2023,"cited_by_count":1},{"year":2022,"cited_by_count":1},{"year":2021,"cited_by_count":3},{"year":2020,"cited_by_count":3},{"year":2019,"cited_by_count":1},{"year":2018,"cited_by_count":1},{"year":2016,"cited_by_count":1}],"updated_date":"2025-01-15T18:02:09.947845","created_date":"2016-06-24"}