{"id":"https://openalex.org/W4385767460","doi":"https://doi.org/10.24963/ijcai.2023/53","title":"Explanation-Guided Reward Alignment","display_name":"Explanation-Guided Reward Alignment","publication_year":2023,"publication_date":"2023-08-01","ids":{"openalex":"https://openalex.org/W4385767460","doi":"https://doi.org/10.24963/ijcai.2023/53"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/53","pdf_url":"https://www.ijcai.org/proceedings/2023/0053.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"bronze","oa_url":"https://www.ijcai.org/proceedings/2023/0053.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5056120915","display_name":"Saaduddin Mahmud","orcid":null},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Saaduddin Mahmud","raw_affiliation_strings":["University of Massachusetts Amherst, USA"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts Amherst, USA","institution_ids":["https://openalex.org/I24603500"]}]},{"author_position":"middle","author":{"id":"https://openalex.org/A5037326029","display_name":"Sandhya Saisubramanian","orcid":null},"institutions":[{"id":"https://openalex.org/I131249849","display_name":"Oregon State University","ror":"https://ror.org/00ysfqy60","country_code":"US","type":"education","lineage":["https://openalex.org/I131249849"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Sandhya Saisubramanian","raw_affiliation_strings":["Oregon State University, USA"],"affiliations":[{"raw_affiliation_string":"Oregon State University, USA","institution_ids":["https://openalex.org/I131249849"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5027224308","display_name":"Shlomo Zilberstein","orcid":"https://orcid.org/0000-0001-9817-7848"},"institutions":[{"id":"https://openalex.org/I24603500","display_name":"University of Massachusetts Amherst","ror":"https://ror.org/0072zz521","country_code":"US","type":"education","lineage":["https://openalex.org/I24603500"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Shlomo Zilberstein","raw_affiliation_strings":["University of Massachusetts Amherst, USA"],"affiliations":[{"raw_affiliation_string":"University of Massachusetts Amherst, USA","institution_ids":["https://openalex.org/I24603500"]}]}],"institution_assertions":[],"countries_distinct_count":1,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"fulltext_origin":"pdf","cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":67},"biblio":{"volume":null,"issue":null,"first_page":"473","last_page":"482"},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9965,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T12026","display_name":"Explainable Artificial Intelligence (XAI)","score":0.9965,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10462","display_name":"Reinforcement Learning in Robotics","score":0.99,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11689","display_name":"Adversarial Robustness in Machine Learning","score":0.9862,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/black-box","display_name":"Black box","score":0.5784205}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7526583},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.6390469},{"id":"https://openalex.org/C14036430","wikidata":"https://www.wikidata.org/wiki/Q3736076","display_name":"Function (biology)","level":2,"score":0.604388},{"id":"https://openalex.org/C94966114","wikidata":"https://www.wikidata.org/wiki/Q29256","display_name":"Black box","level":2,"score":0.5784205},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.55284053},{"id":"https://openalex.org/C13662910","wikidata":"https://www.wikidata.org/wiki/Q193139","display_name":"Trajectory","level":2,"score":0.5244218},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.501631},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C1276947","wikidata":"https://www.wikidata.org/wiki/Q333","display_name":"Astronomy","level":1,"score":0.0},{"id":"https://openalex.org/C78458016","wikidata":"https://www.wikidata.org/wiki/Q840400","display_name":"Evolutionary biology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/53","pdf_url":"https://www.ijcai.org/proceedings/2023/0053.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.24963/ijcai.2023/53","pdf_url":"https://www.ijcai.org/proceedings/2023/0053.pdf","source":null,"license":null,"license_id":null,"version":"publishedVersion","is_accepted":true,"is_published":true},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":28,"referenced_works":["https://openalex.org/W1591675293","https://openalex.org/W1986014385","https://openalex.org/W1999874108","https://openalex.org/W2098774185","https://openalex.org/W2214916780","https://openalex.org/W2516809705","https://openalex.org/W2546804108","https://openalex.org/W2555897561","https://openalex.org/W2615896489","https://openalex.org/W2763152536","https://openalex.org/W2786426168","https://openalex.org/W2897798332","https://openalex.org/W2928096804","https://openalex.org/W2962851944","https://openalex.org/W2962937519","https://openalex.org/W2963208223","https://openalex.org/W2964177756","https://openalex.org/W2965912206","https://openalex.org/W3003696480","https://openalex.org/W3008082783","https://openalex.org/W3170354064","https://openalex.org/W3201919343","https://openalex.org/W4214717370","https://openalex.org/W4281820326","https://openalex.org/W4287164755","https://openalex.org/W4300427736","https://openalex.org/W4320804553","https://openalex.org/W4321504837"],"related_works":["https://openalex.org/W4391331176","https://openalex.org/W4380318855","https://openalex.org/W4362501864","https://openalex.org/W4323768008","https://openalex.org/W4306904969","https://openalex.org/W3131574667","https://openalex.org/W3084456289","https://openalex.org/W2138720691","https://openalex.org/W2024136090","https://openalex.org/W1941703695"],"abstract_inverted_index":{"Agents":[0],"often":[1],"need":[2],"to":[3,10,55,111,145,152],"infer":[4,17],"a":[5,18,66,125],"reward":[6,19,34,72,116,123,166],"function":[7,20],"from":[8,99,105],"observations":[9],"learn":[11],"desired":[12],"behaviors.":[13],"However,":[14],"agents":[15],"may":[16],"that":[21],"does":[22],"not":[23],"align":[24],"with":[25,37],"the":[26,57,106,129,132,137,140,154,159],"original":[27],"intent":[28],"because":[29],"there":[30],"can":[31,46,80],"be":[32,47],"multiple":[33],"functions":[35],"consistent":[36],"its":[38,121,175],"observations.":[39],"Operating":[40],"based":[41],"on":[42],"such":[43],"misaligned":[44],"rewards":[45,59],"risky.":[48],"Furthermore,":[49],"black-box":[50],"representations":[51],"make":[52],"it":[53],"difficult":[54],"verify":[56],"learned":[58,122,155],"and":[60,70,76,84,114,124,173],"prevent":[61],"harmful":[62],"behavior.":[63],"We":[64,157],"present":[65],"framework":[67],"for":[68],"verifying":[69],"improving":[71,165],"alignment":[73,167],"using":[74,168],"explanations":[75,79,144,172],"show":[77],"how":[78],"help":[81],"detect":[82],"misalignment":[83],"reveal":[85],"failure":[86],"cases":[87,135],"in":[88,164,177],"novel":[89],"scenarios.":[90],"The":[91,118],"problem":[92],"is":[93,149],"formulated":[94],"as":[95],"inverse":[96],"reinforcement":[97],"learning":[98],"ranked":[100],"trajectories.":[101],"Verification":[102],"tests":[103],"created":[104],"trajectory":[107],"dataset":[108],"are":[109],"used":[110,151],"iteratively":[112],"validate":[113],"improve":[115,153],"alignment.":[117],"agent":[119,141],"explains":[120],"tester":[126],"signals":[127],"whether":[128],"explanation":[130,138],"passes":[131],"test.":[133],"In":[134],"where":[136],"fails,":[139],"offers":[142],"alternative":[143],"gather":[146],"feedback,":[147],"which":[148],"then":[150],"reward.":[156],"analyze":[158],"efficiency":[160],"of":[161,171],"our":[162],"approach":[163],"different":[169],"types":[170],"demonstrate":[174],"effectiveness":[176],"five":[178],"domains.":[179]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4385767460","counts_by_year":[],"updated_date":"2025-01-05T14:14:06.166052","created_date":"2023-08-12"}