{"id":"https://openalex.org/W4389519449","doi":"https://doi.org/10.18653/v1/2023.findings-emnlp.68","title":"The Internal State of an LLM Knows When It\u2019s Lying","display_name":"The Internal State of an LLM Knows When It\u2019s Lying","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4389519449","doi":"https://doi.org/10.18653/v1/2023.findings-emnlp.68"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2023.findings-emnlp.68","pdf_url":"https://aclanthology.org/2023.findings-emnlp.68.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"type":"article","type_crossref":"proceedings-article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"hybrid","oa_url":"https://aclanthology.org/2023.findings-emnlp.68.pdf","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5082567694","display_name":"Amos Azaria","orcid":"https://orcid.org/0000-0002-5057-1309"},"institutions":[{"id":"https://openalex.org/I52170813","display_name":"Ariel University","ror":"https://ror.org/03nz8qe97","country_code":"IL","type":"education","lineage":["https://openalex.org/I52170813"]}],"countries":["IL"],"is_corresponding":false,"raw_author_name":"Amos Azaria","raw_affiliation_strings":["School of Computer Science, Ariel University, Israel"],"affiliations":[{"raw_affiliation_string":"School of Computer Science, Ariel University, Israel","institution_ids":["https://openalex.org/I52170813"]}]},{"author_position":"last","author":{"id":"https://openalex.org/A5102921433","display_name":"Tom M. Mitchell","orcid":"https://orcid.org/0000-0001-7373-0301"},"institutions":[{"id":"https://openalex.org/I74973139","display_name":"Carnegie Mellon University","ror":"https://ror.org/05x2bcf33","country_code":"US","type":"education","lineage":["https://openalex.org/I74973139"]}],"countries":["US"],"is_corresponding":false,"raw_author_name":"Tom Mitchell","raw_affiliation_strings":["Machine Learning Dept., Carnegie Mellon University, Pittsburgh, PA"],"affiliations":[{"raw_affiliation_string":"Machine Learning Dept., Carnegie Mellon University, Pittsburgh, PA","institution_ids":["https://openalex.org/I74973139"]}]}],"institution_assertions":[],"countries_distinct_count":2,"institutions_distinct_count":2,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":14.092,"has_fulltext":true,"fulltext_origin":"pdf","cited_by_count":37,"citation_normalized_percentile":{"value":0.999978,"is_in_top_1_percent":true,"is_in_top_10_percent":true},"cited_by_percentile_year":{"min":99,"max":100},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.997,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Natural Language Processing","score":0.997,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Statistical Machine Translation and Natural Language Processing","score":0.9833,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11636","display_name":"Artificial Intelligence in Medicine","score":0.9679,"subfield":{"id":"https://openalex.org/subfields/2718","display_name":"Health Informatics"},"field":{"id":"https://openalex.org/fields/27","display_name":"Medicine"},"domain":{"id":"https://openalex.org/domains/4","display_name":"Health Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/statement","display_name":"Statement (logic)","score":0.58637345},{"id":"https://openalex.org/keywords/language-modeling","display_name":"Language Modeling","score":0.546322}],"concepts":[{"id":"https://openalex.org/C2777530160","wikidata":"https://www.wikidata.org/wiki/Q41796","display_name":"Sentence","level":2,"score":0.8474411},{"id":"https://openalex.org/C95623464","wikidata":"https://www.wikidata.org/wiki/Q1096149","display_name":"Classifier (UML)","level":2,"score":0.8109231},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.7601501},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.5921321},{"id":"https://openalex.org/C2777026412","wikidata":"https://www.wikidata.org/wiki/Q2684591","display_name":"Statement (logic)","level":2,"score":0.58637345},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.5235509},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.3702156},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.32402253},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.09491822},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2023.findings-emnlp.68","pdf_url":"https://aclanthology.org/2023.findings-emnlp.68.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2304.13734","pdf_url":"https://arxiv.org/pdf/2304.13734","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://doi.org/10.18653/v1/2023.findings-emnlp.68","pdf_url":"https://aclanthology.org/2023.findings-emnlp.68.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true},"sustainable_development_goals":[{"score":0.46,"id":"https://metadata.un.org/sdg/16","display_name":"Peace, justice, and strong institutions"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":17,"referenced_works":["https://openalex.org/W2560647685","https://openalex.org/W2898875342","https://openalex.org/W2963961878","https://openalex.org/W3156158944","https://openalex.org/W3170432046","https://openalex.org/W4226278401","https://openalex.org/W4229005866","https://openalex.org/W4232704105","https://openalex.org/W4292779060","https://openalex.org/W4309674289","https://openalex.org/W4310419543","https://openalex.org/W4310926773","https://openalex.org/W4322718421","https://openalex.org/W4323572061","https://openalex.org/W4360836968","https://openalex.org/W4385495736","https://openalex.org/W4385570703"],"related_works":["https://openalex.org/W4389944781","https://openalex.org/W4383099232","https://openalex.org/W3200522959","https://openalex.org/W2997993211","https://openalex.org/W2475705533","https://openalex.org/W2253954117","https://openalex.org/W1993275793","https://openalex.org/W186129870","https://openalex.org/W167088980","https://openalex.org/W120415280"],"abstract_inverted_index":{"While":[0],"Large":[1],"Language":[2],"Models":[3],"(LLMs)":[4],"have":[5],"shown":[6],"exceptional":[7],"performance":[8,144],"in":[9,183,187,213],"various":[10],"tasks,":[11],"one":[12],"of":[13,46,85,101,104,118,181,206],"their":[14],"most":[15],"prominent":[16],"drawbacks":[17],"is":[18,65,77,165,172],"generating":[19],"inaccurate":[20],"or":[21,91],"false":[22],"information":[23],"with":[24],"a":[25,68,75,99,192],"confident":[26],"tone.":[27],"In":[28],"this":[29,170],"paper,":[30],"we":[31,137],"provide":[32],"evidence":[33],"that":[34,58,70,74,97,160],"the":[35,44,54,59,72,81,86,93,132,139,149,153,156,179,184,204],"LLM's":[36],"internal":[37],"state":[38],"can":[39],"be":[40],"used":[41],"to":[42,53,66,120,152,167,196,202],"reveal":[43],"truthfulness":[45],"statements.":[47],"This":[48],"includes":[49],"both":[50],"statements":[51,57],"provided":[52],"LLM,":[55],"and":[56,109,145,178,209],"LLM":[60,87,133],"itself":[61],"generates.":[62],"Our":[63],"approach":[64,195],"train":[67],"classifier":[69,114,190],"outputs":[71],"probability":[73,150,164,171],"statement":[76],"truthful,":[78],"based":[79,147],"on":[80,131,148,175],"hidden":[82],"layer":[83],"activations":[84],"as":[88],"it":[89],"reads":[90],"generates":[92],"statement.":[94],"Experiments":[95],"demonstrate":[96],"given":[98],"set":[100],"test":[102],"sentences,":[103],"which":[105,124],"half":[106,110],"are":[107,126],"true":[108,127],"false,":[111,129],"our":[112,142,188],"trained":[113,189],"achieves":[115],"an":[116],"average":[117],"71%":[119],"83%":[121],"accuracy":[122],"labeling":[123],"sentences":[125],"versus":[128],"depending":[130],"base":[134],"model.":[135],"Furthermore,":[136],"explore":[138],"relationship":[140],"between":[141],"classifier's":[143],"approaches":[146],"assigned":[151],"sentence":[154,163,168,176],"by":[155],"LLM.":[157],"We":[158],"show":[159],"while":[161],"LLM-assigned":[162],"related":[166],"truthfulness,":[169,198],"also":[173],"dependent":[174],"length":[177],"frequencies":[180],"words":[182],"sentence,":[185],"resulting":[186],"providing":[191],"more":[193],"reliable":[194],"detecting":[197],"highlighting":[199],"its":[200,210],"potential":[201],"enhance":[203],"reliability":[205],"LLM-generated":[207],"content":[208],"practical":[211],"applicability":[212],"real-world":[214],"scenarios.":[215]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4389519449","counts_by_year":[{"year":2024,"cited_by_count":16},{"year":2023,"cited_by_count":17}],"updated_date":"2024-11-28T20:33:29.836730","created_date":"2023-12-10"}