{"id":"https://openalex.org/W4391462748","doi":"https://doi.org/10.48550/arxiv.2401.17633","title":"Navigating the OverKill in Large Language Models","display_name":"Navigating the OverKill in Large Language Models","publication_year":2024,"publication_date":"2024-01-31","ids":{"openalex":"https://openalex.org/W4391462748","doi":"https://doi.org/10.48550/arxiv.2401.17633"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.17633","pdf_url":"http://arxiv.org/pdf/2401.17633","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"http://arxiv.org/pdf/2401.17633","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5101796337","display_name":"Chenyu Shi","orcid":"https://orcid.org/0000-0002-9097-118X"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Shi, Chenyu","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100411477","display_name":"Xiao Wang","orcid":"https://orcid.org/0000-0002-5552-2354"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Xiao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109786310","display_name":"Qiming Ge","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ge, Qiming","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5050969774","display_name":"Songyang Gao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gao, Songyang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109445159","display_name":"Xianjun Yang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Xianjun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5058353652","display_name":"Tao Gui","orcid":"https://orcid.org/0000-0002-6154-0751"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gui, Tao","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101829098","display_name":"Qi Zhang","orcid":"https://orcid.org/0000-0002-3586-1164"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Qi","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5088834359","display_name":"Xuanjing Huang","orcid":"https://orcid.org/0000-0001-9197-9426"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Huang, Xuanjing","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100975490","display_name":"Xun Zhao","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Xun","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5101338641","display_name":"Dahua Lin","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Lin, Dahua","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":77},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7703,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.7703,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.7427,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.5142843},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.39549482},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.393122},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.15152124}],"mesh":[],"locations_count":1,"locations":[{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.17633","pdf_url":"http://arxiv.org/pdf/2401.17633","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false}],"best_oa_location":{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2401.17633","pdf_url":"http://arxiv.org/pdf/2401.17633","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2748952813","https://openalex.org/W2530322880","https://openalex.org/W2390279801","https://openalex.org/W2382290278","https://openalex.org/W2376932109","https://openalex.org/W2359140296","https://openalex.org/W2358668433","https://openalex.org/W2350741829","https://openalex.org/W2001405890","https://openalex.org/W1596801655"],"abstract_inverted_index":{"Large":[0],"language":[1],"models":[2,22,41],"are":[3],"meticulously":[4],"aligned":[5],"to":[6,16,25,59,88,108],"be":[7],"both":[8],"helpful":[9],"and":[10,43,67,85],"harmless.":[11],"However,":[12],"recent":[13],"research":[14],"points":[15],"a":[17,83],"potential":[18],"overkill":[19,37],"which":[20],"means":[21],"may":[23],"refuse":[24],"answer":[26],"benign":[27],"queries.":[28,48],"In":[29],"this":[30,90],"paper,":[31],"we":[32,78,121],"investigate":[33],"the":[34,45,52,99,102,123,129,132,149],"factors":[35],"for":[36],"by":[38,97,127,152],"exploring":[39],"how":[40],"handle":[42],"determine":[44,122],"safety":[46,70],"of":[47,54,62,148],"Our":[49],"findings":[50],"reveal":[51],"presence":[53],"shortcuts":[55],"within":[56],"models,":[57],"leading":[58],"an":[60,116,145],"over-attention":[61,96,130],"harmful":[63],"words":[64],"like":[65],"'kill'":[66],"prompts":[68,110],"emphasizing":[69],"will":[71],"exacerbate":[72],"overkill.":[73],"Based":[74],"on":[75,118,159],"these":[76],"insights,":[77],"introduce":[79],"Self-Contrastive":[80],"Decoding":[81],"(Self-CD),":[82],"training-free":[84],"model-agnostic":[86],"strategy,":[87],"alleviate":[89],"phenomenon.":[91],"We":[92],"first":[93],"extract":[94],"such":[95],"amplifying":[98],"difference":[100],"in":[101],"model's":[103],"output":[104],"distributions":[105],"when":[106],"responding":[107],"system":[109],"that":[111,140],"either":[112],"include":[113],"or":[114],"omit":[115],"emphasis":[117],"safety.":[119,160],"Then":[120],"final":[124],"next-token":[125],"predictions":[126],"downplaying":[128],"from":[131],"model":[133],"via":[134],"contrastive":[135],"decoding.":[136],"Empirical":[137],"results":[138],"indicate":[139],"our":[141],"method":[142],"has":[143],"achieved":[144],"average":[146],"reduction":[147],"refusal":[150],"rate":[151],"20\\%":[153],"while":[154],"having":[155],"almost":[156],"no":[157],"impact":[158]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4391462748","counts_by_year":[],"updated_date":"2025-04-22T07:35:20.209293","created_date":"2024-02-02"}