{"id":"https://openalex.org/W4388032098","doi":"https://doi.org/10.48550/arxiv.2310.18313","title":"FP8-LM: Training FP8 Large Language Models","display_name":"FP8-LM: Training FP8 Large Language Models","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4388032098","doi":"https://doi.org/10.48550/arxiv.2310.18313"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.18313","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2310.18313","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080495327","display_name":"Houwen Peng","orcid":"https://orcid.org/0000-0001-8544-8952"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Peng, Houwen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100368801","display_name":"Kan Wu","orcid":"https://orcid.org/0000-0002-8306-5066"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, Kan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5009248953","display_name":"Yixuan Wei","orcid":"https://orcid.org/0000-0003-1775-7301"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wei, Yixuan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5079805574","display_name":"Guoshuai Zhao","orcid":"https://orcid.org/0000-0003-4392-8450"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhao, Guoshuai","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5016817843","display_name":"Yuxiang Yang","orcid":"https://orcid.org/0000-0002-1837-3628"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Yuxiang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100349457","display_name":"Ze Liu","orcid":"https://orcid.org/0000-0002-5121-7547"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Ze","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102245054","display_name":"Yifan Xiong","orcid":"https://orcid.org/0000-0002-1921-7424"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiong, Yifan","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5029661948","display_name":"Ziyue Yang","orcid":"https://orcid.org/0000-0002-1658-0260"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yang, Ziyue","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5012017422","display_name":"Bolin Ni","orcid":"https://orcid.org/0009-0000-7160-5523"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ni, Bolin","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5109632338","display_name":"Jingcheng Hu","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Jingcheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5019706917","display_name":"Ruihang Li","orcid":"https://orcid.org/0009-0008-5164-5470"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Ruihang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104189549","display_name":"Miaosen Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Miaosen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100369870","display_name":"Chen Li","orcid":"https://orcid.org/0000-0002-7508-7222"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Li, Chen","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5101709263","display_name":"Ning Jia","orcid":"https://orcid.org/0000-0002-2395-3432"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Ning, Jia","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5102754531","display_name":"Ruizhe Wang","orcid":"https://orcid.org/0000-0002-2455-9074"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wang, Ruizhe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5057219313","display_name":"Zheng Zhang","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Zhang, Zheng","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100384089","display_name":"Shuguang Liu","orcid":"https://orcid.org/0000-0003-3564-8336"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Shuguang","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5104754746","display_name":"Joe Chau","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Chau, Joe","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5091049278","display_name":"Hu Han","orcid":"https://orcid.org/0000-0001-6010-1792"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Hu, Han","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5111090174","display_name":"Peng Cheng","orcid":"https://orcid.org/0000-0002-4453-2274"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Cheng, Peng","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":2,"citation_normalized_percentile":{"value":0.710701,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":76,"max":82},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9995,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10181","display_name":"Natural Language Processing Techniques","score":0.9987,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9667,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/aka","display_name":"AKA","score":0.6750889}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.79177654},{"id":"https://openalex.org/C121158502","wikidata":"https://www.wikidata.org/wiki/Q4652161","display_name":"AKA","level":2,"score":0.6750889},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.52869403},{"id":"https://openalex.org/C97541855","wikidata":"https://www.wikidata.org/wiki/Q830687","display_name":"Reinforcement learning","level":2,"score":0.47694364},{"id":"https://openalex.org/C2777211547","wikidata":"https://www.wikidata.org/wiki/Q17141490","display_name":"Training (meteorology)","level":2,"score":0.46914592},{"id":"https://openalex.org/C66322947","wikidata":"https://www.wikidata.org/wiki/Q11658","display_name":"Transformer","level":3,"score":0.45092887},{"id":"https://openalex.org/C137293760","wikidata":"https://www.wikidata.org/wiki/Q3621696","display_name":"Language model","level":2,"score":0.44349357},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.4292401},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.38174164},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.11498684},{"id":"https://openalex.org/C121332964","wikidata":"https://www.wikidata.org/wiki/Q413","display_name":"Physics","level":0,"score":0.0},{"id":"https://openalex.org/C62520636","wikidata":"https://www.wikidata.org/wiki/Q944","display_name":"Quantum mechanics","level":1,"score":0.0},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.0},{"id":"https://openalex.org/C161191863","wikidata":"https://www.wikidata.org/wiki/Q199655","display_name":"Library science","level":1,"score":0.0},{"id":"https://openalex.org/C153294291","wikidata":"https://www.wikidata.org/wiki/Q25261","display_name":"Meteorology","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.18313","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2310.18313","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2310.18313","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"other-oa","license_id":"https://openalex.org/licenses/other-oa","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4288365749","https://openalex.org/W4287826556","https://openalex.org/W4287598411","https://openalex.org/W3198458223","https://openalex.org/W3126642501","https://openalex.org/W3100913109","https://openalex.org/W3098382480","https://openalex.org/W3094871513","https://openalex.org/W3013624417","https://openalex.org/W2936497627"],"abstract_inverted_index":{"In":[0],"this":[1],"paper,":[2],"we":[3,49],"explore":[4],"FP8":[5,53,66,106,155,186],"low-bit":[6],"data":[7,36],"formats":[8,37],"for":[9,57,75,149],"efficient":[10],"training":[11,32,58,74,97,108,147,157,188],"of":[12,65,98,137],"large":[13,150],"language":[14],"models":[15],"(LLMs).":[16],"Our":[17,185],"key":[18],"insight":[19],"is":[20,159,190],"that":[21],"most":[22],"variables,":[23],"such":[24,169],"as":[25,170],"gradients":[26],"and":[27,42,71,84,174],"optimizer":[28,82],"states,":[29,83],"in":[30,87,117,182],"LLM":[31,171],"can":[33,162],"employ":[34],"low-precision":[35,187],"without":[38],"compromising":[39],"model":[40,100],"accuracy":[41],"requiring":[43],"no":[44],"changes":[45],"to":[46,68,166],"hyper-parameters.":[47],"Specifically,":[48],"propose":[50],"a":[51,113],"new":[52],"automatic":[54],"mixed-precision":[55,70,107,156],"framework":[56,61,109,131,189],"LLMs.":[59,76],"This":[60,143],"offers":[62],"three":[63],"levels":[64],"utilization":[67],"streamline":[69],"distributed":[72,85],"parallel":[73],"It":[77,161],"gradually":[78],"incorporates":[79],"8-bit":[80],"gradients,":[81],"learning":[86,176],"an":[88],"incremental":[89],"manner.":[90],"Experiment":[91],"results":[92],"show":[93],"that,":[94],"during":[95],"the":[96,127,135,146],"GPT-175B":[99],"on":[101],"H100":[102],"GPU":[103],"platform,":[104],"our":[105,154],"not":[110],"only":[111],"achieved":[112],"remarkable":[114],"39%":[115],"reduction":[116],"real":[118],"memory":[119],"usage":[120],"but":[121],"also":[122],"ran":[123],"75%":[124],"faster":[125],"than":[126],"widely":[128],"adopted":[129],"BF16":[130],"(i.e.,":[132],"Megatron-LM),":[133],"surpassing":[134],"speed":[136],"Nvidia":[138],"Transformer":[139],"Engine":[140],"by":[141],"37%.":[142],"largely":[144],"reduces":[145],"costs":[148],"foundation":[151],"models.":[152],"Furthermore,":[153],"methodology":[158],"generic.":[160],"be":[163],"seamlessly":[164],"applied":[165],"other":[167],"tasks":[168],"instruction":[172],"tuning":[173],"reinforcement":[175],"with":[177],"human":[178],"feedback,":[179],"offering":[180],"savings":[181],"fine-tuning":[183],"expenses.":[184],"open-sourced":[191],"at":[192],"{https://github.com/Azure/MS-AMP}{aka.ms/MS.AMP}.":[193]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4388032098","counts_by_year":[{"year":2024,"cited_by_count":2}],"updated_date":"2025-05-02T07:49:45.840060","created_date":"2023-11-01"}