{"id":"https://openalex.org/W4322616354","doi":"https://doi.org/10.48550/arxiv.2302.12444","title":"On the Training Instability of Shuffling SGD with Batch Normalization","display_name":"On the Training Instability of Shuffling SGD with Batch Normalization","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4322616354","doi":"https://doi.org/10.48550/arxiv.2302.12444"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.12444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2302.12444","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5045114235","display_name":"David X. Wu","orcid":"https://orcid.org/0000-0003-4863-4689"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wu, David X.","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5045562332","display_name":"Chulhee Yun","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Yun, Chulhee","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5058767558","display_name":"Suvrit Sra","orcid":"https://orcid.org/0000-0001-8516-4925"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sra, Suvrit","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":65},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9963,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T12261","display_name":"Statistical Mechanics and Entropy","score":0.9884,"subfield":{"id":"https://openalex.org/subfields/3109","display_name":"Statistical and Nonlinear Physics"},"field":{"id":"https://openalex.org/fields/31","display_name":"Physics and Astronomy"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10057","display_name":"Face and Expression Recognition","score":0.9534,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/normalization","display_name":"Normalization","score":0.93592495},{"id":"https://openalex.org/keywords/shuffling","display_name":"Shuffling","score":0.7486067},{"id":"https://openalex.org/keywords/divergence","display_name":"Divergence (linguistics)","score":0.6766157},{"id":"https://openalex.org/keywords/kullback\u2013leibler-divergence","display_name":"Kullback\u2013Leibler divergence","score":0.66842234},{"id":"https://openalex.org/keywords/local-optimum","display_name":"Local optimum","score":0.44919702},{"id":"https://openalex.org/keywords/maxima","display_name":"Maxima","score":0.43841818}],"concepts":[{"id":"https://openalex.org/C136886441","wikidata":"https://www.wikidata.org/wiki/Q926129","display_name":"Normalization (sociology)","level":2,"score":0.93592495},{"id":"https://openalex.org/C167927819","wikidata":"https://www.wikidata.org/wiki/Q1930567","display_name":"Shuffling","level":2,"score":0.7486067},{"id":"https://openalex.org/C207390915","wikidata":"https://www.wikidata.org/wiki/Q1230525","display_name":"Divergence (linguistics)","level":2,"score":0.6766157},{"id":"https://openalex.org/C171752962","wikidata":"https://www.wikidata.org/wiki/Q255166","display_name":"Kullback\u2013Leibler divergence","level":2,"score":0.66842234},{"id":"https://openalex.org/C83546350","wikidata":"https://www.wikidata.org/wiki/Q1139051","display_name":"Regression","level":2,"score":0.50552535},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.46752715},{"id":"https://openalex.org/C141934464","wikidata":"https://www.wikidata.org/wiki/Q3305386","display_name":"Local optimum","level":2,"score":0.44919702},{"id":"https://openalex.org/C91528185","wikidata":"https://www.wikidata.org/wiki/Q520952","display_name":"Maxima","level":3,"score":0.43841818},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3868639},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.34999517},{"id":"https://openalex.org/C105795698","wikidata":"https://www.wikidata.org/wiki/Q12483","display_name":"Statistics","level":1,"score":0.24574286},{"id":"https://openalex.org/C142362112","wikidata":"https://www.wikidata.org/wiki/Q735","display_name":"Art","level":0,"score":0.0},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C144024400","wikidata":"https://www.wikidata.org/wiki/Q21201","display_name":"Sociology","level":0,"score":0.0},{"id":"https://openalex.org/C554144382","wikidata":"https://www.wikidata.org/wiki/Q213156","display_name":"Performance art","level":2,"score":0.0},{"id":"https://openalex.org/C19165224","wikidata":"https://www.wikidata.org/wiki/Q23404","display_name":"Anthropology","level":1,"score":0.0},{"id":"https://openalex.org/C52119013","wikidata":"https://www.wikidata.org/wiki/Q50637","display_name":"Art history","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.12444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2302.12444","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2302.12444","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_indexed_in_scopus":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4388857216","https://openalex.org/W4377131110","https://openalex.org/W4294559962","https://openalex.org/W4213239787","https://openalex.org/W3104650831","https://openalex.org/W2952944668","https://openalex.org/W2583823008","https://openalex.org/W2114337652","https://openalex.org/W2046590706","https://openalex.org/W2045745654"],"abstract_inverted_index":{"We":[0,107,132],"uncover":[1],"how":[2,21,113],"SGD":[3,35],"interacts":[4],"with":[5,68,153],"batch":[6,44,69,154],"normalization":[7,155],"and":[8,25,75,101,104,121,130,143,150],"can":[9],"exhibit":[10],"undesirable":[11],"training":[12,54,97],"dynamics":[13],"such":[14],"as":[15],"divergence.":[16,131],"More":[17],"precisely,":[18],"we":[19,71,92],"study":[20],"Single":[22],"Shuffle":[23],"(SS)":[24],"Random":[26],"Reshuffle":[27],"(RR)":[28],"--":[29,36],"two":[30],"widely":[31],"used":[32,152],"variants":[33],"of":[34,43,53],"interact":[37],"surprisingly":[38],"differently":[39],"in":[40,119,140,158],"the":[41,146],"presence":[42],"normalization:":[45],"RR":[46,76,102,126,151],"leads":[47,115],"to":[48,78,111,116],"much":[49],"more":[50],"stable":[51],"evolution":[52],"loss":[55],"than":[56],"SS.":[57],"As":[58],"a":[59,65],"concrete":[60],"example,":[61],"for":[62,90,99,123],"regression":[63,120],"using":[64],"linear":[66],"network":[67],"normalization,":[70],"prove":[72],"that":[73,82,145],"SS":[74,100,114,149],"converge":[77],"distinct":[79],"global":[80],"optima":[81,118],"are":[83],"\"distorted\"":[84],"away":[85],"from":[86],"gradient":[87],"descent.":[88],"Thereafter,":[89],"classification":[91],"characterize":[93],"conditions":[94],"under":[95],"which":[96],"divergence":[98,122],"can,":[103],"cannot":[105],"occur.":[106],"present":[108],"explicit":[109],"constructions":[110],"show":[112],"distorted":[117],"classification,":[124],"whereas":[125],"avoids":[127],"both":[128],"distortion":[129],"validate":[133],"our":[134],"results":[135],"by":[136],"confirming":[137],"them":[138],"empirically":[139],"realistic":[141],"settings,":[142],"conclude":[144],"separation":[147],"between":[148],"is":[156],"relevant":[157],"practice.":[159]},"abstract_inverted_index_v3":null,"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4322616354","counts_by_year":[],"updated_date":"2025-04-13T23:22:49.575237","created_date":"2023-03-01"}