{"id":"https://openalex.org/W4318908026","doi":"https://doi.org/10.48550/arxiv.2301.13703","title":"Dissecting the Effects of SGD Noise in Distinct Regimes of Deep Learning","display_name":"Dissecting the Effects of SGD Noise in Distinct Regimes of Deep Learning","publication_year":2023,"publication_date":"2023-01-01","ids":{"openalex":"https://openalex.org/W4318908026","doi":"https://doi.org/10.48550/arxiv.2301.13703"},"language":"en","primary_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2301.13703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"type":"preprint","type_crossref":"posted-content","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/abs/2301.13703","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5080770867","display_name":"Antonio Sclocchi","orcid":"https://orcid.org/0000-0001-7755-1634"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sclocchi, Antonio","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5002843500","display_name":"Mario Geiger","orcid":"https://orcid.org/0000-0001-5433-0900"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Geiger, Mario","raw_affiliation_strings":[],"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5019813807","display_name":"Matthieu Wyart","orcid":"https://orcid.org/0000-0003-0644-0990"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Wyart, Matthieu","raw_affiliation_strings":[],"affiliations":[]}],"institution_assertions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":false,"cited_by_count":0,"citation_normalized_percentile":{"value":0.0,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":{"min":0,"max":68},"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"primary_topic":{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9957,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T11948","display_name":"Machine Learning in Materials Science","score":0.9957,"subfield":{"id":"https://openalex.org/subfields/2505","display_name":"Materials Chemistry"},"field":{"id":"https://openalex.org/fields/25","display_name":"Materials Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10320","display_name":"Neural Networks and Applications","score":0.9929,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10775","display_name":"Generative Adversarial Networks and Image Synthesis","score":0.9788,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/stochastic-gradient-descent","display_name":"Stochastic Gradient Descent","score":0.77236015},{"id":"https://openalex.org/keywords/initialization","display_name":"Initialization","score":0.5213253},{"id":"https://openalex.org/keywords/mnist-database","display_name":"MNIST database","score":0.43446738}],"concepts":[{"id":"https://openalex.org/C206688291","wikidata":"https://www.wikidata.org/wiki/Q7617819","display_name":"Stochastic gradient descent","level":3,"score":0.77236015},{"id":"https://openalex.org/C99498987","wikidata":"https://www.wikidata.org/wiki/Q2210247","display_name":"Noise (video)","level":3,"score":0.7397446},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.55000156},{"id":"https://openalex.org/C26517878","wikidata":"https://www.wikidata.org/wiki/Q228039","display_name":"Key (lock)","level":2,"score":0.5332529},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.53209674},{"id":"https://openalex.org/C114466953","wikidata":"https://www.wikidata.org/wiki/Q6034165","display_name":"Initialization","level":2,"score":0.5213253},{"id":"https://openalex.org/C153258448","wikidata":"https://www.wikidata.org/wiki/Q1199743","display_name":"Gradient descent","level":3,"score":0.49040747},{"id":"https://openalex.org/C50644808","wikidata":"https://www.wikidata.org/wiki/Q192776","display_name":"Artificial neural network","level":2,"score":0.47207},{"id":"https://openalex.org/C177148314","wikidata":"https://www.wikidata.org/wiki/Q170084","display_name":"Generalization","level":2,"score":0.43785074},{"id":"https://openalex.org/C190502265","wikidata":"https://www.wikidata.org/wiki/Q17069496","display_name":"MNIST database","level":3,"score":0.43446738},{"id":"https://openalex.org/C177264268","wikidata":"https://www.wikidata.org/wiki/Q1514741","display_name":"Set (abstract data type)","level":2,"score":0.41095665},{"id":"https://openalex.org/C119857082","wikidata":"https://www.wikidata.org/wiki/Q2539","display_name":"Machine learning","level":1,"score":0.34059805},{"id":"https://openalex.org/C33923547","wikidata":"https://www.wikidata.org/wiki/Q395","display_name":"Mathematics","level":0,"score":0.2684291},{"id":"https://openalex.org/C134306372","wikidata":"https://www.wikidata.org/wiki/Q7754","display_name":"Mathematical analysis","level":1,"score":0.08683935},{"id":"https://openalex.org/C38652104","wikidata":"https://www.wikidata.org/wiki/Q3510521","display_name":"Computer security","level":1,"score":0.0},{"id":"https://openalex.org/C115961682","wikidata":"https://www.wikidata.org/wiki/Q860623","display_name":"Image (mathematics)","level":2,"score":0.0},{"id":"https://openalex.org/C199360897","wikidata":"https://www.wikidata.org/wiki/Q9143","display_name":"Programming language","level":1,"score":0.0}],"mesh":[],"locations_count":3,"locations":[{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2301.13703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":true,"landing_page_url":"http://arxiv.org/abs/2301.13703","pdf_url":"http://arxiv.org/pdf/2301.13703","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":null,"license_id":null,"version":"submittedVersion","is_accepted":false,"is_published":false},{"is_oa":false,"landing_page_url":"https://api.datacite.org/dois/10.48550/arxiv.2301.13703","pdf_url":null,"source":{"id":"https://openalex.org/S4393179698","display_name":"DataCite API","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I4210145204","host_organization_name":"DataCite","host_organization_lineage":["https://openalex.org/I4210145204"],"host_organization_lineage_names":["DataCite"],"type":"metadata"},"license":null,"license_id":null,"version":null}],"best_oa_location":{"is_oa":true,"landing_page_url":"https://arxiv.org/abs/2301.13703","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":["Cornell University"],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false},"sustainable_development_goals":[{"display_name":"Peace, justice, and strong institutions","score":0.83,"id":"https://metadata.un.org/sdg/16"}],"grants":[],"datasets":[],"versions":[],"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4315785049","https://openalex.org/W4309434778","https://openalex.org/W4283773090","https://openalex.org/W4226299596","https://openalex.org/W4225929918","https://openalex.org/W3214759249","https://openalex.org/W3118286710","https://openalex.org/W3095450733","https://openalex.org/W3015792155","https://openalex.org/W2999408031"],"abstract_inverted_index":{"Understanding":[0],"when":[1,260],"the":[2,20,34,43,46,51,69,96,112,124,128,145,150,156,173,181,213,241,261,264,274,288,294,299],"noise":[3,38,103,151,206,280],"in":[4,26,95,209,273],"stochastic":[5],"gradient":[6,59],"descent":[7],"(SGD)":[8],"affects":[9,40],"generalization":[10],"of":[11,36,45,53,80,152,166,184,204,263,290,296],"deep":[12],"neural":[13],"networks":[14,23],"remains":[15],"a":[16,63,163,201,232,245,253],"challenge,":[17],"complicated":[18],"by":[19,211],"fact":[21],"that":[22,66,101,144,175,200,224],"can":[24,104,133,281],"operate":[25],"distinct":[27],"training":[28,47,113,210,247,255,265],"regimes.":[29],"Here":[30],"we":[31,142,222,303],"study":[32],"how":[33],"magnitude":[35],"this":[37,170],"$T$":[39,118,191],"performance":[41,94],"as":[42,180,194,302],"size":[44,262],"set":[48,266],"$P$":[49,193,267],"and":[50,82,192,252,279],"scale":[52],"initialization":[54],"$\\alpha$":[55,61,121],"are":[56,219,257],"varied.":[57],"For":[58,78],"descent,":[60],"is":[62,71,162],"key":[64,176,202],"parameter":[65],"controls":[67],"if":[68],"network":[70],"`lazy'($\\alpha\\gg1$)":[72],"or":[73,107,119],"instead":[74,108],"learns":[75],"features":[76],"($\\alpha\\ll1$).":[77],"classification":[79],"MNIST":[81],"CIFAR10":[83],"images,":[84],"our":[85],"central":[86],"results":[87,198],"are:":[88],"(i)":[89],"obtaining":[90],"phase":[91],"diagrams":[92],"for":[93],"$(\\alpha,T)$":[97],"plane.":[98],"They":[99],"show":[100],"SGD":[102,153,205,227,291],"be":[105,282],"detrimental":[106],"useful":[109],"depending":[110],"on":[111,137,189,293],"regime.":[114],"Moreover,":[115],"although":[116],"increasing":[117],"decreasing":[120],"both":[122,190],"allow":[123],"net":[125],"to":[126,226,239,244],"escape":[127],"lazy":[129],"regime,":[130],"these":[131,271],"changes":[132],"have":[134],"opposite":[135],"effects":[136],"performance.":[138],"(ii)":[139],"Most":[140],"importantly,":[141],"find":[143],"characteristic":[146],"temperature":[147],"$T_c$":[148],"where":[149,277],"starts":[154],"affecting":[155,212],"trained":[157],"model":[158],"(and":[159],"eventually":[160],"performance)":[161],"power":[164,195],"law":[165],"$P$.":[167],"We":[168,269],"relate":[169],"finding":[171],"with":[172],"observation":[174],"dynamical":[177],"quantities,":[178],"such":[179],"total":[182],"variation":[183],"weights":[185],"during":[186],"training,":[187],"depend":[188,292],"laws.":[196],"These":[197],"indicate":[199],"effect":[203,289],"occurs":[207],"late":[208],"stopping":[214],"process":[215],"whereby":[216],"all":[217],"data":[218,297],"fitted.":[220],"Indeed,":[221],"argue":[223],"due":[225],"noise,":[228],"nets":[229],"must":[230],"develop":[231],"stronger":[233,250],"`signal',":[234],"i.e.":[235],"larger":[236],"informative":[237],"weights,":[238],"fit":[240],"data,":[242],"leading":[243],"longer":[246,254],"time.":[248],"A":[249],"signal":[251,278],"time":[256],"also":[258],"required":[259],"increases.":[268],"confirm":[270],"views":[272],"perceptron":[275],"model,":[276],"precisely":[283],"measured.":[284],"Interestingly,":[285],"exponents":[286],"characterizing":[287],"density":[295],"near":[298],"decision":[300],"boundary,":[301],"explain.":[304]},"cited_by_api_url":"https://api.openalex.org/works?filter=cites:W4318908026","counts_by_year":[],"updated_date":"2024-12-07T05:26:04.573651","created_date":"2023-02-03"}