{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T11:35:58Z","timestamp":1730201758240,"version":"3.28.0"},"reference-count":49,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,11,1]],"date-time":"2022-11-01T00:00:00Z","timestamp":1667260800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,11,1]],"date-time":"2022-11-01T00:00:00Z","timestamp":1667260800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,11]]},"DOI":"10.1109\/candar57322.2022.00016","type":"proceedings-article","created":{"date-parts":[[2023,2,8]],"date-time":"2023-02-08T18:57:03Z","timestamp":1675882623000},"page":"66-75","source":"Crossref","is-referenced-by-count":1,"title":["Component-Wise Natural Gradient Descent - An Efficient Neural Network Optimization"],"prefix":"10.1109","author":[{"ORCID":"http:\/\/orcid.org\/0000-0002-4211-049X","authenticated-orcid":false,"given":"Tran","family":"Van Sang","sequence":"first","affiliation":[{"name":"The University of Tokyo,Japan"}]},{"given":"Mhd","family":"Irvan","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]},{"ORCID":"http:\/\/orcid.org\/0000-0002-6359-2221","authenticated-orcid":false,"given":"Rie Shigetomi","family":"Yamaguchi","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]},{"ORCID":"http:\/\/orcid.org\/0000-0001-6383-7105","authenticated-orcid":false,"given":"Toshiyuki","family":"Nakata","sequence":"additional","affiliation":[{"name":"The University of Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_38"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"volume-title":"Neural networks for machine learning, overview of mini-batch gradient descent","author":"Hinton","key":"ref3"},{"key":"ref4","article-title":"Adadelta: An adaptive learning rate method","author":"Zeiler","year":"2012","journal-title":"ArXiv"},{"key":"ref5","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","author":"Duchi","year":"2011","journal-title":"J. Mach. Learn. Res."},{"key":"ref6","article-title":"On the importance of initialization and momentum in deep learning","volume-title":"Proceedings of the 30th International Conference on Machine Learning","author":"Sutskever","year":"2013"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CAC53003.2021.9727247"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1162\/089976698300017746"},{"volume-title":"Asymptotic methods in statistical decision theory","year":"1986","author":"LeCam","key":"ref9"},{"key":"ref10","article-title":"Exact natural gradient in deep linear networks and its application to the nonlinear case","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Bernacchia"},{"key":"ref11","article-title":"Fast convergence of natural gradient descent for over-parameterized neural networks","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Zhang"},{"volume-title":"Gram-gauss-newton method: Learning overparameterized neural networks for regression problems","year":"2019","author":"Cai","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ac3ae3"},{"key":"ref14","first-page":"2408","article-title":"Optimizing neural networks with kronecker-factored approximate curvature","volume-title":"Proceedings of the 32nd International Conference on Machine Learning","volume":"37","author":"Martens"},{"key":"ref15","first-page":"573","article-title":"A kronecker-factored approximate fisher matrix for convolution layers","volume-title":"Proceedings of the 33rd International Conference on International Conference on Machine Learning - Volume 48","author":"Grosse"},{"key":"ref16","article-title":"Distributed second-order optimization using kronecker-factored approximations","volume-title":"International Conference on Learning Representations","author":"Ba","year":"2017"},{"key":"ref17","article-title":"Kronecker-factored curvature approximations for recurrent neural networks","volume-title":"International Conference on Learning Representations","author":"Martens","year":"2018"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1162\/089976600300015637"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-57369-0_29"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1093\/imaiai\/iav006"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1162\/08997660260028683"},{"key":"ref22","article-title":"Limitations of the empirical fisher approximation for natural gradient descent","volume-title":"Advances in Neural Information Processing Systems","author":"Kunstner","year":"2019"},{"key":"ref23","article-title":"Topmoumoute online natural gradient algorithm","volume-title":"Advances in Neural Information Processing Systems","volume":"20","author":"Roux"},{"key":"ref24","article-title":"Fisher information and natural gradient learning in random deep networks","volume-title":"Proceedings of the Twenty-Second International Conference on Artificial Intelligence and Statistics","author":"Amari","year":"2019"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/BF01589116"},{"key":"ref26","article-title":"A multi-batch l-bfgs method for machine learning","volume-title":"Advances in Neural Information Processing Systems","author":"Berahas","year":"2016"},{"volume-title":"MNIST handwritten digit database","year":"2010","author":"LeCun","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/S0893-6080(00)00051-4"},{"key":"ref29","first-page":"1032","article-title":"Universal statistics of fisher information in deep neural networks: Mean field approach","volume-title":"Proceedings of the Twenty-Second International Conference on Artificial Intelligence and Statistics","volume":"89","author":"Karakida"},{"key":"ref30","article-title":"New insights and perspectives on the natural gradient method","author":"Martens","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"ref31","article-title":"Revisiting natural gradient for deep networks","volume-title":"International Conference on Learning Representations","author":"Pascanu","year":"2014"},{"key":"ref32","first-page":"2865","article-title":"Mean field residual networks: On the edge of chaos","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Yang"},{"key":"ref33","article-title":"Random feedback weights support learning in deep neural networks","volume":"abs\/1411.0247","author":"Lillicrap","year":"2014","journal-title":"ArXiv"},{"key":"ref34","article-title":"Deep information propagation","author":"Schoenholz","year":"2016","journal-title":"arXiv preprint"},{"key":"ref35","article-title":"Scaling limits of wide neural networks with weight sharing: Gaussian process behavior, gradient independence, and neural tangent kernel derivation","volume":"abs\/1902.04760","author":"Yang","year":"2019","journal-title":"CoRR"},{"key":"ref36","article-title":"Mean field theory for deep dropout networks: digging up gradient backpropagation deeply","volume-title":"24th European Conference on Artificial Intelligence - ECAI 2020","author":"Huang","year":"2020"},{"key":"ref37","article-title":"Deep mean field theory: Layer-wise variance and width variation as methods to control gradient explosion","volume-title":"ICLR 2018","author":"Yang","year":"2018"},{"volume-title":"Dynamical isometry and a mean field theory of cnns: How to train 10,000-layer vanilla convolutional neural networks","year":"2018","author":"Xiao","key":"ref38"},{"key":"ref39","article-title":"The normalization method for alleviating pathological sharpness in wide neural networks","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Karakida"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1002\/bimj.19640060317"},{"volume-title":"Derivations of the fisher information","author":"Rothman","key":"ref41"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511813658"},{"issue":"026261183x","key":"ref43","volume-title":"A Guide to Econometrics","volume":"1","author":"Kennedy","year":"2003"},{"volume-title":"Numerical Optimization","year":"2006","author":"Nocedal","key":"ref44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.123"},{"volume-title":"TensorFlow: Large-scale machine learning on heterogeneous systems","year":"2015","author":"Abadi","key":"ref46"},{"volume-title":"K-fac: Kronecker-factored approximate curvature","year":"2022","key":"ref47"},{"volume-title":"Cifar-10 (canadian institute for advanced research)","author":"Krizhevsky","key":"ref48"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"}],"event":{"name":"2022 Tenth International Symposium on Computing and Networking (CANDAR)","start":{"date-parts":[[2022,11,21]]},"location":"Himeji, Japan","end":{"date-parts":[[2022,11,24]]}},"container-title":["2022 Tenth International Symposium on Computing and Networking (CANDAR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10035105\/10035109\/10035232.pdf?arnumber=10035232","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,13]],"date-time":"2024-02-13T13:47:21Z","timestamp":1707832041000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10035232\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11]]},"references-count":49,"URL":"https:\/\/doi.org\/10.1109\/candar57322.2022.00016","relation":{},"subject":[],"published":{"date-parts":[[2022,11]]}}}