{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T09:47:34Z","timestamp":1726048054345},"reference-count":25,"publisher":"Springer Science and Business Media LLC","issue":"2-3","license":[{"start":{"date-parts":[[2015,3,3]],"date-time":"2015-03-03T00:00:00Z","timestamp":1425340800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2015,9]]},"DOI":"10.1007\/s10994-015-5484-1","type":"journal-article","created":{"date-parts":[[2015,3,2]],"date-time":"2015-03-02T20:37:57Z","timestamp":1425328677000},"page":"255-283","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Policy gradient in Lipschitz Markov Decision Processes"],"prefix":"10.1007","volume":"100","author":[{"given":"Matteo","family":"Pirotta","sequence":"first","affiliation":[]},{"given":"Marcello","family":"Restelli","sequence":"additional","affiliation":[]},{"given":"Luca","family":"Bascetta","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,3,3]]},"reference":[{"key":"5484_CR1","doi-asserted-by":"crossref","unstructured":"Amari, S., & Douglas, S. (1998). Why natural gradient? In: Acoustics, Speech and Signal Processing, 1998. Proceedings of the 1998 IEEE international conference on, vol 2, pp. 1213\u20131216 vol. 2, doi: 10.1109\/ICASSP.1998.675489 .","DOI":"10.1109\/ICASSP.1998.675489"},{"key":"5484_CR2","doi-asserted-by":"crossref","first-page":"21","DOI":"10.1007\/978-3-642-28499-1_2","volume-title":"Adaptive and learning agents, lecture notes in computer science","author":"H Ammar","year":"2012","unstructured":"Ammar, H., & Taylor, M. (2012). Reinforcement learning transfer via common subspaces. Adaptive and learning agents, lecture notes in computer science (Vol. 7113, pp. 21\u201336). Berlin: Springer."},{"issue":"1","key":"5484_CR3","doi-asserted-by":"crossref","first-page":"1","DOI":"10.2140\/pjm.1966.16.1","volume":"16","author":"L Armijo","year":"1966","unstructured":"Armijo, L. (1966). Minimization of functions having lipschitz continuous first partial derivatives. Pacific Journal of Mathematics, 16(1), 1\u20133.","journal-title":"Pacific Journal of Mathematics"},{"key":"5484_CR4","doi-asserted-by":"crossref","first-page":"319","DOI":"10.1613\/jair.806","volume":"15","author":"J Baxter","year":"2001","unstructured":"Baxter, J., & Bartlett, P. L. (2001). Infinite-horizon policy-gradient estimation. Journal of Artificial Intelligence Research, 15, 319\u2013350.","journal-title":"Journal of Artificial Intelligence Research"},{"key":"5484_CR5","volume-title":"Stochastic optimal control: The discrete time case","author":"DP Bertsekas","year":"1978","unstructured":"Bertsekas, D. P., & Shreve, S. E. (1978). Stochastic optimal control: The discrete time case (Vol. 139). New York: Academic Press."},{"issue":"1\u20132","key":"5484_CR6","first-page":"1","volume":"2","author":"MP Deisenroth","year":"2013","unstructured":"Deisenroth, M. P., Neumann, G., Peters, J., et al. (2013). A survey on policy search for robotics. 
Foundations and Trends in Robotics, 2(1\u20132), 1\u2013142.","journal-title":"Foundations and Trends in Robotics"},{"key":"5484_CR7","first-page":"201","volume-title":"Proceedings of the twenty-first conference annual conference on uncertainty in artificial intelligence (UAI-05)","author":"N Ferns","year":"2005","unstructured":"Ferns, N., Panangaden, P., & Precup, D. (2005). Metrics for markov decision processes with infinite state spaces. Proceedings of the twenty-first conference annual conference on uncertainty in artificial intelligence (UAI-05) (pp. 201\u2013208). Arlington, Virginia: AUAI Press."},{"issue":"6","key":"5484_CR8","doi-asserted-by":"crossref","first-page":"1291","DOI":"10.1109\/TSMCC.2012.2218595","volume":"42","author":"I Grondman","year":"2012","unstructured":"Grondman, I., Busoniu, L., Lopes, G. A., & Babuska, R. (2012). A survey of actor-critic reinforcement learning: Standard and natural policy gradients. Systems, Man, and Cybernetics, Part C: Applications and Reviews, IEEE Transactions on, 42(6), 1291\u20131307.","journal-title":"Systems, Man, and Cybernetics, Part C: Applications and Reviews, IEEE Transactions on"},{"issue":"1","key":"5484_CR9","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1007\/s00186-005-0438-1","volume":"62","author":"K Hinderer","year":"2005","unstructured":"Hinderer, K. (2005). Lipschitz continuity of value functions in markovian decision processes. Mathematical Methods of Operations Research, 62(1), 3\u201322.","journal-title":"Mathematical Methods of Operations Research"},{"key":"5484_CR10","unstructured":"Kakade, S. (2001). A natural policy gradient. In Advances in neural information processing systems 14 (Vol. 14, pp. 1531\u20131538). Vancouver, British Columbia: MIT Press."},{"key":"5484_CR11","unstructured":"Kober, J., & Peters, J. (2008). Policy search for motor primitives in robotics. In Advances in neural information processing systems 21 (Vol. 21, pp. 849\u2013856). Vancouver, British Columbia: Curran Associates, Inc."},{"issue":"3","key":"5484_CR12","doi-asserted-by":"crossref","first-page":"286","DOI":"10.1145\/192115.192132","volume":"20","author":"JJ Mor\u00e9","year":"1994","unstructured":"Mor\u00e9, J. J., & Thuente, D. J. (1994). Line search algorithms with guaranteed sufficient decrease. ACM Transactions on Mathematical Software, 20(3), 286\u2013307.","journal-title":"ACM Transactions on Mathematical Software"},{"key":"5484_CR13","doi-asserted-by":"crossref","unstructured":"Peters, J., & Schaal, S. (2006). Policy gradient methods for robotics. In: Intelligent robots and systems, 2006 IEEE\/RSJ international conference on, (pp. 2219\u20132225).","DOI":"10.1109\/IROS.2006.282564"},{"issue":"7\u20139","key":"5484_CR14","doi-asserted-by":"crossref","first-page":"1180","DOI":"10.1016\/j.neucom.2007.11.026","volume":"71","author":"J Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008a). Natural actor-critic. Neurocomputing, 71(7\u20139), 1180\u20131190.","journal-title":"Neurocomputing"},{"issue":"4","key":"5484_CR15","doi-asserted-by":"crossref","first-page":"682","DOI":"10.1016\/j.neunet.2008.02.003","volume":"21","author":"J Peters","year":"2008","unstructured":"Peters, J., & Schaal, S. (2008b). Reinforcement learning of motor skills with policy gradients. Neural Networks, 21(4), 682\u2013697.","journal-title":"Neural Networks"},{"key":"5484_CR16","first-page":"1394","volume":"26","author":"M Pirotta","year":"2013","unstructured":"Pirotta, M., Restelli, M., & Bascetta, L. (2013). 
Adaptive step-size for policy gradient methods. Advances in Neural Information Processing Systems, 26, 1394\u20131402.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"5484_CR17","doi-asserted-by":"crossref","DOI":"10.1002\/9780470316887","volume-title":"Markov decision processes: Discrete stochastic dynamic programming","author":"ML Puterman","year":"1994","unstructured":"Puterman, M. L. (1994). Markov decision processes: Discrete stochastic dynamic programming. New York, NY: Wiley."},{"key":"5484_CR18","unstructured":"Rachelson, E., & Lagoudakis, M.G. (2010). On the locality of action domination in sequential decision making. In: International symposium on artificial intelligence and mathematics."},{"key":"5484_CR19","doi-asserted-by":"crossref","unstructured":"Robbins, H., & Monro, S. (1951). A stochastic approximation method. The Annals of Mathematical Statistics, 22(3), 400\u2013407.","DOI":"10.1214\/aoms\/1177729586"},{"key":"5484_CR20","unstructured":"Rosenstein, M. T., & Barto, A. G. (2004). Supervised actor-critic reinforcement learning. In J. Si, A. Barto, W. Powell, & D. Wunsch (Eds.), Handbook of learning and approximate dynamic programming (pp.\u00a0359\u2013380). John Wiley & Sons, Inc."},{"issue":"3","key":"5484_CR21","doi-asserted-by":"crossref","first-page":"332","DOI":"10.1109\/9.119632","volume":"37","author":"JC Spall","year":"1992","unstructured":"Spall, J. C. (1992). Multivariate stochastic approximation using a simultaneous perturbation gradient approximation. Automatic Control, IEEE Transactions on, 37(3), 332\u2013341.","journal-title":"Automatic Control, IEEE Transactions on"},{"key":"5484_CR22","unstructured":"Sutton, R.S., McAllester, D.A., Singh, S.P., & Mansour, Y. (1999). Policy gradient methods for reinforcement learning with function approximation. In: Advances in neural information processing systems 12, [NIPS Conference, Denver, Colorado, USA, November 29 - December 4, 1999], (pp. 1057\u20131063)."},{"issue":"2","key":"5484_CR23","doi-asserted-by":"crossref","first-page":"123","DOI":"10.1007\/s10514-009-9132-0","volume":"27","author":"N Vlassis","year":"2009","unstructured":"Vlassis, N., Toussaint, M., Kontes, G., & Piperidis, S. (2009). Learning model-free robot control by a monte carlo em algorithm. Autonomous Robots, 27(2), 123\u2013130.","journal-title":"Autonomous Robots"},{"key":"5484_CR24","first-page":"2573","volume":"24","author":"P Wagner","year":"2011","unstructured":"Wagner, P. (2011). A reinterpretation of the policy oscillation phenomenon in approximate policy iteration. Advances in Neural Information Processing Systems, 24, 2573\u20132581.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"5484_CR25","first-page":"229","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R. J. (1992). Simple statistical gradient-following algorithms for connectionist reinforcement learning. 
Machine Learning, 8, 229\u2013256.","journal-title":"Machine Learning"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-015-5484-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s10994-015-5484-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-015-5484-1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,8,21]],"date-time":"2019-08-21T13:29:17Z","timestamp":1566394157000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s10994-015-5484-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,3,3]]},"references-count":25,"journal-issue":{"issue":"2-3","published-print":{"date-parts":[[2015,9]]}},"alternative-id":["5484"],"URL":"https:\/\/doi.org\/10.1007\/s10994-015-5484-1","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015,3,3]]}}}
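
The record above originates from a Crossref work entry for DOI 10.1007/s10994-015-5484-1. As a minimal sketch (not part of the original record), the same metadata can be retrieved through the public Crossref REST API, assuming network access and the standard https://api.crossref.org/works/{DOI} endpoint; the field names used below (message, title, author, container-title, page, published-print) follow the record shown here.

```python
# Sketch: fetch the Crossref work record for this article and print a citation.
# Assumes the public Crossref REST API is reachable; no authentication is needed.
import json
import urllib.request

DOI = "10.1007/s10994-015-5484-1"
url = f"https://api.crossref.org/works/{DOI}"

# The API wraps the bibliographic fields in a "message" object.
with urllib.request.urlopen(url, timeout=10) as resp:
    work = json.load(resp)["message"]

# Assemble an author string from the given/family name fields.
authors = ", ".join(f"{a['given']} {a['family']}" for a in work.get("author", []))
title = work["title"][0]
journal = work["container-title"][0]
volume = work.get("volume", "")
pages = work.get("page", "")
year = work["published-print"]["date-parts"][0][0]

print(f"{authors} ({year}). {title}. {journal}, {volume}, {pages}. doi:{DOI}")
```

Running the sketch should print a one-line citation matching the header of this record; any of the optional fields (volume, page) may be absent for other DOIs, which is why they are read with get().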