Advantage Actor Critic (A2C) • High variance: $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R_t\right]$
Advantage Actor Critic (A2C) • High variance: $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R_t\right]$ • Reduce variance with baseline: $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, \big(R_t - b(s_t)\big)\right]$
Advantage Actor Critic (A2C) • High variance: $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R_t\right]$ • Reduce variance with baseline: $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, \big(R_t - b(s_t)\big)\right]$ • Use value-function as the baseline (A2C): $\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\left[\sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, \big(R_t - V^{\pi_\theta}(s_t)\big)\right]$
<latexit sha1_base64="Q87OeiBHTj2uynKBf+Fc2+4r43I=">ACKXicbVBNSwMxEM36bf2qevQSLIJeyq4Ieix68ajSqtAty2w2bYPZElmhbL073jxr3hRUNSrf8RsW8GvB4E3782QmRdnUlj0/Tdvanpmdm5+YbGytLyulZd37i0OjeMt5iW2lzHYLkUirdQoOTXmeGQxpJfxTcnpX91y40VWjVxkPFOCj0luoIBOimqNkIFsYQoxD5HoKHUPRpm4qvehQhpmIqE2gj3aMgSXdaAfQayuBhGzaha8+v+CPQvCSakRiY4i6pPYaJZnKFTIK17cDPsFOAQcEkH1bC3PIM2A30eNtRBSm3nWJ06ZDuOCWhXW3cU0hH6veJAlJrB2nsOsl7W+vFP/z2jl2jzqFUFmOXLHxR91cUtS0jI0mwnCGcuAIMCPcrpT1wQBDF27FhRD8PvkvudyvB349OD+oNY4ncSyQLbJNdklADkmDnJIz0iKM3JEH8kxevHv0Xv13setU95kZpP8gPfxCf5KpnQ=</latexit> <latexit sha1_base64="stVTPD2NxfInfe0m+8mdo3PSnQ=">ACL3icbVBNSysxFM348dQ+37Pq0k2wPKiLV2ZE0KUoiMsqrQpNGe5k0jaYSYbkjlCG/iM3/hU3Ioq49V+Y1gp+HQicnHMv96T5Eo6DMP7YGZ2bv7XwuJS5fyn78r1dW1M2cKy0WbG2XsRQJOKlFGyUqcZFbAVmixHlyeTj2z6+EdLoFg5z0c2gr2VPckAvxdUjpiFREDMcCATKlOlTlsv3fx1ipCyTKXUxblHGU4O0zjLAQdVno7iFv1Pk624Wgsb4QT0O4mpEamaMbVW5YaXmRCI1fgXCcKc+yWYFyJUYVjiRA7+Evuh4qiETrltO7h3Rf15Jac9Y/zTSifqxo4TMuWGW+Mrxpu6rNxZ/8joF9va6pdR5gULzt0G9QlE0dBweTaUVHNXQE+BW+l0pH4AFj7ig8h+nryd3K23YjCRnSyU9s/mMaxSDbIJqmTiOySfXJMmqRNOLkmt+SBPAY3wV3wFDy/lc4E0518gnBysfJqfQ</latexit> <latexit sha1_base64="UAloJD1/ubWRcMSZ24yHXdTCXs=">ACNHicbVBNaxsxENXmO85H3eSYi6gJOIeY3VBoj6a5BHpJi+0ELPMamVbRCst0mzALP5RueSH5BICOTSEXPsbqrVdaJM8EDy9N8PMvCRX0mEYPgRLyura+sbm7Wt7Z3dD/WPez1nCstFlxtl7GUCTipRclKnGZWwFZosRFcnVa+RfXwjpdAcnuRhkMNJyKDmgl+L6d6YhURAzHAsEypQZUZbLv/8mxEhZJlPqYjyijKcGaZNlgGMOqvw5jTv0mPalXsU1xthK5yBviXRgjTIAudx/Y6lheZ0MgVONePwhwHJViUXIlpjRVO5MCvYCT6nmrIhBuUs6On9NArKR0a659GOlP/7Sghc26SJb6yWte9irxPa9f4PDroJQ6L1BoPh80LBRFQ6sEaSqt4KgmngC30u9K+RgscPQ513wI0euT35LeSsKW9GPz432t0UcG+SAfCJNEpEvpE3OyDnpEk5uyD35RZ6C2+AxeA5e5qVLwaJn/yH4Pcfr46pjQ=</latexit> Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline: • Use value-function as the baseline (A2C):
<latexit sha1_base64="Q87OeiBHTj2uynKBf+Fc2+4r43I=">ACKXicbVBNSwMxEM36bf2qevQSLIJeyq4Ieix68ajSqtAty2w2bYPZElmhbL073jxr3hRUNSrf8RsW8GvB4E3782QmRdnUlj0/Tdvanpmdm5+YbGytLyulZd37i0OjeMt5iW2lzHYLkUirdQoOTXmeGQxpJfxTcnpX91y40VWjVxkPFOCj0luoIBOimqNkIFsYQoxD5HoKHUPRpm4qvehQhpmIqE2gj3aMgSXdaAfQayuBhGzaha8+v+CPQvCSakRiY4i6pPYaJZnKFTIK17cDPsFOAQcEkH1bC3PIM2A30eNtRBSm3nWJ06ZDuOCWhXW3cU0hH6veJAlJrB2nsOsl7W+vFP/z2jl2jzqFUFmOXLHxR91cUtS0jI0mwnCGcuAIMCPcrpT1wQBDF27FhRD8PvkvudyvB349OD+oNY4ncSyQLbJNdklADkmDnJIz0iKM3JEH8kxevHv0Xv13setU95kZpP8gPfxCf5KpnQ=</latexit> <latexit sha1_base64="stVTPD2NxfInfe0m+8mdo3PSnQ=">ACL3icbVBNSysxFM348dQ+37Pq0k2wPKiLV2ZE0KUoiMsqrQpNGe5k0jaYSYbkjlCG/iM3/hU3Ioq49V+Y1gp+HQicnHMv96T5Eo6DMP7YGZ2bv7XwuJS5fyn78r1dW1M2cKy0WbG2XsRQJOKlFGyUqcZFbAVmixHlyeTj2z6+EdLoFg5z0c2gr2VPckAvxdUjpiFREDMcCATKlOlTlsv3fx1ipCyTKXUxblHGU4O0zjLAQdVno7iFv1Pk624Wgsb4QT0O4mpEamaMbVW5YaXmRCI1fgXCcKc+yWYFyJUYVjiRA7+Evuh4qiETrltO7h3Rf15Jac9Y/zTSifqxo4TMuWGW+Mrxpu6rNxZ/8joF9va6pdR5gULzt0G9QlE0dBweTaUVHNXQE+BW+l0pH4AFj7ig8h+nryd3K23YjCRnSyU9s/mMaxSDbIJqmTiOySfXJMmqRNOLkmt+SBPAY3wV3wFDy/lc4E0518gnBysfJqfQ</latexit> <latexit sha1_base64="UAloJD1/ubWRcMSZ24yHXdTCXs=">ACNHicbVBNaxsxENXmO85H3eSYi6gJOIeY3VBoj6a5BHpJi+0ELPMamVbRCst0mzALP5RueSH5BICOTSEXPsbqrVdaJM8EDy9N8PMvCRX0mEYPgRLyura+sbm7Wt7Z3dD/WPez1nCstFlxtl7GUCTipRclKnGZWwFZosRFcnVa+RfXwjpdAcnuRhkMNJyKDmgl+L6d6YhURAzHAsEypQZUZbLv/8mxEhZJlPqYjyijKcGaZNlgGMOqvw5jTv0mPalXsU1xthK5yBviXRgjTIAudx/Y6lheZ0MgVONePwhwHJViUXIlpjRVO5MCvYCT6nmrIhBuUs6On9NArKR0a659GOlP/7Sghc26SJb6yWte9irxPa9f4PDroJQ6L1BoPh80LBRFQ6sEaSqt4KgmngC30u9K+RgscPQ513wI0euT35LeSsKW9GPz432t0UcG+SAfCJNEpEvpE3OyDnpEk5uyD35RZ6C2+AxeA5e5qVLwaJn/yH4Pcfr46pjQ=</latexit> <latexit 
sha1_base64="5+vMcXT+CN5kLeqSR9srGNolKAg=">AC3icbZDLSgMxFIYz9VbrbdSlm9AiVNAyI4JuhKobly3YC7TDkEkzbWjmQnJGKV7N76KGxeKuPUF3Pk2ZtpZ1NYfAl/+cw7J+b1YcAW9WPkVlbX1jfym4Wt7Z3dPXP/oKmiRFLWoJGIZNsjigkesgZwEKwdS0YCT7CWN7xL61HJhWPwgcYxcwJSD/kPqcEtOWaxZsyceEUKxdO8DWuz93OcLOcgmuWrIo1FV4GO4MSylRze9uL6JwEKgijVsa0YnDGRwKlgk0I3USwmdEj6rKMxJAFTzni6ywQfa6eH/UjqEwKeuvMTYxIoNQo83RkQGKjFWmr+V+sk4F85Yx7GCbCQzh7yE4EhwmkwuMcloyBGgiVXP8V0wGRhIKOr6BDsBdXobmecW2Knb9olS9zeLIoyNURGVko0tURfeohqIoif0gt7Qu/FsvBofxuesNWdkM4foj4yvX5iQl5Y=</latexit> Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline: • Use value-function as the baseline (A2C):
<latexit sha1_base64="Q87OeiBHTj2uynKBf+Fc2+4r43I=">ACKXicbVBNSwMxEM36bf2qevQSLIJeyq4Ieix68ajSqtAty2w2bYPZElmhbL073jxr3hRUNSrf8RsW8GvB4E3782QmRdnUlj0/Tdvanpmdm5+YbGytLyulZd37i0OjeMt5iW2lzHYLkUirdQoOTXmeGQxpJfxTcnpX91y40VWjVxkPFOCj0luoIBOimqNkIFsYQoxD5HoKHUPRpm4qvehQhpmIqE2gj3aMgSXdaAfQayuBhGzaha8+v+CPQvCSakRiY4i6pPYaJZnKFTIK17cDPsFOAQcEkH1bC3PIM2A30eNtRBSm3nWJ06ZDuOCWhXW3cU0hH6veJAlJrB2nsOsl7W+vFP/z2jl2jzqFUFmOXLHxR91cUtS0jI0mwnCGcuAIMCPcrpT1wQBDF27FhRD8PvkvudyvB349OD+oNY4ncSyQLbJNdklADkmDnJIz0iKM3JEH8kxevHv0Xv13setU95kZpP8gPfxCf5KpnQ=</latexit> <latexit sha1_base64="stVTPD2NxfInfe0m+8mdo3PSnQ=">ACL3icbVBNSysxFM348dQ+37Pq0k2wPKiLV2ZE0KUoiMsqrQpNGe5k0jaYSYbkjlCG/iM3/hU3Ioq49V+Y1gp+HQicnHMv96T5Eo6DMP7YGZ2bv7XwuJS5fyn78r1dW1M2cKy0WbG2XsRQJOKlFGyUqcZFbAVmixHlyeTj2z6+EdLoFg5z0c2gr2VPckAvxdUjpiFREDMcCATKlOlTlsv3fx1ipCyTKXUxblHGU4O0zjLAQdVno7iFv1Pk624Wgsb4QT0O4mpEamaMbVW5YaXmRCI1fgXCcKc+yWYFyJUYVjiRA7+Evuh4qiETrltO7h3Rf15Jac9Y/zTSifqxo4TMuWGW+Mrxpu6rNxZ/8joF9va6pdR5gULzt0G9QlE0dBweTaUVHNXQE+BW+l0pH4AFj7ig8h+nryd3K23YjCRnSyU9s/mMaxSDbIJqmTiOySfXJMmqRNOLkmt+SBPAY3wV3wFDy/lc4E0518gnBysfJqfQ</latexit> <latexit sha1_base64="UAloJD1/ubWRcMSZ24yHXdTCXs=">ACNHicbVBNaxsxENXmO85H3eSYi6gJOIeY3VBoj6a5BHpJi+0ELPMamVbRCst0mzALP5RueSH5BICOTSEXPsbqrVdaJM8EDy9N8PMvCRX0mEYPgRLyura+sbm7Wt7Z3dD/WPez1nCstFlxtl7GUCTipRclKnGZWwFZosRFcnVa+RfXwjpdAcnuRhkMNJyKDmgl+L6d6YhURAzHAsEypQZUZbLv/8mxEhZJlPqYjyijKcGaZNlgGMOqvw5jTv0mPalXsU1xthK5yBviXRgjTIAudx/Y6lheZ0MgVONePwhwHJViUXIlpjRVO5MCvYCT6nmrIhBuUs6On9NArKR0a659GOlP/7Sghc26SJb6yWte9irxPa9f4PDroJQ6L1BoPh80LBRFQ6sEaSqt4KgmngC30u9K+RgscPQ513wI0euT35LeSsKW9GPz432t0UcG+SAfCJNEpEvpE3OyDnpEk5uyD35RZ6C2+AxeA5e5qVLwaJn/yH4Pcfr46pjQ=</latexit> <latexit 
sha1_base64="5+vMcXT+CN5kLeqSR9srGNolKAg=">AC3icbZDLSgMxFIYz9VbrbdSlm9AiVNAyI4JuhKobly3YC7TDkEkzbWjmQnJGKV7N76KGxeKuPUF3Pk2ZtpZ1NYfAl/+cw7J+b1YcAW9WPkVlbX1jfym4Wt7Z3dPXP/oKmiRFLWoJGIZNsjigkesgZwEKwdS0YCT7CWN7xL61HJhWPwgcYxcwJSD/kPqcEtOWaxZsyceEUKxdO8DWuz93OcLOcgmuWrIo1FV4GO4MSylRze9uL6JwEKgijVsa0YnDGRwKlgk0I3USwmdEj6rKMxJAFTzni6ywQfa6eH/UjqEwKeuvMTYxIoNQo83RkQGKjFWmr+V+sk4F85Yx7GCbCQzh7yE4EhwmkwuMcloyBGgiVXP8V0wGRhIKOr6BDsBdXobmecW2Knb9olS9zeLIoyNURGVko0tURfeohqIoif0gt7Qu/FsvBofxuesNWdkM4foj4yvX5iQl5Y=</latexit> Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline: • Use value-function as the baseline (A2C):
<latexit sha1_base64="Q87OeiBHTj2uynKBf+Fc2+4r43I=">ACKXicbVBNSwMxEM36bf2qevQSLIJeyq4Ieix68ajSqtAty2w2bYPZElmhbL073jxr3hRUNSrf8RsW8GvB4E3782QmRdnUlj0/Tdvanpmdm5+YbGytLyulZd37i0OjeMt5iW2lzHYLkUirdQoOTXmeGQxpJfxTcnpX91y40VWjVxkPFOCj0luoIBOimqNkIFsYQoxD5HoKHUPRpm4qvehQhpmIqE2gj3aMgSXdaAfQayuBhGzaha8+v+CPQvCSakRiY4i6pPYaJZnKFTIK17cDPsFOAQcEkH1bC3PIM2A30eNtRBSm3nWJ06ZDuOCWhXW3cU0hH6veJAlJrB2nsOsl7W+vFP/z2jl2jzqFUFmOXLHxR91cUtS0jI0mwnCGcuAIMCPcrpT1wQBDF27FhRD8PvkvudyvB349OD+oNY4ncSyQLbJNdklADkmDnJIz0iKM3JEH8kxevHv0Xv13setU95kZpP8gPfxCf5KpnQ=</latexit> <latexit sha1_base64="stVTPD2NxfInfe0m+8mdo3PSnQ=">ACL3icbVBNSysxFM348dQ+37Pq0k2wPKiLV2ZE0KUoiMsqrQpNGe5k0jaYSYbkjlCG/iM3/hU3Ioq49V+Y1gp+HQicnHMv96T5Eo6DMP7YGZ2bv7XwuJS5fyn78r1dW1M2cKy0WbG2XsRQJOKlFGyUqcZFbAVmixHlyeTj2z6+EdLoFg5z0c2gr2VPckAvxdUjpiFREDMcCATKlOlTlsv3fx1ipCyTKXUxblHGU4O0zjLAQdVno7iFv1Pk624Wgsb4QT0O4mpEamaMbVW5YaXmRCI1fgXCcKc+yWYFyJUYVjiRA7+Evuh4qiETrltO7h3Rf15Jac9Y/zTSifqxo4TMuWGW+Mrxpu6rNxZ/8joF9va6pdR5gULzt0G9QlE0dBweTaUVHNXQE+BW+l0pH4AFj7ig8h+nryd3K23YjCRnSyU9s/mMaxSDbIJqmTiOySfXJMmqRNOLkmt+SBPAY3wV3wFDy/lc4E0518gnBysfJqfQ</latexit> <latexit sha1_base64="UAloJD1/ubWRcMSZ24yHXdTCXs=">ACNHicbVBNaxsxENXmO85H3eSYi6gJOIeY3VBoj6a5BHpJi+0ELPMamVbRCst0mzALP5RueSH5BICOTSEXPsbqrVdaJM8EDy9N8PMvCRX0mEYPgRLyura+sbm7Wt7Z3dD/WPez1nCstFlxtl7GUCTipRclKnGZWwFZosRFcnVa+RfXwjpdAcnuRhkMNJyKDmgl+L6d6YhURAzHAsEypQZUZbLv/8mxEhZJlPqYjyijKcGaZNlgGMOqvw5jTv0mPalXsU1xthK5yBviXRgjTIAudx/Y6lheZ0MgVONePwhwHJViUXIlpjRVO5MCvYCT6nmrIhBuUs6On9NArKR0a659GOlP/7Sghc26SJb6yWte9irxPa9f4PDroJQ6L1BoPh80LBRFQ6sEaSqt4KgmngC30u9K+RgscPQ513wI0euT35LeSsKW9GPz432t0UcG+SAfCJNEpEvpE3OyDnpEk5uyD35RZ6C2+AxeA5e5qVLwaJn/yH4Pcfr46pjQ=</latexit> <latexit 
sha1_base64="5+vMcXT+CN5kLeqSR9srGNolKAg=">AC3icbZDLSgMxFIYz9VbrbdSlm9AiVNAyI4JuhKobly3YC7TDkEkzbWjmQnJGKV7N76KGxeKuPUF3Pk2ZtpZ1NYfAl/+cw7J+b1YcAW9WPkVlbX1jfym4Wt7Z3dPXP/oKmiRFLWoJGIZNsjigkesgZwEKwdS0YCT7CWN7xL61HJhWPwgcYxcwJSD/kPqcEtOWaxZsyceEUKxdO8DWuz93OcLOcgmuWrIo1FV4GO4MSylRze9uL6JwEKgijVsa0YnDGRwKlgk0I3USwmdEj6rKMxJAFTzni6ywQfa6eH/UjqEwKeuvMTYxIoNQo83RkQGKjFWmr+V+sk4F85Yx7GCbCQzh7yE4EhwmkwuMcloyBGgiVXP8V0wGRhIKOr6BDsBdXobmecW2Knb9olS9zeLIoyNURGVko0tURfeohqIoif0gt7Qu/FsvBofxuesNWdkM4foj4yvX5iQl5Y=</latexit> <latexit sha1_base64="73WF1LFqO48zTabI2caWEeiC8nc=">ACJHicbVDLSgMxFM3UV62vUZdugkXoUC0zIiIUHXjsop9QFuGTJq2oZkHyR2hDP0YN/6KGxc+cOHGbzHTdlGrBwLnMvN/d4keAKbPvLyCwsLi2vZFdza+sbm1vm9k5NhbGkrEpDEcqGRxQTPGBV4CBYI5KM+J5gdW9wnfr1ByYVD4N7GEas7ZNewLucEtCSa5fFpQLh5i4YOELXGj5BPqUiORuNGMUcU1XCRSdkWXho3EFlmvm7ZI9Bv5LnCnJoykqrvne6oQ09lkAVBClmo4dQTshEjgVbJRrxYpFhA5IjzU1DYjPVDsZHznCB1rp4G4o9QsAj9XZiYT4Sg19T3emN6h5LxX/85oxdM/aCQ+iGFhAJ4u6scAQ4jQx3OGSURBDTQiVXP8V0z6RhILONadDcOZP/ktqxyXHLjm3J/ny1TSOLNpD+6iAHSKyugGVAVUfSIntErejOejBfjw/ictGaM6cwu+gXj+wcTkaC8</latexit> Advantage Actor Critic (A2C) • High variance: • Reduce variance with baseline: • Use value-function as the baseline (A2C):
Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once!
Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once! Why?
Advantage Actor Critic (A2C) • A2C is great, but you can only use each rollout once! • No theoretical grounding for reusing a rollout: the policy-gradient estimate assumes the data was sampled from the current policy
Advantage Actor Critic (A2C) • Works poorly in practice
Advantage Actor Critic (A2C) • Works poorly in practice Image credit: Alberto Metelli, 2018
Outline • RL Refresher/Advantage Actor Critic (A2C) • Trust Region Policy Optimization (TRPO) • Proximal Policy Optimization (PPO) • Application: PointGoal Navigation Results
Trust Region Policy Optimization (TRPO) A2C Maximizes:
<latexit sha1_base64="WRGxFEVI9rYjIA4fp4fgmoJwGI4=">AB+nicbVBNS8NAEN34WetXqkcvi0Wol5KIoMeiF48V7Ae0IWw2m3bpZhN3J0qJ/SlePCji1V/izX/jts1BWx8MPN6bYWZekAquwXG+rZXVtfWNzdJWeXtnd2/frhy0dZIpylo0EYnqBkQzwSVrAQfBuqliJA4E6wSj6nfeWBK80TewThlXkwGkecEjCSb1fucY34gPsxD7H24RT7dtWpOzPgZeIWpIoKNH37qx8mNIuZBCqI1j3XScHLiQJOBZuU+5lmKaEjMmA9QyWJmfby2ekTfGKUEeJMiUBz9TfEzmJtR7HgemMCQz1ojcV/N6GUSXs5lmgGTdL4oygSGBE9zwCFXjIYG0Ko4uZWTIdEQomrbIJwV18eZm0z+quU3dvz6uNqyKOEjpCx6iGXHSBGugGNVELUfSIntErerOerBfr3fqYt65Yxcwh+gPr8wczm5Kn</latexit> Trust Region Policy Optimization (TRPO) Given a policy:
<latexit sha1_base64="m/+Baqn3U8yhwCE2Svpz4XjOY=">ACTXicbVFNb9NAEF2nQNPwUbcuYyIiBK1jWyEVC6VAlw4FkTSHGwxutNur6o7tjpMjyH+wFiRv/opceihBineYQGkZa6c17b7Q7b6NcSUOe9NpbD14+Gi7udN6/OTps13b39kskJzMeSZyvQ4QiOUTMWQJCkxzrXAJFLiLr4UOtn34Q2Mku/0CIX0wTnqZxJjmSp0I0DwgI6EBiZwCV0MSQIEhmDCakHQCtd18vu7Y5BKyZDpxYHemcoyo/V2vKAYyWzpIO/KoHR1D3UBt6odv2+t6yYBP4K9BmqzoN3R9BnPEiESlxhcZMfC+naYmaJFeiagWFETnyC5yLiYUpJsJMy2UaFbyTAyzTNuTEizZ9YkSE2MWSWSd9SLmvlaT/9MmBc3eTkuZ5gWJlN9dNCsUAZ1tBLTiphQXItbRvBX6OGjnZD2jZEPz7K2+C0eu+7/X9T2/ag/erOJrsBXvJusxnx2zAPrJTNmScXbFrdst+Od+dG+e38+fO2nBWM8/ZP9XY/gsupq3+</latexit> <latexit sha1_base64="WRGxFEVI9rYjIA4fp4fgmoJwGI4=">AB+nicbVBNS8NAEN34WetXqkcvi0Wol5KIoMeiF48V7Ae0IWw2m3bpZhN3J0qJ/SlePCji1V/izX/jts1BWx8MPN6bYWZekAquwXG+rZXVtfWNzdJWeXtnd2/frhy0dZIpylo0EYnqBkQzwSVrAQfBuqliJA4E6wSj6nfeWBK80TewThlXkwGkecEjCSb1fucY34gPsxD7H24RT7dtWpOzPgZeIWpIoKNH37qx8mNIuZBCqI1j3XScHLiQJOBZuU+5lmKaEjMmA9QyWJmfby2ekTfGKUEeJMiUBz9TfEzmJtR7HgemMCQz1ojcV/N6GUSXs5lmgGTdL4oygSGBE9zwCFXjIYG0Ko4uZWTIdEQomrbIJwV18eZm0z+quU3dvz6uNqyKOEjpCx6iGXHSBGugGNVELUfSIntErerOerBfr3fqYt65Yxcwh+gPr8wczm5Kn</latexit> Trust Region Policy Optimization (TRPO) Given a policy: Collect experience and calculate advantage
<latexit sha1_base64="m/+Baqn3U8yhwCE2Svpz4XjOY=">ACTXicbVFNb9NAEF2nQNPwUbcuYyIiBK1jWyEVC6VAlw4FkTSHGwxutNur6o7tjpMjyH+wFiRv/opceihBineYQGkZa6c17b7Q7b6NcSUOe9NpbD14+Gi7udN6/OTps13b39kskJzMeSZyvQ4QiOUTMWQJCkxzrXAJFLiLr4UOtn34Q2Mku/0CIX0wTnqZxJjmSp0I0DwgI6EBiZwCV0MSQIEhmDCakHQCtd18vu7Y5BKyZDpxYHemcoyo/V2vKAYyWzpIO/KoHR1D3UBt6odv2+t6yYBP4K9BmqzoN3R9BnPEiESlxhcZMfC+naYmaJFeiagWFETnyC5yLiYUpJsJMy2UaFbyTAyzTNuTEizZ9YkSE2MWSWSd9SLmvlaT/9MmBc3eTkuZ5gWJlN9dNCsUAZ1tBLTiphQXItbRvBX6OGjnZD2jZEPz7K2+C0eu+7/X9T2/ag/erOJrsBXvJusxnx2zAPrJTNmScXbFrdst+Od+dG+e38+fO2nBWM8/ZP9XY/gsupq3+</latexit> <latexit sha1_base64="WRGxFEVI9rYjIA4fp4fgmoJwGI4=">AB+nicbVBNS8NAEN34WetXqkcvi0Wol5KIoMeiF48V7Ae0IWw2m3bpZhN3J0qJ/SlePCji1V/izX/jts1BWx8MPN6bYWZekAquwXG+rZXVtfWNzdJWeXtnd2/frhy0dZIpylo0EYnqBkQzwSVrAQfBuqliJA4E6wSj6nfeWBK80TewThlXkwGkecEjCSb1fucY34gPsxD7H24RT7dtWpOzPgZeIWpIoKNH37qx8mNIuZBCqI1j3XScHLiQJOBZuU+5lmKaEjMmA9QyWJmfby2ekTfGKUEeJMiUBz9TfEzmJtR7HgemMCQz1ojcV/N6GUSXs5lmgGTdL4oygSGBE9zwCFXjIYG0Ko4uZWTIdEQomrbIJwV18eZm0z+quU3dvz6uNqyKOEjpCx6iGXHSBGugGNVELUfSIntErerOerBfr3fqYt65Yxcwh+gPr8wczm5Kn</latexit> <latexit sha1_base64="Y/Q+5m3fIVE7ZlVTIpodCXH5oc=">ACQ3icbZDLSgMxFIYzXmu9V26CRahBSkzIuhG8LIRVxWsip06nMlkbGjmYnJGKEPfzY0v4M4XcONCEbeC6Wh1QOBP9/Dsn5/VQKjb9bE1MTk3PzBbmivMLi0vLpZXVC51kivEGS2SirnzQXIqYN1Cg5Fep4hD5kl/6neO+f3nPlRZJfI7dlLciuI1FKBigQV7p2o0A2wxkftqruNjmCFW6T91QAcvdVHhDVgEPqRuJgGoPq738bgxQlwUJ0sObu4q5b1HjVr1S2a7Zg6J/hTMSZTKquld6coOEZRGPkUnQunYKbZyUCiY5L2im2meAuvALW8aGUPEdSsfZNCjm4YENEyUOTHSAf05kUOkdTfyTWd/Yz3u9eF/XjPDcK+VizjNkMds+FCYSYoJ7QdKA6E4Q9k1ApgS5q+UtcHEhyb2ognBGV/5r7jYrjl2zTnbKR8cjeIokHWyQSrEIbvkgJyQOmkQRh7IC3kj79aj9Wp9WJ/D1glrNLNGfpX19Q185a/1</latexit> Trust Region Policy Optimization (TRPO) Given a policy: Collect experience and calculate advantage Maximize:
Trust Region Policy Optimization (TRPO) Maximize: $\mathbb{E}_{s,a \sim \pi_{\theta_{\text{old}}}}\left[\frac{\pi_\theta(a \mid s)}{\pi_{\theta_{\text{old}}}(a \mid s)}\, A^{\pi_{\theta_{\text{old}}}}(s, a)\right]$ Read as: Policy $\pi_\theta$ is better than $\pi_{\theta_{\text{old}}}$ if it takes good actions ($A > 0$) more often and takes bad actions ($A < 0$) less often
Trust Region Policy Optimization (TRPO) Maximize: $\mathbb{E}_{s,a \sim \pi_{\theta_{\text{old}}}}\left[\frac{\pi_\theta(a \mid s)}{\pi_{\theta_{\text{old}}}(a \mid s)}\, A^{\pi_{\theta_{\text{old}}}}(s, a)\right]$ Why this objective? Read as: Policy $\pi_\theta$ is better than $\pi_{\theta_{\text{old}}}$ if it takes good actions ($A > 0$) more often and takes bad actions ($A < 0$) less often
<latexit sha1_base64="m/+Baqn3U8yhwCE2Svpz4XjOY=">ACTXicbVFNb9NAEF2nQNPwUbcuYyIiBK1jWyEVC6VAlw4FkTSHGwxutNur6o7tjpMjyH+wFiRv/opceihBineYQGkZa6c17b7Q7b6NcSUOe9NpbD14+Gi7udN6/OTps13b39kskJzMeSZyvQ4QiOUTMWQJCkxzrXAJFLiLr4UOtn34Q2Mku/0CIX0wTnqZxJjmSp0I0DwgI6EBiZwCV0MSQIEhmDCakHQCtd18vu7Y5BKyZDpxYHemcoyo/V2vKAYyWzpIO/KoHR1D3UBt6odv2+t6yYBP4K9BmqzoN3R9BnPEiESlxhcZMfC+naYmaJFeiagWFETnyC5yLiYUpJsJMy2UaFbyTAyzTNuTEizZ9YkSE2MWSWSd9SLmvlaT/9MmBc3eTkuZ5gWJlN9dNCsUAZ1tBLTiphQXItbRvBX6OGjnZD2jZEPz7K2+C0eu+7/X9T2/ag/erOJrsBXvJusxnx2zAPrJTNmScXbFrdst+Od+dG+e38+fO2nBWM8/ZP9XY/gsupq3+</latexit> <latexit sha1_base64="MNyLJ0iEjL6De720taE7+Lp+Axg=">ACI3icbVDLSgMxFM3Ud31VXboJFqFuyowIiAU3bhUsCp0hiGTuW1DMw+TO2IZ5l/c+CtuXCjFjQv/xUzbha8DIYdziW5J0il0GjbH1ZlZnZufmFxqbq8srq2XtvYvNZJpji0eSITdRswDVLE0EaBEm5TBSwKJNwEg7PSv7kHpUSX+EwBS9ivVh0BWdoJL92fNdgPlI3EiHVPu7RE+qmws9d7AOy8oYHzBMZFkVBf0b9Wt1u2mPQv8SZkjqZ4sKvjdw4VkEMXLJtO4dopezhQKLqGoupmGlPEB60H0JhFoL18vGNBd40S0m6izImRjtXvEzmLtB5GgUlGDPv6t1eK/3mdDLtHXi7iNEOI+eShbiYpJrQsjIZCAUc5NIRxJcxfKe8zxTiaWqumBOf3yn/J9X7TsZvO5UG9dTqtY5Fskx3SIA45JC1yTi5Im3DySJ7JK3mznqwXa2S9T6IVazqzRX7A+vwCEC2j7A=</latexit> <latexit sha1_base64="Y/Q+5m3fIVE7ZlVTIpodCXH5oc=">ACQ3icbZDLSgMxFIYzXmu9V26CRahBSkzIuhG8LIRVxWsip06nMlkbGjmYnJGKEPfzY0v4M4XcONCEbeC6Wh1QOBP9/Dsn5/VQKjb9bE1MTk3PzBbmivMLi0vLpZXVC51kivEGS2SirnzQXIqYN1Cg5Fep4hD5kl/6neO+f3nPlRZJfI7dlLciuI1FKBigQV7p2o0A2wxkftqruNjmCFW6T91QAcvdVHhDVgEPqRuJgGoPq738bgxQlwUJ0sObu4q5b1HjVr1S2a7Zg6J/hTMSZTKquld6coOEZRGPkUnQunYKbZyUCiY5L2im2meAuvALW8aGUPEdSsfZNCjm4YENEyUOTHSAf05kUOkdTfyTWd/Yz3u9eF/XjPDcK+VizjNkMds+FCYSYoJ7QdKA6E4Q9k1ApgS5q+UtcHEhyb2ognBGV/5r7jYrjl2zTnbKR8cjeIokHWyQSrEIbvkgJyQOmkQRh7IC3kj79aj9Wp9WJ/D1glrNLNGfpX19Q185a/1</latexit> Trust Region Policy Optimization (TRPO) Given a policy: Collect experience and calculate advantage Maximize:
Trust Region Policy Optimization (TRPO) Image credit: Alberto Metelli, 2018
Trust Region Policy Optimization (TRPO) • Use a trust region!
Trust Region Policy Optimization (TRPO) • PS 1 problem 1
Trust Region Policy Optimization (TRPO) • PS 1 problem 1 • In this problem, you showed that the gradient descent update rule can be seen as the minimizer of the affine lower bound of the objective $f$, subject to a trust-region constraint:
<latexit sha1_base64="twLRSR1S0ZOn3JfPmQdbvpQgdY=">ACJ3icbZDLSsNAFIYnXmu9RV26GSxCuymJCLpRim5cVrAXaEKYTCbt0MnFmROhL6NG1/FjaAiuvRNnLZaOsPAz/fOYcz5/dTwRVY1pextLyurZe2ihvbm3v7Jp7+2VZJKyFk1EIrs+UzwmLWAg2DdVDIS+YJ1/OH1pN5YFLxJL6DUcrciPRjHnJKQCPvJQeVB0YMCA1fIGdUBKaOyn3ZqxKPMBOxAOsPKiN8/s54JkVq25NhReNXZgKtT0zFcnSGgWsRioIEr1bCsFNycSOBVsXHYyxVJCh6TPetrGJGLKzad3jvGxJgEOE6lfDHhKf0/kJFJqFPm6MyIwUPO1Cfyv1sgPHdzHqcZsJjOFoWZwJDgSWg4JRECNtCJVc/xXTAdFRgY62rEOw509eNO2Tum3V7dvTSuOqiKOEDtERqiIbnaEGukFN1EIUPaJn9IbejSfjxfgwPmetS0Yxc4D+yPj+Ab5bpT0=</latexit> <latexit sha1_base64="98O0mbVqYkorID1M2C1j2kUbRIg=">ACznicbVFbaxNBFJ5dbzXeoj76MhiEBGzZFUFfhKovYgWjNG0hky5nZ0+SoTuzm5mzsXFZfPX3+eYP8H84uaCm9cAMH+c71+kZa4cRdHPILxy9dr1Gzs3W7du37l7r3/wZErKitxIu8sCcpOMyVwQEpyvGktAg6zfE4PXu75I/naJ0qzCEtShxpmBg1VhLIu5L2L6GBphLy+n1zWh9+7n9suoKmSNDjr1qiMhna1ILE2ib0h3l9Ou6hJ5ySKjXJLUgPKda6bKwBEbi7hdUkylhxiGbgyGYNPwXd4SswqyrbIi9RW5kFlB/OCDSNVk0hWlStatur4BF1pl3LfrdHqm20zq7y/k5S2OFda0YITWt0SbsT7Ur45dBvAEdtrF+0v4hskJWGg3JHJwbxlFJoxosKZlj4zdwWI84sNPTSg0Y3q1Tka/sR7Mj4urH+G+Mr7b0YN2rmFTn3kUnx3kVs6/8cNKxq/HNXKlBWhketG4yrnVPDlbXmLErKFx6AtMrPyuUvMxeBdfyIsQXV74Mjp7txdFe/Ol5Z/NRo4d9og9Zl0Wsxdsn71jfTZgMjgIZsHXoA74Txswm/r0DY5DxkWxZ+/w2RmuEV</latexit> Trust Region Policy Optimization (TRPO)
Trust Region Policy Optimization (TRPO) • Advantage • Able to perform multiple optimization steps per rollout
Trust Region Policy Optimization (TRPO) • Advantage • Able to perform multiple optimization steps per rollout • Disadvantage • Choosing the correct value for beta is challenging and problem/network dependent
Outline • RL Refresher/Advantage Actor Critic (A2C) • Trust Region Policy Optimization (TRPO) • Proximal Policy Optimization (PPO) • Application: PointGoal Navigation Results
Proximal Policy Optimization (PPO)
Proximal Policy Optimization (PPO)
<latexit sha1_base64="WRGxFEVI9rYjIA4fp4fgmoJwGI4=">AB+nicbVBNS8NAEN34WetXqkcvi0Wol5KIoMeiF48V7Ae0IWw2m3bpZhN3J0qJ/SlePCji1V/izX/jts1BWx8MPN6bYWZekAquwXG+rZXVtfWNzdJWeXtnd2/frhy0dZIpylo0EYnqBkQzwSVrAQfBuqliJA4E6wSj6nfeWBK80TewThlXkwGkecEjCSb1fucY34gPsxD7H24RT7dtWpOzPgZeIWpIoKNH37qx8mNIuZBCqI1j3XScHLiQJOBZuU+5lmKaEjMmA9QyWJmfby2ekTfGKUEeJMiUBz9TfEzmJtR7HgemMCQz1ojcV/N6GUSXs5lmgGTdL4oygSGBE9zwCFXjIYG0Ko4uZWTIdEQomrbIJwV18eZm0z+quU3dvz6uNqyKOEjpCx6iGXHSBGugGNVELUfSIntErerOerBfr3fqYt65Yxcwh+gPr8wczm5Kn</latexit> Proximal Policy Optimization (PPO) Given a policy:
<latexit sha1_base64="MNyLJ0iEjL6De720taE7+Lp+Axg=">ACI3icbVDLSgMxFM3Ud31VXboJFqFuyowIiAU3bhUsCp0hiGTuW1DMw+TO2IZ5l/c+CtuXCjFjQv/xUzbha8DIYdziW5J0il0GjbH1ZlZnZufmFxqbq8srq2XtvYvNZJpji0eSITdRswDVLE0EaBEm5TBSwKJNwEg7PSv7kHpUSX+EwBS9ivVh0BWdoJL92fNdgPlI3EiHVPu7RE+qmws9d7AOy8oYHzBMZFkVBf0b9Wt1u2mPQv8SZkjqZ4sKvjdw4VkEMXLJtO4dopezhQKLqGoupmGlPEB60H0JhFoL18vGNBd40S0m6izImRjtXvEzmLtB5GgUlGDPv6t1eK/3mdDLtHXi7iNEOI+eShbiYpJrQsjIZCAUc5NIRxJcxfKe8zxTiaWqumBOf3yn/J9X7TsZvO5UG9dTqtY5Fskx3SIA45JC1yTi5Im3DySJ7JK3mznqwXa2S9T6IVazqzRX7A+vwCEC2j7A=</latexit> Proximal Policy Optimization (PPO) Given a policy:
<latexit sha1_base64="twLRSR1S0ZOn3JfPmQdbvpQgdY=">ACJ3icbZDLSsNAFIYnXmu9RV26GSxCuymJCLpRim5cVrAXaEKYTCbt0MnFmROhL6NG1/FjaAiuvRNnLZaOsPAz/fOYcz5/dTwRVY1pextLyurZe2ihvbm3v7Jp7+2VZJKyFk1EIrs+UzwmLWAg2DdVDIS+YJ1/OH1pN5YFLxJL6DUcrciPRjHnJKQCPvJQeVB0YMCA1fIGdUBKaOyn3ZqxKPMBOxAOsPKiN8/s54JkVq25NhReNXZgKtT0zFcnSGgWsRioIEr1bCsFNycSOBVsXHYyxVJCh6TPetrGJGLKzad3jvGxJgEOE6lfDHhKf0/kJFJqFPm6MyIwUPO1Cfyv1sgPHdzHqcZsJjOFoWZwJDgSWg4JRECNtCJVc/xXTAdFRgY62rEOw509eNO2Tum3V7dvTSuOqiKOEDtERqiIbnaEGukFN1EIUPaJn9IbejSfjxfgwPmetS0Yxc4D+yPj+Ab5bpT0=</latexit> <latexit sha1_base64="MNyLJ0iEjL6De720taE7+Lp+Axg=">ACI3icbVDLSgMxFM3Ud31VXboJFqFuyowIiAU3bhUsCp0hiGTuW1DMw+TO2IZ5l/c+CtuXCjFjQv/xUzbha8DIYdziW5J0il0GjbH1ZlZnZufmFxqbq8srq2XtvYvNZJpji0eSITdRswDVLE0EaBEm5TBSwKJNwEg7PSv7kHpUSX+EwBS9ivVh0BWdoJL92fNdgPlI3EiHVPu7RE+qmws9d7AOy8oYHzBMZFkVBf0b9Wt1u2mPQv8SZkjqZ4sKvjdw4VkEMXLJtO4dopezhQKLqGoupmGlPEB60H0JhFoL18vGNBd40S0m6izImRjtXvEzmLtB5GgUlGDPv6t1eK/3mdDLtHXi7iNEOI+eShbiYpJrQsjIZCAUc5NIRxJcxfKe8zxTiaWqumBOf3yn/J9X7TsZvO5UG9dTqtY5Fskx3SIA45JC1yTi5Im3DySJ7JK3mznqwXa2S9T6IVazqzRX7A+vwCEC2j7A=</latexit> Proximal Policy Optimization (PPO) Given a policy: Objective: Maximize
Proximal Policy Optimization (PPO)
<latexit sha1_base64="a75IYbuwf6T/xNrGACEcFVl6gI=">AB+nicbVDLSsNAFL3xWesr1aWbwSK4KokIuiy6ETdGsA9oY5lMJ+3QySTMTJQS8yluXCji1i9x5984abvQ1gMDh3Pu5Z45QcKZ0o7zbS0tr6yurZc2yptb2zu7dmWvqeJUEtogMY9lO8CKciZoQzPNaTuRFEcBp61gdFn4rQcqFYvFnR4n1I/wQLCQEayN1LMr3QjrIcE8u87vM8+7yXt21ak5E6BF4s5IFWbwevZXtx+TNKJCE46V6rhOov0MS80Ip3m5myqaYDLCA9oxVOCIKj+bRM/RkVH6KIyleUKjifp7I8ORUuMoMJNFUDXvFeJ/XifV4bmfMZGkmgoyPRSmHOkYFT2gPpOUaD42BPJTFZEhlhiok1bZVOCO/lRdI8qblOzb09rdYvZnWU4AO4RhcOIM6XIEHDSDwCM/wCm/Wk/VivVsf09Ela7azD39gf4AZSyUEg=</latexit> <latexit sha1_base64="a75IYbuwf6T/xNrGACEcFVl6gI=">AB+nicbVDLSsNAFL3xWesr1aWbwSK4KokIuiy6ETdGsA9oY5lMJ+3QySTMTJQS8yluXCji1i9x5984abvQ1gMDh3Pu5Z45QcKZ0o7zbS0tr6yurZc2yptb2zu7dmWvqeJUEtogMY9lO8CKciZoQzPNaTuRFEcBp61gdFn4rQcqFYvFnR4n1I/wQLCQEayN1LMr3QjrIcE8u87vM8+7yXt21ak5E6BF4s5IFWbwevZXtx+TNKJCE46V6rhOov0MS80Ip3m5myqaYDLCA9oxVOCIKj+bRM/RkVH6KIyleUKjifp7I8ORUuMoMJNFUDXvFeJ/XifV4bmfMZGkmgoyPRSmHOkYFT2gPpOUaD42BPJTFZEhlhiok1bZVOCO/lRdI8qblOzb09rdYvZnWU4AO4RhcOIM6XIEHDSDwCM/wCm/Wk/VivVsf09Ela7azD39gf4AZSyUEg=</latexit> <latexit sha1_base64="XU6mzGKxL8N+dKuvm07YfKiYqg=">AB+HicbVBNS8NAEN3Ur1o/GvXoZbEIFaQkIujBQ9WLxwr2A9oQNtNu3SzCbsToYb+Ei8eFPHqT/Hmv3Hb5qCtDwYe780wMy9IBNfgON9WYWV1bX2juFna2t7ZLdt7+y0dp4qyJo1FrDoB0UxwyZrAQbBOohiJAsHaweh26rcfmdI8lg8wTpgXkYHkIacEjOTb5euq9uEUEx9O8BV2fLvi1JwZ8DJxc1JBORq+/dXrxzSNmAQqiNZd10nAy4gCTgWblHqpZgmhIzJgXUMliZj2stnhE3xslD4OY2VKAp6pvycyEmk9jgLTGREY6kVvKv7ndVMIL72MyQFJul8UZgKDGepoD7XDEKYmwIoYqbWzEdEkUomKxKJgR38eVl0jqruU7NvT+v1G/yOIroEB2hKnLRBaqjO9RATURip7RK3qznqwX6936mLcWrHzmAP2B9fkD4qiRQ=</latexit> Proximal Policy Optimization (PPO)
Recommend
More recommend