Local SGD for non-i.i.d. data. Konstantin Mishchenko. Work done together with Ahmed Khaled and Peter Richtárik.
Problem: $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x)$$ Convex.
Problem: $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x)$$ Convex. In practice, usually a neural network.
Problem: $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x), \qquad f_m(x) = \mathbb{E}_{\xi}\, f_m(x; \xi)$$
Local SGD for $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x):$$ $$x_{t+1}^{m} = \begin{cases} \hat{x}_{t+1}, & \text{if } t \bmod H = 0 \\ x_{t}^{m} - \gamma \nabla f_m(x_{t}^{m}; \xi_{t}^{m}), & \text{otherwise} \end{cases}$$
Local SGD for $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x):$$ $$x_{t+1}^{m} = \begin{cases} \frac{1}{M} \sum_{j=1}^{M} \left( x_{t}^{j} - \gamma \nabla f_j(x_{t}^{j}; \xi_{t}^{j}) \right), & \text{if } t \bmod H = 0 \\ x_{t}^{m} - \gamma \nabla f_m(x_{t}^{m}; \xi_{t}^{m}), & \text{otherwise} \end{cases}$$
Local SGD for $$\min_{x} \; \frac{1}{M} \sum_{m=1}^{M} f_m(x):$$ $$x_{t+1}^{m} = \begin{cases} \hat{x}_{t+1}, & \text{if } t \bmod H = 0 \\ x_{t}^{m} - \gamma \nabla f_m(x_{t}^{m}; \xi_{t}^{m}), & \text{otherwise} \end{cases}$$ Special cases: $H = 1 \longrightarrow$ minibatch SGD; $H = T \longrightarrow$ one-shot averaging.
Local GD: $$x_{t+1}^{m} = x_{t}^{m} - \gamma \nabla f_m(x_{t}^{m})$$
The Variance of Local GD: $$x_{t+1}^{m} = x_{t}^{m} - \gamma \nabla f_m(x_{t}^{m})$$ $$\sigma_f^2 \overset{\text{def}}{=} \frac{1}{M} \sum_{m=1}^{M} \left\| \nabla f_m(x_*) \right\|^2$$
Recommend
More recommend