LAG: Lazily Aggregated Gradient for Communication-Efficient Distributed Learning. Tianyi Chen, Georgios Giannakis, Tao Sun, Wotao Yin (UMN, ECE; UCLA, Math). NeurIPS 2018. [Slide 1]
Overview: $\mathcal{M} := \{1, \ldots, M\}$
Reference: J. Dean, G. Corrado, R. Monga, K. Chen, M. Devin, M. Mao, A. Senior, P. Tucker, K. Yang, Q. V. Le, et al., “Large-scale distributed deep networks,” Proc. NIPS, Lake Tahoe, NV, pp. 1223–1231, 2012. [Slide 2]
Overview: $\mathcal{M} := \{1, \ldots, M\}$
- Solvers: gradient descent (GD), momentum methods, …
- Our method improves on GD by achieving:
  - the same convergence rate in theory
  - reduced communication in theory
  - more than 90% communication savings in practice
Reference: J. Dean, G. Corrado, R. Monga, K. Chen, M. Devin, M. Mao, A. Senior, P. Tucker, K. Yang, Q. V. Le, et al., “Large-scale distributed deep networks,” Proc. NIPS, Lake Tahoe, NV, pp. 1223–1231, 2012. [Slide 3]
Recommend
More recommend