function call
play

Function Call Re-Vectorization Pupil: Rubens Emilio Alves Moreira - PowerPoint PPT Presentation

Function Call Re-Vectorization Pupil: Rubens Emilio Alves Moreira Advisor: Fernando Magno Quinto Pereira Function Call Re-Vectorization Programmability Efficiency Function Call Re-Vectorization CUDA: kernel <<<#warps, #threads>>> (args...)


  1. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). DIVERGENCE! then T 3 WAIT! memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; Control flow graph for kernel .

  2. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). DIVERGENCE! then memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; T 3 Control flow graph for kernel .

  3. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } And waiting to process Kernel for parallel execution (CUDA). can be quite costly! DIVERGENCE! then memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; T 3 Control flow graph for kernel .

  4. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  5. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  6. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  7. Interlude: The Kernels of Samuel Seeking for the lowest execution time, what is the best initialization of data[] ? __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  8. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } F assigns the result of (size - i + 1) to data[i] Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  9. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; } M assigns the constant } value size to data[i] Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  10. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; Q does also assign size to } data[i] , but only for } threads with odd index i void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  11. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; P calls function random } and assigns its value, } modulo size , to data[i] void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  12. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  13. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  14. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  15. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  16. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 32193µs: if (i % 2) data[i] = size; half the values differ! } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  17. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } Divergence is int dimx = threadDim.x ; } harmful to void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ performance! data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 32193µs: if (i % 2) data[i] = size; half the values differ! } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  18. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 2 T 1 T 3 T 0 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy Control flow graph for memcpy .

  19. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } T 2 T 1 T 0 Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy Control flow graph for memcpy .

  20. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy T 2 T 1 T 0 Control flow graph for memcpy .

  21. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; Suboptimal behavior: } DIVERGENCE! } thread T 3 is inactive . Right? Kernel for parallel execution (CUDA). Observed behavior: FUNCTION memcpy t 3 Thr e ads c 5 c 6 t 2 c 0 c 1 c 2 c 3 c 4 T 2 T 1 T 0 b 5 b 6 b 7 b 0 b 1 b 2 b b 4 t 1 3 a 0 a 1 a 2 a a 4 t 0 3 Time Control flow graph for memcpy .

  22. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! Not really! We are using } Dynamic Parallelism Kernel for parallel execution (CUDA). Observed behavior: FUNCTION memcpy t 3 Thr e ads c 5 c 6 t 2 c 0 c 1 c 2 c 3 c 4 T 2 T 1 T 0 b 5 b 6 b 7 b 0 b 1 b 2 b b 4 t 1 3 a 0 a 1 a 2 a a 4 t 0 3 Time Control flow graph for memcpy .

  23. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — DYNAMIC PARALLELISM

  24. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). CUDA’s nested kernel call : kernel<<<#warps, #threads>>>( args … )

  25. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). CUDA’s nested kernel call : kernel<<<#warps, #threads>>>( args … ) Launches a new kernel , with all threads active , to process the target function

  26. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy runs once memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); per active thread at } else { memcpy<<<1,4>>> ; call site! } } Kernel for parallel execution (CUDA). Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  27. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); SIMD } else { ; kernels! } } Kernel for parallel execution (CUDA). Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  28. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  29. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  30. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Dynamic parallelism changes the dimension of the parallelism Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  31. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Dynamic parallelism changes the dimension of the parallelism Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 All threads are b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 active upon entry! a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  32. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA).

  33. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel

  34. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel kernel <<<#warps , #threads>>> (args...);

  35. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel kernel <<<#warps , #threads>>> (args...); Parallel Time ~ Kernel Launching Overhead + Sequential Time #warps x #threads

  36. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { Important benefits when new work is invoked within an executing GPU int tid( threadId.x ); program include removing the burden on the programmer to if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); marshal and transfer the data on which to operate. Additional Parallelism: } else { parallelism can be exposed to the GPU’s hardware schedulers and Nested kernel calls ; } load balancers dynamically, adapting in response to data-driven } decisions or workloads. Algorithms and programming patterns that Kernel for parallel execution (CUDA). had previously required modifications to eliminate recursion, irregular loop structure, or other constructs that do not fit a flat, single-level of parallelism can be more transparently expressed . Dynamic Parallelism in CUDA kernel <<<#warps , #threads>>> (args...); Source : http://developer.download.nvidia.com/assets/cuda/files/CUDADownloads/TechBrief_Dynamic_Parallelism_in_CUDA.pdf Parallel Time ~ Kernel Launching Overhead + Sequential Time Has the overhead of allocating and scheduling #warps x #threads a new kernel

  37. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — WARP-SYNCHRONOUS PROGRAMMING

  38. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-synchronous: All threads must be active upon entrance to the procedure!

  39. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  40. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-level parallelism! Mappings : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] increment int value = [11 21 31 11]

  41. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-level parallelism! Mappings : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] increment int value = [11 21 31 11] Reductions : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] sum int scalar = (70 70 70 70)

  42. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks (early languages for SIMD machines): • C* • MPL • POMPC Block wherein threads are temporarily re-enabled !

  43. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks : T 2 T 3 T 1 DIVERGENCE: T 0

  44. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. All threads are temporarily * Everywhere blocks : re-enabled to process code within EVERYWHERE block! T 2 T 3 T 1 DIVERGENCE: T 0 T 1 T 2 T 3 T 0 EVERYWHERE: EVERYWHERE { code... }

  45. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. All threads are temporarily * Everywhere blocks : re-enabled to process code within EVERYWHERE block! T 2 T 3 T 1 DIVERGENCE: T 0 T 1 T 2 T 3 T 0 EVERYWHERE: EVERYWHERE { code... } Divergences restored ! T 2 T 3 T 1 DIVERGENCE: T 0

  46. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks (early languages for SIMD machines): • C* • MPL • POMPC * Shuffle (warp aware instruction): shuffle(v, i) allows thread to read the value stored in variable v , but in the register space of thread i

  47. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  48. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  49. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } 1. everywhere re-enables all threads! 2. Skip formerly divergent threads! SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  50. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! SIMD implementation of memory copy. 3. Extracts values for current thread “ i ”. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  51. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! 3. Extracts values for current thread “ i ”. SIMD implementation of memory copy. 4. We then call our SIMD kernel memcpy . void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  52. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! 3. Extracts values for current thread “ i ”. SIMD implementation of memory copy. 4. We then call our SIMD kernel memcpy . void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” The target architecture must provide a dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); directive to re-enable inactive threads . N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  53. Warp-Synchronous Programming: Everywhere blocks + Shuffle everywhere temporarily re-enables all threads within the warp SPMD/SIMT SIMD handle divergences all threads must be active at the call site shuffle extracts private values and broadcasts them to all threads

  54. Warp-Synchronous Programming: Everywhere blocks + Shuffle everywhere temporarily re-enables all threads within the warp crev SPMD/SIMT SIMD handle divergences all threads must be active at the call site shuffle extracts private values and broadcasts them to all threads

  55. Warp-Synchronous Programming: Everywhere blocks + Shuffle We have defined the semantics of EVERYWHERE in the SIMD world: Semantics of everywhere in SIMD: encode the building blocks to implement this construct

  56. Warp-Synchronous Programming: Everywhere blocks + Shuffle We have defined the semantics of EVERYWHERE in the SIMD world: Implemented an abstract SIMD Extended Intel's SPMD compiler machine in Prolog, with with a new idiom , function call support to everywhere blocks. re-vectorization , that enhances native dynamic parallelism.

  57. Warp-Synchronous Programming: CREV crev memcmp(i)

  58. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — FUNCTION CALL RE-VECTORIZATION

  59. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy.

  60. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism

  61. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Too many lines of code Warp-synchronous wrapper for SIMD memory copy.

  62. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { crev memcpy(dest[tid], src[tid], N[tid]); dest[i] = src[i]; } } Simplicity + Performance, a.k.a. } CREV SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Too many lines of code Warp-synchronous wrapper for SIMD memory copy.

  63. Function Call Re-Vectorization: Reprise Programmability Efficiency

  64. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Parallelism Programmability Efficiency

  65. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Parallelism Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  66. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Function Call Parallelism Re-Vectorization Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  67. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  68. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  69. Function Call Re-Vectorization: Reprise Re-enable all threads within warp, CUDA: kernel <<<#warps, #threads>>> (args...) avoiding kernel allocation and scheduling Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  70. Function Call Re-Vectorization: Reprise Re-enable all threads within warp, CUDA: kernel <<<#warps, #threads>>> (args...) avoiding kernel allocation and scheduling Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance Allowing SIMD functions to be executed, ... without diving into __shuffle (data, tid, var) __shuffle (data, tid, var) warp-synchronous coding! ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  71. Function Call Re-Vectorization: Properties Composability We are able to nest everywhere blocks: crev can be called recursively ! // Traverses the matrix in a depth-first fashion Important benefits when new work is invoked within an void dfs ( uniform struct Graph& graph, uniform int root, float * uniform f) { executing GPU program include removing the burden on the programmer to marshal and transfer the data on which to if (graph.node[root].visited) return ; operate. Additional parallelism can be exposed to the GPU’s graph.node[root].visited = true ; hardware schedulers and load balancers dynamically, adapting // Eventual computations in response to data-driven decisions or workloads. f[root] = graph.node[root].length / Algorithms and programming patterns that had previously (float) graph.num_nodes; required modifications to eliminate recursion, // Traversal irregular loop structure, or other constructs that foreach (i = 0 ... graph.node[root].length) { do not fit a flat , single-level of parallelism can be more int child = graph.node[root].edge[i].node; if (!graph.node[child].visited) { transparently expressed . crev dfs (graph, child, f); } Dynamic Parallelism in CUDA } Source :http://developer.download.nvidia.com/assets/cuda/files/CUD ADownloads/TechBrief_Dynamic_Parallelism_in_CUDA.pdf }

  72. Function Call Re-Vectorization: Properties Multiplicative composition The target crev function runs once per active thread. In a warp of W threads, the function may run up to W times. If the call is recursive, up to W N times. // Traverses the matrix in a depth-first fashion T 2 T 1 T 3 T 0 void dfs ( uniform struct Graph& graph, uniform int root, float * uniform f) { dfs() n = 1 Vectorized! if (graph.node[root].visited) return ; graph.node[root].visited = true ; // Eventual computations f[root] = graph.node[root].length / (float) graph.num_nodes; // Traversal foreach (i = 0 ... graph.node[root].length) { int child = graph.node[root].edge[i].node; if (!graph.node[child].visited) { crev dfs (graph, child, f); } } }

Recommend


More recommend