function call
play

Function Call Re-Vectorization Pupil: Rubens Emilio Alves Moreira - PowerPoint PPT Presentation

Function Call Re-Vectorization Pupil: Rubens Emilio Alves Moreira Advisor: Fernando Magno Quinto Pereira Function Call Re-Vectorization Programmability Efficiency Function Call Re-Vectorization CUDA: kernel <<<#warps, #threads>>> (args...)


  1. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). DIVERGENCE! then T 3 WAIT! memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; Control flow graph for kernel .

  2. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). DIVERGENCE! then memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; T 3 Control flow graph for kernel .

  3. Divergences void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { if ( threadId.x < 3) memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } And waiting to process Kernel for parallel execution (CUDA). can be quite costly! DIVERGENCE! then memcpy(A, B, N); T 0 T 1 T 2 else SIMD: LOCKSTEP EXECUTION! ; T 3 Control flow graph for kernel .

  4. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  5. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  6. Interlude: The Kernels of Samuel __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  7. Interlude: The Kernels of Samuel Seeking for the lowest execution time, what is the best initialization of data[] ? __global__ void dec2zero( int * data, int N) { int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[xIndex]--; } } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  8. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } F assigns the result of (size - i + 1) to data[i] Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  9. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; } M assigns the constant } value size to data[i] Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  10. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; Q does also assign size to } data[i] , but only for } threads with odd index i void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  11. Interlude: The Kernels of Samuel int idx = threadId.x ; int dimx = threadDim.x ; void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; P calls function random } and assigns its value, } modulo size , to data[i] void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  12. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size; } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  13. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = size - i + 1; } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  14. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { data[i] = random() % size; } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  15. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { if (i % 2) data[i] = size; } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  16. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } int dimx = threadDim.x ; } void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 32193µs: if (i % 2) data[i] = size; half the values differ! } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  17. __global__ void dec2zero( int * data, int N) { Interlude: The Kernels of Samuel int xIndex = blockIdx.x * blockDim.x + threadIdx.x ; if (xIndex < N) { while (data[xIndex] > 0) { data[ xIndex ]--; } int idx = threadId.x ; } Divergence is int dimx = threadDim.x ; } harmful to void F( int * data, int size) { 16250µs: for ( int i = idx ; i < size; i += dimx ) { values differ performance! data[i] = size - i + 1; by constant } } void M( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 16153µs: data[i] = size; all values are equal } } void Q( int * data, int size) { for ( int i = idx ; i < size; i += dimx ) { 32193µs: if (i % 2) data[i] = size; half the values differ! } } void P( int * data, int size) { 30210µs: for ( int i = idx ; i < size; i += dimx ) { normal distribution data[i] = random() % size; of values } } Source: http://homepages.dcc.ufmg.br/~fernando/classes/dcc888/ementa/slides/DivergenceAnalysis.pdf

  18. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 2 T 1 T 3 T 0 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy Control flow graph for memcpy .

  19. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } T 2 T 1 T 0 Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy Control flow graph for memcpy .

  20. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! } Kernel for parallel execution (CUDA). Divergent region: FUNCTION memcpy only active threads run memcpy T 2 T 1 T 0 Control flow graph for memcpy .

  21. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; Suboptimal behavior: } DIVERGENCE! } thread T 3 is inactive . Right? Kernel for parallel execution (CUDA). Observed behavior: FUNCTION memcpy t 3 Thr e ads c 5 c 6 t 2 c 0 c 1 c 2 c 3 c 4 T 2 T 1 T 0 b 5 b 6 b 7 b 0 b 1 b 2 b b 4 t 1 3 a 0 a 1 a 2 a a 4 t 0 3 Time Control flow graph for memcpy .

  22. Divergences: Coda void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { T 3 memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } DIVERGENCE! Not really! We are using } Dynamic Parallelism Kernel for parallel execution (CUDA). Observed behavior: FUNCTION memcpy t 3 Thr e ads c 5 c 6 t 2 c 0 c 1 c 2 c 3 c 4 T 2 T 1 T 0 b 5 b 6 b 7 b 0 b 1 b 2 b b 4 t 1 3 a 0 a 1 a 2 a a 4 t 0 3 Time Control flow graph for memcpy .

  23. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — DYNAMIC PARALLELISM

  24. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). CUDA’s nested kernel call : kernel<<<#warps, #threads>>>( args … )

  25. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); } else { ; } } Kernel for parallel execution (CUDA). CUDA’s nested kernel call : kernel<<<#warps, #threads>>>( args … ) Launches a new kernel , with all threads active , to process the target function

  26. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy runs once memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); per active thread at } else { memcpy<<<1,4>>> ; call site! } } Kernel for parallel execution (CUDA). Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  27. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); SIMD } else { ; kernels! } } Kernel for parallel execution (CUDA). Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  28. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  29. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  30. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Dynamic parallelism changes the dimension of the parallelism Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  31. Dynamic Parallelism void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } All threads } work on a SIMD implementation of memory copy. single vector! Dynamic parallelism changes the dimension of the parallelism Actual behavior with CUDA’s dynamic parallelism: From T 0 From T 1 From T 2 t 3 t 3 a 3 b 3 b 7 c 3 Threads c 0 c 1 c 2 c 3 c 4 c 5 c 6 a 2 b 2 b 6 c 2 c 6 t 2 t 2 All threads are b 0 b 1 b 2 b b 4 b 5 b 6 b 7 t 1 t 1 a 1 b 1 b 5 c 1 c 5 3 active upon entry! a 0 a 1 a 2 a a 4 t 0 t 0 a 0 a 4 b 0 b 4 c 0 c 4 3 Time Time

  32. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA).

  33. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel

  34. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel kernel <<<#warps , #threads>>> (args...);

  35. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { int tid( threadId.x ); if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); Parallelism: } else { Nested kernel calls ; } } Kernel for parallel execution (CUDA). Has the overhead of allocating and scheduling a new kernel kernel <<<#warps , #threads>>> (args...); Parallel Time ~ Kernel Launching Overhead + Sequential Time #warps x #threads

  36. Dynamic Parallelism void kernel( int ** A, int ** B, int * N) { Important benefits when new work is invoked within an executing GPU int tid( threadId.x ); program include removing the burden on the programmer to if (tid < 3) { CUDA’s Dynamic memcpy <<< 1, 4 >>> (A[tid], B[tid], N[tid]); marshal and transfer the data on which to operate. Additional Parallelism: } else { parallelism can be exposed to the GPU’s hardware schedulers and Nested kernel calls ; } load balancers dynamically, adapting in response to data-driven } decisions or workloads. Algorithms and programming patterns that Kernel for parallel execution (CUDA). had previously required modifications to eliminate recursion, irregular loop structure, or other constructs that do not fit a flat, single-level of parallelism can be more transparently expressed . Dynamic Parallelism in CUDA kernel <<<#warps , #threads>>> (args...); Source : http://developer.download.nvidia.com/assets/cuda/files/CUDADownloads/TechBrief_Dynamic_Parallelism_in_CUDA.pdf Parallel Time ~ Kernel Launching Overhead + Sequential Time Has the overhead of allocating and scheduling #warps x #threads a new kernel

  37. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — WARP-SYNCHRONOUS PROGRAMMING

  38. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-synchronous: All threads must be active upon entrance to the procedure!

  39. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  40. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-level parallelism! Mappings : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] increment int value = [11 21 31 11]

  41. Warp-Synchronous Programming void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. Warp-level parallelism! Mappings : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] increment int value = [11 21 31 11] Reductions : T 0 T 1 T 2 T 3 T 0 T 1 T 2 T 3 int value = [10 20 30 10] sum int scalar = (70 70 70 70)

  42. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks (early languages for SIMD machines): • C* • MPL • POMPC Block wherein threads are temporarily re-enabled !

  43. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks : T 2 T 3 T 1 DIVERGENCE: T 0

  44. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. All threads are temporarily * Everywhere blocks : re-enabled to process code within EVERYWHERE block! T 2 T 3 T 1 DIVERGENCE: T 0 T 1 T 2 T 3 T 0 EVERYWHERE: EVERYWHERE { code... }

  45. Warp-Synchronous Programming: Everywhere blocks void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. All threads are temporarily * Everywhere blocks : re-enabled to process code within EVERYWHERE block! T 2 T 3 T 1 DIVERGENCE: T 0 T 1 T 2 T 3 T 0 EVERYWHERE: EVERYWHERE { code... } Divergences restored ! T 2 T 3 T 1 DIVERGENCE: T 0

  46. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. * Everywhere blocks (early languages for SIMD machines): • C* • MPL • POMPC * Shuffle (warp aware instruction): shuffle(v, i) allows thread to read the value stored in variable v , but in the register space of thread i

  47. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  48. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  49. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } 1. everywhere re-enables all threads! 2. Skip formerly divergent threads! SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  50. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! SIMD implementation of memory copy. 3. Extracts values for current thread “ i ”. void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  51. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! 3. Extracts values for current thread “ i ”. SIMD implementation of memory copy. 4. We then call our SIMD kernel memcpy . void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  52. Warp-Synchronous Programming: Everywhere blocks + Shuffle void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } 1. everywhere re-enables all threads! } 2. Skip formerly divergent threads! 3. Extracts values for current thread “ i ”. SIMD implementation of memory copy. 4. We then call our SIMD kernel memcpy . void memcpy_wrapper( int ** dest, int ** src, int *N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” The target architecture must provide a dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); directive to re-enable inactive threads . N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Warp-synchronous wrapper for SIMD memory copy.

  53. Warp-Synchronous Programming: Everywhere blocks + Shuffle everywhere temporarily re-enables all threads within the warp SPMD/SIMT SIMD handle divergences all threads must be active at the call site shuffle extracts private values and broadcasts them to all threads

  54. Warp-Synchronous Programming: Everywhere blocks + Shuffle everywhere temporarily re-enables all threads within the warp crev SPMD/SIMT SIMD handle divergences all threads must be active at the call site shuffle extracts private values and broadcasts them to all threads

  55. Warp-Synchronous Programming: Everywhere blocks + Shuffle We have defined the semantics of EVERYWHERE in the SIMD world: Semantics of everywhere in SIMD: encode the building blocks to implement this construct

  56. Warp-Synchronous Programming: Everywhere blocks + Shuffle We have defined the semantics of EVERYWHERE in the SIMD world: Implemented an abstract SIMD Extended Intel's SPMD compiler machine in Prolog, with with a new idiom , function call support to everywhere blocks. re-vectorization , that enhances native dynamic parallelism.

  57. Warp-Synchronous Programming: CREV crev memcmp(i)

  58. DEPARTMENT OF COMPUTER SCIENCE — UNIVERSIDADE FEDERAL DE MINAS GERAIS (FEDERAL UNIVERSITY OF MINAS GERAIS, BRAZIL) — FUNCTION CALL RE-VECTORIZATION

  59. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy.

  60. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism

  61. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { dest[i] = src[i]; } } SIMD function SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Too many lines of code Warp-synchronous wrapper for SIMD memory copy.

  62. Function Call Re-Vectorization: Reprise void memcpy( int * dest, int * src, int N) { void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { for ( int i= threadId.x ; i < N; i+= threadDim.x ) { crev memcpy(dest[tid], src[tid], N[tid]); dest[i] = src[i]; } } Simplicity + Performance, a.k.a. } CREV SIMD implementation of memory copy. void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { memcpy<<<1, 4>>> (dest[tid], src[tid], N[tid]); } Too much overhead CUDA’s nested kernel call : Dynamic parallelism void memcpy_wrapper( int ** dest, int ** src, int * N, int mask) { EVERYWHERE { for ( int i=0; i < threadDim.x; ++i) { if ( not (mask & (1 << i))) continue ; // skip thread “ i ” dest_i = shuffle(dest, i); // if it is divergent src_i = shuffle(src, i); N_i = shuffle(N, i); memcpy(dest_i, src_i, N_i); } } } Too many lines of code Warp-synchronous wrapper for SIMD memory copy.

  63. Function Call Re-Vectorization: Reprise Programmability Efficiency

  64. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Parallelism Programmability Efficiency

  65. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Parallelism Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  66. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Dynamic Function Call Parallelism Re-Vectorization Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  67. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  68. Function Call Re-Vectorization: Reprise CUDA: kernel <<<#warps, #threads>>> (args...) Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  69. Function Call Re-Vectorization: Reprise Re-enable all threads within warp, CUDA: kernel <<<#warps, #threads>>> (args...) avoiding kernel allocation and scheduling Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance ... __shuffle (data, tid, var) __shuffle (data, tid, var) ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  70. Function Call Re-Vectorization: Reprise Re-enable all threads within warp, CUDA: kernel <<<#warps, #threads>>> (args...) avoiding kernel allocation and scheduling Simplicity Dynamic Function Call Parallelism Re-Vectorization Programmability High performance Allowing SIMD functions to be executed, ... without diving into __shuffle (data, tid, var) __shuffle (data, tid, var) warp-synchronous coding! ... __synchronize () Shuffle ... Nightmare __shuffle (data, tid, var) ... __synchronize () __shuffle (data, tid, var) Efficiency ... __shuffle (data, tid, var) ...

  71. Function Call Re-Vectorization: Properties Composability We are able to nest everywhere blocks: crev can be called recursively ! // Traverses the matrix in a depth-first fashion Important benefits when new work is invoked within an void dfs ( uniform struct Graph& graph, uniform int root, float * uniform f) { executing GPU program include removing the burden on the programmer to marshal and transfer the data on which to if (graph.node[root].visited) return ; operate. Additional parallelism can be exposed to the GPU’s graph.node[root].visited = true ; hardware schedulers and load balancers dynamically, adapting // Eventual computations in response to data-driven decisions or workloads. f[root] = graph.node[root].length / Algorithms and programming patterns that had previously (float) graph.num_nodes; required modifications to eliminate recursion, // Traversal irregular loop structure, or other constructs that foreach (i = 0 ... graph.node[root].length) { do not fit a flat , single-level of parallelism can be more int child = graph.node[root].edge[i].node; if (!graph.node[child].visited) { transparently expressed . crev dfs (graph, child, f); } Dynamic Parallelism in CUDA } Source :http://developer.download.nvidia.com/assets/cuda/files/CUD ADownloads/TechBrief_Dynamic_Parallelism_in_CUDA.pdf }

  72. Function Call Re-Vectorization: Properties Multiplicative composition The target crev function runs once per active thread. In a warp of W threads, the function may run up to W times. If the call is recursive, up to W N times. // Traverses the matrix in a depth-first fashion T 2 T 1 T 3 T 0 void dfs ( uniform struct Graph& graph, uniform int root, float * uniform f) { dfs() n = 1 Vectorized! if (graph.node[root].visited) return ; graph.node[root].visited = true ; // Eventual computations f[root] = graph.node[root].length / (float) graph.num_nodes; // Traversal foreach (i = 0 ... graph.node[root].length) { int child = graph.node[root].edge[i].node; if (!graph.node[child].visited) { crev dfs (graph, child, f); } } }

Recommend


More recommend