home ytang slides
play

/home/ytang/slides http://docs.nvidia.com/cuda/index.html - PowerPoint PPT Presentation

/home/ytang/slides http://docs.nvidia.com/cuda/index.html __global__ void foo( ... ) { __global__ void foo( int *bar ) { if (


  1. • /home/ytang/slides • http://docs.nvidia.com/cuda/index.html

  2. • • • • • • • • • •

  3. • • • • • •

  4. • • • •

  5. • __global__ void foo( ... ) { if ( threadIdx.x % 2 ) { ... } else { ... } } __global__ void foo( int *bar ) { if ( bar[threadIdx.x] ) { ... } else { ... } } __global__ void foo( ... ) { if ( ( threadIdx.x / warpSize ) % 2 ) { ... } else { ... } } __global__ void foo( int *bar ) { int tid = threadIdx.x; for( int i = 0; i < bar[tid]; i++ ) { ... } }

  6. • • • • • • GPU & On-chip memory • Off-chip GRAM • • • •

  7. • • cudaError_t cudaMalloc ( void** devPtr, size_t size ); • cudaError_t cudaFree ( void* devPtr ) ; • device-side malloc/new/free/delete • ptr[ index ] = value; • • cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ); • cudaError_t cudaMemset ( void* devPtr, int value, size_t count ); • •

  8. • • Coalesced, Aligned Coalesced, Unaligned • Strided Uncoalesced, Unaligned

  9. • __global__ void foo( int *bar ) { bar[thread_id()] = ...; } __global__ void foo( double *bar ) { double e = bar[thread_id()+16]; } __global__ void foo( int *bar ) { bar[thread_id()+8] = ...; } __global__ void foo( int2 *bar ) { int e = bar[thread_id()].x; } __global__ void foo( int *bar ) { bar[thread_id()+13] = ...; } __global__ void foo( float4 *bar ) { float e = bar[thread_id()].z; } __global__ void foo( int *bar ) { int e = bar[thread_id()+16]; } __global__ void foo( int *map, int *bar ) { int e = bar[ map[thread_id()] ]; }

  10. • • • • • • • •

  11. • • • • • • • • • •

  12. • • • • • • __global__ void foo( int *bar, int *map ) { int x = __ldg ( bar + map[ threadIdx.x ] ); } __global__ void foo2( const int* __restrict bar, int *map ) { int x = bar[ map[ threadIdx.x] ]; }

  13. • • • • •

  14. • __shared__ int sum; int b = ...; sum += b; int b = ...; register r = sum; r += b; sum = r; __shared__ int sum; int b0 = ...; register r0 = sum; r0 += b0; sum = r0; __shared__ int sum; int b1 = ...; register r1 = sum; r1 += b1; sum = r1; • • • modify = add, sub, exchange, etc... • float

  15. • • • type __shfl(type var, int srcLane, int width=warpSize); • __shfl() __shfl_up() __shfl_down() __shfl_xor()

  16. • • $S_n = \sum_{i=0}^{n-1} a_i$ • • for(int i = 0 ; i < n ; i++) sum += a[i]; •

Recommend


More recommend