Improving Performance of OpenCL on CPUs

Ralf Karrenberg <karrenberg@cs.uni-saarland.de>
Sebastian Hack <hack@cs.uni-saarland.de>

European LLVM Conference, London, April 12-13, 2012

Data-Parallel Languages: OpenCL

  __kernel void DCT(__global float *output, __global float *input,
                    __global float *dct8x8, __local float *inter,
                    const uint width, const uint blockWidth,
                    const uint inverse) {
    uint tidX = get_global_id(0);
    ...
    inter[lidY*blockWidth + lidX] = ...
    barrier(CLK_LOCAL_MEM_FENCE);

    float acc = 0.0f;
    for (uint k=0; k < blockWidth; k++) {
      uint index1 = lidX*blockWidth + k;
      uint index2 = (inverse) ? lidY*blockWidth + k
                              : k*blockWidth + lidY;
      acc += inter[index1] * dct8x8[index2];
    }
    output[tidY*width + tidX] = acc;
  }

OpenCL: Execution Model

  (figure: the OpenCL execution model, in which the global index space is
  divided into work-groups of work-items, the "threads" referred to on the
  following slides)

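To make the driver code on the following slides easier to follow, here is a minimal sketch (not part of the original slides; function name and parameters are made up) of the ID arithmetic the execution model implies: a thread's global ID is its group ID times the work-group size plus its local ID.

  /* Hypothetical helper illustrating the OpenCL ID relationship that the
   * later slides compute inline. */
  static inline int globalID(int dim, const int *groupIDs,
                             const int *localSizes, const int *localIDs) {
    /* get_global_id(dim) == group ID * work-group size + local ID */
    return groupIDs[dim] * localSizes[dim] + localIDs[dim];
  }
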
CPU Driver Implementation (2D, Naïve)

  cl_int clEnqueueNDRangeKernel(Kernel scalarKernel, TA argStruct,
                                int *globalSizes, int *localSizes) {
    int groupSizeX = globalSizes[0] / localSizes[0];
    int groupSizeY = globalSizes[1] / localSizes[1];
    // Loop over groups.
    for (int groupX=0; groupX<groupSizeX; ++groupX) {
      for (int groupY=0; groupY<groupSizeY; ++groupY) {
        // Loop over threads in group.
        for (int lidY=0; lidY<localSizes[1]; ++lidY) {
          for (int lidX=0; lidX<localSizes[0]; ++lidX) {
            scalarKernel(argStruct, lidX, lidY, groupX, groupY,
                         globalSizes, localSizes);
          }
        }
      }
    }
  }

CPU Driver Implementation (2D, Group Kernel)

  cl_int clEnqueueNDRangeKernel(Kernel groupKernel, TA argStruct,
                                int *globalSizes, int *localSizes) {
    int groupSizeX = globalSizes[0] / localSizes[0];
    int groupSizeY = globalSizes[1] / localSizes[1];
    // Loop over groups.
    for (int groupX=0; groupX<groupSizeX; ++groupX) {
      for (int groupY=0; groupY<groupSizeY; ++groupY) {
        // Loop over threads in group is now inside the group kernel.
        groupKernel(argStruct, groupX, groupY, globalSizes, localSizes);
      }
    }
  }

CPU Driver Implementation (2D, Group Kernel, OpenMP)

  cl_int clEnqueueNDRangeKernel(Kernel groupKernel, TA argStruct,
                                int *globalSizes, int *localSizes) {
    int groupSizeX = globalSizes[0] / localSizes[0];
    int groupSizeY = globalSizes[1] / localSizes[1];
    #pragma omp parallel for
    for (int groupX=0; groupX<groupSizeX; ++groupX) {
      for (int groupY=0; groupY<groupSizeY; ++groupY) {
        // Loop over threads in group is inside the group kernel.
        groupKernel(argStruct, groupX, groupY, globalSizes, localSizes);
      }
    }
  }

Group Kernel (2D, Scalar)

  void groupKernel(TA argStruct, int *groupIDs,
                   int *globalSizes, int *localSizes) {
    for (int lidY=0; lidY<localSizes[1]; ++lidY) {
      for (int lidX=0; lidX<localSizes[0]; ++lidX) {
        scalarKernel(argStruct, lidX, lidY, groupIDs,
                     globalSizes, localSizes); // to be inlined
      }
    }
  }

Group Kernel (2D, Scalar, Inlined)

  void groupKernel(TA argStruct, int *groupIDs,
                   int *globalSizes, int *localSizes) {
    for (int lidY=0; lidY<localSizes[1]; ++lidY) {
      for (int lidX=0; lidX<localSizes[0]; ++lidX) {
        uint tidX = get_global_id(0);
        ...
        inter[lidY*blockWidth + lidX] = ...
        barrier(CLK_LOCAL_MEM_FENCE);

        float acc = 0.0f;
        for (uint k=0; k < blockWidth; k++) {
          uint index1 = lidX*blockWidth + k;
          uint index2 = (inverse) ? lidY*blockWidth + k
                                  : k*blockWidth + lidY;
          acc += inter[index1] * dct8x8[index2];
        }
        output[tidY*width + tidX] = acc;
      }
    }
  }

Group Kernel (2D, Scalar, Inlined, Optimized (1))

  void groupKernel(TA argStruct, int *groupIDs,
                   int *globalSizes, int *localSizes) {
    for (int lidY=0; lidY<localSizes[1]; ++lidY) {
      for (int lidX=0; lidX<localSizes[0]; ++lidX) {
        uint tidX = localSizes[0] * groupIDs[0] + lidX;
        ...
        inter[lidY*blockWidth + lidX] = ...
        barrier(CLK_LOCAL_MEM_FENCE);

        float acc = 0.0f;
        for (uint k=0; k < blockWidth; k++) {
          uint index1 = lidX*blockWidth + k;
          uint index2 = (inverse) ? lidY*blockWidth + k
                                  : k*blockWidth + lidY;
          acc += inter[index1] * dct8x8[index2];
        }
        output[tidY*width + tidX] = acc;
      }
    }
  }

Group Kernel (2D, Scalar, Inlined, Optimized (2))

  void groupKernel(TA argStruct, int *groupIDs,
                   int *globalSizes, int *localSizes) {
    for (int lidY=0; lidY<localSizes[1]; ++lidY) {
      uint LIC = lidY*blockWidth;
      for (int lidX=0; lidX<localSizes[0]; ++lidX) {
        uint tidX = localSizes[0] * groupIDs[0] + lidX;
        ...
        inter[LIC + lidX] = ...
        barrier(CLK_LOCAL_MEM_FENCE);

        float acc = 0.0f;
        for (uint k=0; k < blockWidth; k++) {
          uint index1 = lidX*blockWidth + k;
          uint index2 = (inverse) ? LIC + k : k*blockWidth + lidY;
          acc += inter[index1] * dct8x8[index2];
        }
        output[tidY*width + tidX] = acc;
      }
    }
  }

Barrier Synchronization

  (same code as on the previous slide, with the barrier(CLK_LOCAL_MEM_FENCE)
  call highlighted)

  Problem: barrier() requires all threads of a work-group to reach it before
  any of them may continue, but the driver executes the threads one after
  another in the nested loops. The kernel therefore has to be split at each
  barrier.

Barrier Synchronization: Example

  (figure, built up over several steps: the kernel's CFG, with blocks a..e, is
  cut at every barrier; blocks containing a barrier are split, e.g. a into
  a1/a2, c into c1/c2, d into d1/d2. The resulting pieces are grouped into
  continuation functions F1..F4. Each continuation executes the code between
  two barriers and ends by returning the ID of the continuation to run next,
  e.g. "next: F2"; the final piece returns.)

Group Kernel (1D, Scalar, Barrier Synchronization)

  void groupKernel(TA argStruct, int groupID,
                   int globalSizes, int localSize, ...) {
    void *data[localSize] = alloc(localSize * liveValSize);
    int next = BARRIER_BEGIN;
    while (true) {
      switch (next) {
        case BARRIER_BEGIN:
          for (int i=0; i<localSize; ++i)
            next = F1(argStruct, tid, ..., &data[i]); // B2
          break;
        ...
        case B4:
          for (int i=0; i<localSize; ++i)
            next = F4(tid, ..., &data[i]); // B3 or END
          break;
        case BARRIER_END:
          return;
      }
    }
  }

OpenCL: Exploiting Parallelism on CPUs

  CPU (1 core):                All threads run sequentially.
  CPU (4 cores):               Each core executes 1 thread.
  CPU (4 cores, SIMD width 4): Each core executes 4 threads.

  (figure: a 4x4 grid of threads 0..15 mapped onto the cores for each of the
  three configurations)

Group Kernel (2D, SIMD)

  void groupKernel(TA argStruct, int *groupIDs,
                   int *globalSizes, int *localSizes) {
    for (int lidY=0; lidY<localSizes[1]; ++lidY) {
      for (int lidX=0; lidX<localSizes[0]; lidX+=4) {
        __m128i lidXV = <lidX, lidX+1, lidX+2, lidX+3>;
        simdKernel(argStruct, lidXV, lidY, groupIDs,
                   globalSizes, localSizes); // to be inlined
      }
    }
  }

  - Whole-Function Vectorization (WFV) of the kernel code
  - New kernel computes 4 “threads” at once using the SIMD instruction set
    (a hand-written sketch of the idea follows below)
  - Challenge: diverging control flow

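To give an impression of what the vectorized body computes, here is a rough, hand-written sketch with SSE intrinsics of the DCT accumulation for four consecutive lidX values. It is not the actual WFV output and not code from the talk; the function name dctRowSIMD and its parameter list are made up, and tidX is assumed to be the global ID of the first of the four threads.

  #include <xmmintrin.h>  /* SSE intrinsics */

  typedef unsigned int uint;

  /* Hypothetical hand-vectorized DCT body for threads lidX .. lidX+3. */
  static void dctRowSIMD(float *output, const float *inter, const float *dct8x8,
                         uint lidX, uint lidY, uint tidX, uint tidY,
                         uint width, uint blockWidth, uint inverse) {
    __m128 accV = _mm_setzero_ps();
    for (uint k = 0; k < blockWidth; k++) {
      /* index1 = lidX*blockWidth + k differs per lane: gather with scalar loads */
      __m128 interV = _mm_set_ps(inter[(lidX+3)*blockWidth + k],
                                 inter[(lidX+2)*blockWidth + k],
                                 inter[(lidX+1)*blockWidth + k],
                                 inter[(lidX+0)*blockWidth + k]);
      /* index2 does not depend on lidX: uniform value, broadcast to all lanes */
      uint index2 = inverse ? lidY*blockWidth + k : k*blockWidth + lidY;
      __m128 dctV = _mm_set1_ps(dct8x8[index2]);
      accV = _mm_add_ps(accV, _mm_mul_ps(interV, dctV));
    }
    /* four consecutive global IDs: one contiguous store */
    _mm_storeu_ps(&output[tidY*width + tidX], accV);
  }
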
Diverging Control Flow

  (figure: a CFG with blocks a..f containing a loop, and the trace of each of
  four threads through it; the threads take different paths, e.g. a b c e f
  vs. a b d e f, and iterate the loop a different number of times. After
  linearization, every thread traces a b c d e b c d e f, with the results of
  inactive threads masked out.)

  - Different threads execute different code paths
  - Execute everything, mask out results of inactive threads
    (using predication, blending); a small sketch follows below
  - Control flow to data flow conversion on ASTs [Allen et al., POPL'83]
  - Whole-Function Vectorization on SSA CFGs [Karrenberg & Hack, CGO'11]

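As a minimal, hedged sketch of the masking idea (not code from the talk): an if/else over a per-lane condition becomes a computation of both sides followed by a blend on the condition mask. The SSE select idiom below assumes neither branch has side effects; the function name is made up.

  #include <xmmintrin.h>  /* SSE intrinsics */

  /* Scalar source (per thread):
   *   if (x < 0.0f) r = a; else r = b;
   * Vectorized with masking: compute both sides, then blend by the mask. */
  static __m128 selectExample(__m128 x, __m128 a, __m128 b) {
    __m128 mask   = _mm_cmplt_ps(x, _mm_setzero_ps()); /* lanes where x < 0 */
    __m128 takenA = _mm_and_ps(mask, a);               /* keep a where mask is set */
    __m128 takenB = _mm_andnot_ps(mask, b);            /* keep b where mask is clear */
    return _mm_or_ps(takenA, takenB);                  /* blended result */
  }
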