Integrating GPU Support for OpenMP Offloading Directives into Clang
Carlo Bertolli, Samuel F. Antao, Gheorghe-Teodor Bercea, Arpith C. Jacob, Alexandre E. Eichenberger, Tong Chen, Zehra Sura, Hyojin Sung, Georgios Rokos, David Appelhans, Kevin O'Brien
IBM T.J. Watson Research Center
The Second Workshop on the LLVM Compiler Infrastructure in HPC, November 15, 2015
[Toolchain diagram, slides 2-3: a C/C++ (or XLF Fortran) input file is preprocessed and compiled by Clang/LLVM along two paths, one for the POWER8 host and one for the NVPTX device; the device path runs through PTXAS and NVLINK, and LD bundles host and device objects into a fat binary. At run time the host OpenMP runtime (omp/lomp) and libtarget drive the device runtime (libtarget-nvptx) across PCI Express. The second slide highlights where the OpenMP implementation lives: in Clang code generation.]
Research Topics
• Implement OpenMP on the GPU
  - Hard to do because of the GPU's programming constraints
  - Cannot re-use the CPU OpenMP implementation (codegen + runtime library)
  - Performance is guaranteed to be poor in many cases
  - What should be optimized?
• Integration into an open-source compiler
  - Cannot be disruptive to the compiler's design and implementation
  - Based on existing assumptions: OpenMP is implemented in Clang codegen
  - Gather community interest for this implementation to land
OpenMP Challenges for GPUs

Running example (built up over four slides; the later builds add thread_limit(256), num_threads(128) and nowait to the same code):

#pragma omp target teams thread_limit(256)
{
  if (a[0]++ > 0) {
    #pragma omp parallel for num_threads(128)
    for (int i = 0; i < n; i++) {
      if (omp_get_thread_num() > 0) {
        #pragma omp simd
        for (int s = 0; s < 32; s++) { .. }
      } else {
        #pragma omp simd
        for (int s = 0; s < 4; s++) { .. }
      }
    }
  } else if (b[0]++ > 0) {
    #pragma omp parallel for nowait
    for (int i = 0; i < n*2; i++) { .. }
  }
}

The challenges this example exposes:
• Sequential code within a team: only the team master executes the statements outside the parallel regions (e.g. the a[0]++ and b[0]++ tests).
• Parallel threads: some of the team's threads execute the parallel for bodies concurrently, within the limits set by thread_limit(256) and num_threads(128).
• Explicit and implicit divergence between threads: the branch on omp_get_thread_num() and the nowait clause.
• No actual simd units on GPUs, yet the simd loops (trip counts 32 and 4) still have to be honored.
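On the simd point in particular: a conforming implementation may simply run a simd region as an ordinary sequential loop inside each GPU thread, since simd permits but never requires extra parallelism. The small program below is an illustration written for this write-up (not taken from the talk) and stays correct under that choice:

#include <omp.h>
#include <stdio.h>

int main(void) {
  int n = 256, a[256] = {0};
  /* one team so every a[i] is written exactly once */
  #pragma omp target teams num_teams(1) map(tofrom: a[:n])
  #pragma omp parallel for
  for (int i = 0; i < n; i++) {
    int acc = 0;
    /* the simd loop may simply be sequentialized per GPU thread */
    #pragma omp simd reduction(+:acc)
    for (int s = 0; s < 32; s++)
      acc += s;
    a[i] = acc;
  }
  printf("a[0] = %d\n", a[0]);   /* 0+1+...+31 = 496 */
  return 0;
}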
Control Loop Scheme

OpenMP source:

int tmp = 3;
#pragma omp target teams \
    thread_limit(5) \
    map(tofrom: tmp, a[:n])
{
  tmp += 3;                        // sequential reg. 1
  #pragma omp parallel for \
      num_threads(4)
  for (int i = 0; i < n; i++)      // parallel reg. 1
    a[i] += tmp;
  tmp = -1;                        // sequential reg. 2
}

Generated control loop (avoid dynamic parallelism and start all threads of the team up front):

nextState = SQ1;
while (!finished) {
  switch (nextState) {
  case SQ1:
    if (tid > 0) break;
    // sequential reg. 1
    nextState = PR1;
    break;
  case PR1:
    if (tid > 3) break;            // only the num_threads(4) threads take part
    // parallel reg. 1
    if (tid == 0) nextState = SQ2;
    break;
  case SQ2:
    if (tid > 0) break;
    // sequential reg. 2
    finished = true;
    break;
  }
  __syncthreads();
}

[Thread-activity diagram built up across slides 9-12: all five threads (0-4) enter the control loop; only thread 0 executes sequential region 1; threads 0-3 execute parallel region 1 while thread 4 skips to the barrier; only thread 0 executes sequential region 2, after which every thread sees finished and exits.]
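To see the scheme end to end, here is a hand-written CUDA rendering of the state machine above, applied to the tmp/a[] example. It is a sketch for illustration, not what the compiler emits: it assumes the team-shared state (nextState, finished, tmp) lives in shared memory and that one CUDA block of five threads plays the role of the team.

#include <cstdio>
#include <cuda_runtime.h>

// Hand-written sketch of the control-loop state machine; illustration only.
// Team state is in shared memory; each round ends at a barrier so the
// master's updates are visible before the next round starts.
enum { SQ1, PR1, SQ2 };

__global__ void controlLoop(int *a, int n) {
  __shared__ int nextState;
  __shared__ int finished;
  __shared__ int tmp;                      // shared by the whole team
  const int tid = threadIdx.x;

  if (tid == 0) { nextState = SQ1; finished = 0; tmp = 3; }
  __syncthreads();

  while (!finished) {
    const int state = nextState;           // snapshot before the master updates it
    __syncthreads();
    switch (state) {
    case SQ1:                              // sequential reg. 1
      if (tid > 0) break;
      tmp += 3;
      nextState = PR1;
      break;
    case PR1:                              // parallel reg. 1, num_threads(4)
      if (tid > 3) break;
      for (int i = tid; i < n; i += 4)     // cyclic split of iterations over 4 threads
        a[i] += tmp;
      if (tid == 0) nextState = SQ2;
      break;
    case SQ2:                              // sequential reg. 2
      if (tid > 0) break;
      tmp = -1;
      finished = 1;
      break;
    }
    __syncthreads();                       // whole team meets before the next round
  }
}

int main() {
  const int n = 16;
  int *a;
  cudaMallocManaged(&a, n * sizeof(int));
  for (int i = 0; i < n; i++) a[i] = i;
  controlLoop<<<1, 5>>>(a, n);             // one team, thread_limit(5)
  cudaDeviceSynchronize();
  printf("a[0] = %d (expect 6)\n", a[0]);  // 0 + (3 + 3)
  cudaFree(a);
  return 0;
}

All five threads execute the loop, but the guards at the top of each case reproduce the sequential/parallel split of the OpenMP regions, exactly as in the diagram above.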
Control Loop & Clang
• Rules for modular integration
• Do's
  - Extend OpenMP-related functions
  - Add new function calls
  - Add new runtime functions, only for specific targets
• Don'ts
  - Let the OpenMP target implementation influence every C/C++ construct
  - Add metadata and process OpenMP later, when more convenient
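The hooks used on the next slides (EnterTargetLoop(), ExitTargetLoop(), EnterParallel(), ExitParallel()) follow from these rules: generic OpenMP codegen stays unchanged and only calls out to a runtime-codegen object that a target can specialize. The plain C++ sketch below is an analogy of that layering written for this write-up; the class and method names are illustrative and are not Clang's actual API.

#include <iostream>

// Illustrative analogy of the integration pattern; not Clang source code.
// Generic OpenMP codegen calls fixed hook points; only a target-specific
// subclass gives them device-specific behavior (e.g. the control loop).
struct OpenMPRuntimeCodegen {
  virtual ~OpenMPRuntimeCodegen() = default;
  virtual void EnterTargetLoop() {}   // no-op on the host path
  virtual void ExitTargetLoop() {}
};

struct NVPTXRuntimeCodegen : OpenMPRuntimeCodegen {
  void EnterTargetLoop() override {
    std::cout << "emit control-loop skeleton, set insert point in SQ1\n";
  }
  void ExitTargetLoop() override {
    std::cout << "close control loop, emit __syncthreads()\n";
  }
};

// The directive codegen is written once and never special-cased per target.
void EmitTargetDirective(OpenMPRuntimeCodegen &rt) {
  rt.EnterTargetLoop();
  std::cout << "emit target region statements\n";
  rt.ExitTargetLoop();
}

int main() {
  NVPTXRuntimeCodegen nvptx;
  OpenMPRuntimeCodegen host;
  EmitTargetDirective(nvptx);  // device compilation pass
  EmitTargetDirective(host);   // host compilation pass
  return 0;
}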
Example: Codegen Control Loop for #target

void CGF::EmitOMPTargetDirective(..) {
  // control flow will lead to ...
  if (isTargetMode)
    CGM.getOpenMPRuntime().EnterTargetLoop();
  // emit target region statements
  CGF.EmitStmt(CS->getCapturedStmt());
  if (isTargetMode)
    CGM.getOpenMPRuntime().ExitTargetLoop();
}

Generated device code:

nextState = SQ1;
while (!finished) {
  switch (nextState) {
  case SQ1:
    if (tid > 0) break;
    // target region statements go here
  }
  __syncthreads();
}

Across the three build slides: the call to EnterTargetLoop() emits the control-loop skeleton shown above (the while/switch and the closing __syncthreads()); it then sets the insert point inside the first sequential case SQ1; EmitStmt() finally generates code for the target region statements at that insert point, up to the first #pragma omp parallel.
Example: Codegen Control Loop for #parallel

void CGF::EmitOMPParallelDirective(..) {
  // control flow will lead to ...
  if (isTargetMode)
    CGM.getOpenMPRuntime().EnterParallel();
  // emit parallel region statements
  CGF.EmitStmt(CS->getCapturedStmt());
  if (isTargetMode)
    CGM.getOpenMPRuntime().ExitParallel();
}

Generated device code:

nextState = SQ1;
while (!finished) {
  switch (nextState) {
  case SQ1:
    if (tid > 0) break;
    // sequential reg. 1
    nextState = PR1;
    break;
  case PR1:
    if (tid >= num_threads) break;   // only the requested threads take part
    // parallel region statements go here
  }
  __syncthreads();
}

Across the two build slides: EnterParallel() terminates the current sequential case (nextState = PR1; break;), opens the new case PR1 with its thread-count guard, and sets the insert point inside it; EmitStmt() then generates the parallel region statements there, and ExitParallel() closes the case and resumes a sequential state (SQ2 in the earlier example).
Control Loop Overhead vs CUDA (1/2)

OpenMP version:

#pragma omp target teams \
    distribute parallel for \
    schedule(static,1)
for (i = 0; i < n; i++)
  a[i] += b[i] + c[i];

CUDA version:

for (int i = threadIdx.x + blockIdx.x * blockDim.x;
     i < n;
     i += blockDim.x * gridDim.x)
  a[i] += b[i] + c[i];

Vector Add                 CUDA      Control Loop
#registers/thread          16        64
Shared memory (bytes)      0         280
Occupancy                  95.9%     26.6%
Execution time (usec.)     1523.5    1988.5

NVIDIA Tesla K40m, -maxrregcount=64
Control Loop Overhead vs CUDA (2/2)

OpenMP version:

#pragma omp target teams \
    distribute parallel for \
    schedule(static,1)
for (i = 0; i < n; i++)
  for (j = 0; j < n_loop; j++)
    a[i] += b[i] + c[i*n_loop + j];

CUDA version:

for (int i = threadIdx.x + blockIdx.x * blockDim.x;
     i < n;
     i += blockDim.x * gridDim.x)
  for (j = 0; j < n_loop; j++)
    a[i] += b[i] + c[i*n_loop + j];

Vector Matrix Add          CUDA      Control Loop
#registers/thread          18        64
Shared memory (bytes)      0         280
Occupancy                  97.3%     49.5%
Execution time (usec.)     70832.0   78333.0

NVIDIA Tesla K40m, -maxrregcount=64
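For readers who want to try something similar, below is a self-contained OpenMP 4.x offloading version of the vector-add benchmark with simple host-side timing. It is a sketch written for this write-up: the problem size, data type, map clauses and timing method are assumptions, not the setup behind the numbers above, and the measured interval includes the host/device data transfers, so it is not directly comparable to a profiler's kernel-only time.

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal vector-add driver; assumed sizes and timing, not the exact
   benchmark used for the table above. */
int main(void) {
  const int n = 1 << 24;
  double *a = (double *)malloc(n * sizeof(double));
  double *b = (double *)malloc(n * sizeof(double));
  double *c = (double *)malloc(n * sizeof(double));
  for (int i = 0; i < n; i++) { a[i] = 0.0; b[i] = 1.0; c[i] = 2.0; }

  double t0 = omp_get_wtime();
  #pragma omp target teams distribute parallel for schedule(static,1) \
      map(tofrom: a[:n]) map(to: b[:n], c[:n])
  for (int i = 0; i < n; i++)
    a[i] += b[i] + c[i];
  double t1 = omp_get_wtime();

  /* includes data transfers; use a profiler for kernel-only time */
  printf("vector add: %.1f usec, a[0] = %f\n", (t1 - t0) * 1e6, a[0]);
  free(a); free(b); free(c);
  return 0;
}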