Pipelined Multithreading Generation in a Polyhedral Compiler January 22nd 2020, IMPACT’20, HiPEAC, Bologna, Italy Harenome Ranaivoarivony-Razanajato, Cédric Bastoul, Vincent Loechner University of Strasbourg and Inria Nancy Grand Est Team ICPS | Scientific and Parallel Computing University of Strasbourg
S1 S2 … S6 (b) Dependency Graph Motivating Example 1 for ( int i = 1; i < N; ++i) 2 A[i] = f1(A[i], A[i - 1]); // S1 3 for ( int i = 1; i < N; ++i) 4 B[i] = f2(A[i], B[i - 1]); // S2 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
Motivating Example S1 1 for ( int i = 1; i < N; ++i) 2 A[i] = f1(A[i], A[i - 1]); // S1 S2 3 for ( int i = 1; i < N; ++i) 4 B[i] = f2(A[i], B[i - 1]); // S2 /* ... */ … 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 S6 (a) Sequential Program (b) Dependency Graph Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
S2(1), thread 1 S3(1), thread 1 S1(2), thread 2 S2(2), thread 2 S1(3), thread 3 Motivating Example S1(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
S3(1), thread 1 S1(2), thread 2 S2(2), thread 2 S1(3), thread 3 Motivating Example S1(1), thread 1 S2(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
S3(1), thread 1 S2(2), thread 2 S1(3), thread 3 Motivating Example S1(1), thread 1 S2(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 S1(2), thread 2 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
S2(2), thread 2 S1(3), thread 3 Motivating Example S1(1), thread 1 S2(1), thread 1 S3(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 S1(2), thread 2 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
S1(3), thread 3 Motivating Example S1(1), thread 1 S2(1), thread 1 S3(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 S1(2), thread 2 S2(2), thread 2 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
Motivating Example S1(1), thread 1 S2(1), thread 1 S3(1), thread 1 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 2 for ( int i = 1; i < N; ++i) 3 B[i] = f2(A[i], B[i - 1]); // S2 4 S1(2), thread 2 S2(2), thread 2 /* ... */ 5 for ( int i = 1; i < N; ++i) 6 F[i] = f6(E[i], F[i - 1]); // S6 7 S1(3), thread 3 (a) Sequential Program (b) Pipelined Execution Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
Speedup: 2.89 6 stages on an Intel Xeon E5-2620v3 @ 2.40 GHz, with N = 100,000 Motivating Example #pragma omp parallel 1 { 2 #pragma omp for schedule(static) ordered nowait 3 for ( int i = 1; i < N; ++i) 4 #pragma omp ordered 5 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 6 2 A[i] = f1(A[i], A[i - 1]); // S1 #pragma omp for schedule(static) ordered nowait 7 for ( int i = 1; i < N; ++i) 8 3 for ( int i = 1; i < N; ++i) #pragma omp ordered 9 4 B[i] = f2(A[i], B[i - 1]); // S2 B[i] = f2(A[i], B[i - 1]); // S2 10 5 /* ... */ /* ... */ 11 6 for ( int i = 1; i < N; ++i) #pragma omp for schedule(static) ordered nowait 12 7 F[i] = f6(E[i], F[i - 1]); // S6 for ( int i = 1; i < N; ++i) 13 #pragma omp ordered 14 F[i] = f6(E[i], F[i - 1]); // S6 15 } 16 (a) Sequential Program (b) Pipelined OpenMP target program Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
Motivating Example #pragma omp parallel 1 { 2 #pragma omp for schedule(static) ordered nowait 3 for ( int i = 1; i < N; ++i) 4 #pragma omp ordered 5 1 for ( int i = 1; i < N; ++i) A[i] = f1(A[i], A[i - 1]); // S1 6 2 A[i] = f1(A[i], A[i - 1]); // S1 #pragma omp for schedule(static) ordered nowait 7 for ( int i = 1; i < N; ++i) 8 3 for ( int i = 1; i < N; ++i) #pragma omp ordered 9 4 B[i] = f2(A[i], B[i - 1]); // S2 B[i] = f2(A[i], B[i - 1]); // S2 10 5 /* ... */ /* ... */ 11 6 for ( int i = 1; i < N; ++i) #pragma omp for schedule(static) ordered nowait 12 7 F[i] = f6(E[i], F[i - 1]); // S6 for ( int i = 1; i < N; ++i) 13 #pragma omp ordered 14 F[i] = f6(E[i], F[i - 1]); // S6 15 } 16 (a) Sequential Program (b) Pipelined OpenMP target program Speedup: 2.89 6 stages on an Intel Xeon E5-2620v3 @ 2.40 GHz, with N = 100,000 Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 1
Goals • Identify software pipelines in a polyhedral compiler • Generate pipelined multithreading using OpenMP Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 2
Polyhedral Model Introduction Background Pipelined Multithreading Generation Experimental Results Conclusion
• Synchronization • #pragma omp barrier : explicit synchronization barrier • omp_set_lock() and omp_unset_lock() : explicit lock mechanism • Clauses • nowait clause on worksharing constructs: omit the implicit barrier at the end of a worksharing construct • ordered clause on worksharing constructs: sequentialize a region OpenMP • #pragma based API for shared memory parallelism • Worksharing constructs • #pragma omp for • #pragma omp task Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 3
• Clauses • nowait clause on worksharing constructs: omit the implicit barrier at the end of a worksharing construct • ordered clause on worksharing constructs: sequentialize a region OpenMP • #pragma based API for shared memory parallelism • Worksharing constructs • #pragma omp for • #pragma omp task • Synchronization • #pragma omp barrier : explicit synchronization barrier • omp_set_lock() and omp_unset_lock() : explicit lock mechanism Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 3
OpenMP • #pragma based API for shared memory parallelism • Worksharing constructs • #pragma omp for • #pragma omp task • Synchronization • #pragma omp barrier : explicit synchronization barrier • omp_set_lock() and omp_unset_lock() : explicit lock mechanism • Clauses • nowait clause on worksharing constructs: omit the implicit barrier at the end of a worksharing construct • ordered clause on worksharing constructs: sequentialize a region Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 3
Polyhedral Model Introduction Background Pipelined Multithreading Generation Sequential Loop Fission Relaxed nowait prerequisites Alternative: Explicit synchronization Experimental Results Conclusion
Sequential Loop Fission • Goal: maximize the number of pipeline stages • Dependence analysis: identify Strongly Connected Components Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 4
Sequential Loop Fission for ( int i = 2; i < N; ++i) { 1 a[i] = h[i - 1] + R[i]; // S1 2 for ( int i = 2; i < N; ++i) { b[i] = a[i - 1] + a[i]; // S2 1 3 a[i] = h[i - 1] + R[i]; // S1 c[i] = b[i - 1] + b[i]; // S3 2 4 b[i] = a[i - 1] + a[i]; // S2 d[i] = c[i - 1] + c[i]; // S4 3 5 c[i] = b[i - 1] + b[i]; // S3 e[i] = d[i - 2] + d[i - 1]; // S5 4 6 d[i] = c[i - 1] + c[i]; // S4 f[i] = e[i - 2] + e[i - 1]; // S6 5 7 e[i] = d[i - 2] + d[i - 1]; // S5 g[i] = f[i] + X[i]; // S7 6 8 f[i] = e[i - 2] + e[i - 1]; // S6 h[i] = g[i] + Y[i]; // S8 7 9 g[i] = f[i] + X[i]; // S7 } 8 10 h[i] = g[i] + Y[i]; // S8 for ( int i = 2; i < N; ++i) { 9 11 u[i] = v[i - 1] + d[i]; // S9 10 u[i] = v[i - 1] + d[i]; // S9 12 v[i] = u[i] + Z[i]; // S10 11 v[i] = u[i] + Z[i]; // S10 13 } 12 } 14 (a) Original loop body (b) Fission of Strongly Connected Components Pipelined Multithreading Generation in a Polyhedral Compiler ,Harenome Razanajato et al. 5
Recommend
More recommend