Improving Sampling-based Uncertainty Quantification Performance Through Embedded Ensemble Propagation Eric Phipps (etphipp@sandia.gov), Marta D’Elia, Mohamed Ebeida Sandia National Laboratories and Ahmad Rushdi, UC Davis QUIET 2017 July 18-21, 2017 SAND2017-7005 C Sandia National Laboratories is a multimission laboratory managed and operated by National Technology and Engineering Solutions of Sandia LLC, a wholly owned subsidiary of Honeywell International Inc. for the U.S. Department of Energy’s National Nuclear Security Administration under contract DE-NA0003525.
Can Exascale Solve the UQ Challenge? • Focusing on forward propagation of uncertainty through sampling-based methods – Common task in many UQ analyses • Key challenge: – Accurately quantifying rare events and localized behavior in high-dimensional uncertain input spaces for expensive simulations • Can exascale solve this challenge?
Emerging Architectures Motivate New UQ Approaches • UQ approaches traditionally implemented as an outer loop: – Repeatedly call forward simulation for each sample realization – Coarse-grained parallelism over samples • Increasing UQ performance will require – Speeding-up each sample evaluation, and/or – Evaluating more samples in parallel • Many important scientific simulations will struggle with upcoming architectures http://dakota.sandia.gov – Irregular memory access patterns – Difficulty in exploiting fine-grained parallelism (vectorization, fine-grained threads) • Increasing UQ parallelism requires exploiting massive increase in on-node parallelism • Improve performance by propagating multiple samples together at lowest levels of simulation (embedded ensemble propagation) – Improve memory access patterns – Expose new dimensions of structured fine-grained parallelism – Reduce aggregate communication
Sparse CRS-Format Matrix-Vector Product // CRS // CRS Matrix for an arbitrary floating Matrix for an arbitrary floating-point type T point type T template <typename template typename T> T> struct struct CrsMatrix CrsMatrix { int int num_rows num_rows; ; // number of rows in matrix // number of rows in matrix int int num_entries num_entries; ; // number of // number of nonzeros nonzeros in matrix in matrix int int *row_map row_map; ; // starting index of each row, [0,num_rows+1) // starting index of each row, [0,num_rows+1) int *col_entry int col_entry; ; // column indices for each // column indices for each nonzero, nonzero, [0,num_entries) [0,num_entries) T *values; T *values; // matrix values of type T, [0,num_entries) // matrix values of type T, [0,num_entries) }; }; // Serial // Serial CRS CRS matrix matrix-vector product for arbitrary floating vector product for arbitrary floating-point type T point type T template template <typename typename T> T> void void crs_mat_vec crs_mat_vec(const const CrsMatrix CrsMatrix<T>& A, <T>& A, const const T *x, T *y) { T *x, T *y) { for (int for int row= row=0; row< ; row<A.num_rows A.num_rows; ++row) { ; ++row) { const const int int entry_begin entry_begin = = A.row_map A.row_map[row]; [row]; const const int int entry_end entry_end = = A.row_map A.row_map[row+ [row+1]; ]; T sum = T sum = 0.0 0.0; for (int for int entry = entry = entry_begin entry_begin; entry < ; entry < entry_end entry_end; ++entry) { ; ++entry) { const const int int col = col = A.col_entry A.col_entry[entry]; [entry]; sum += sum += A.values A.values[entry] * x[col]; [entry] * x[col]; } y[ y[row row] = sum; ] = sum; } }
Simultaneous ensemble propagation • PDE: f ( u , y ) = 0 • Propagating m samples – block diagonal (nonlinear) system: m m m X X X F ( U , Y ) = 0 , U = e i ⊗ u i , Y = e i ⊗ y i , F = e i ⊗ f ( u i , y i ) , i =1 i =1 i =1 m ∂ F i ⊗ ∂ f X e i e T ∂ U = ∂ u i i =1 0 5 10 15 20 25 30 0 10 20 30 – Spatial DOFs for each sample stored consecutively
Ensemble Matrix-Vector Product // Ensemble matrix-vector // Ensemble matrix vector product product template <typename template typename T, T, int int m> m> void void ensemble_crs_mat_vec ensemble_crs_mat_vec(const const CrsMatrix CrsMatrix<T>& A, <T>& A, const const T *x, T *y) { T *x, T *y) { for for (int int e= e=0; e < m; ++e) { ; e < m; ++e) { for (int for int row row=0; i< ; i<A.num_rows A.num_rows; ++ ; ++row row) { ) { const const int int entry_begin entry_begin = = A.row_map A.row_map[row row]; ]; const const int int entry_end entry_end = A.row_map = A.row_map[row+ [row+1]; ]; T sum = T sum = 0.0 0.0; for for (int int entry = entry = entry_begin entry_begin; entry < ; entry < entry_end entry_end; ++entry) { ; ++entry) { const const int int col = col = A.col_entry A.col_entry[entry]; [entry]; sum += A.values sum += A.values[entry + e* [entry + e*A.num_entries A.num_entries] * x[col + e* ] * x[col + e*A.num_rows A.num_rows]; ]; } y[row y[ row + e* + e*A.num_rows A.num_rows] = sum; ] = sum; } } }
Simultaneous ensemble propagation • Commute Kronecker products: m m m F ( ˜ ˜ U , ˜ Y ) = 0 , ˜ u i ⊗ e i , ˜ y i ⊗ e i , ˜ X X X U = Y = F = f ( u i , y i ) ⊗ e i , i =1 i =1 i =1 m ∂ ˜ F ∂ f X ⊗ e i e T = i ∂ ˜ ∂ u i U i =1 0 5 10 15 20 25 30 0 10 20 30 – m sample values for each DOF stored consecutively
Commuted, Ensemble Matrix-Vector Product // Ensemble matrix-vector product using commuted layout // Ensemble matrix vector product using commuted layout template <typename template typename T, T, int int m> m> void void ensemble_commuted_crs_mat_vec ensemble_commuted_crs_mat_vec(const const CrsMatrix CrsMatrix<T>& A, <T>& A, const const T *x, T *y) { T *x, T *y) { for (int for int row row=0; i< ; i<A.num_rows A.num_rows; ++ ; ++row row) { ) { const const int int entry_begin entry_begin = = A.row_map A.row_map[row row]; ]; const const int int entry_end entry_end = = A.row_map A.row_map[row+ [row+1]; ]; T sum[m]; T sum[m]; for for (int int e= e=0; e < m; ++e) ; e < m; ++e) sum[e] = 0.0 sum[e] = 0.0; for for (int int entry = entry = entry_begin entry_begin; entry < ; entry < entry_end entry_end; ++entry) { ; ++entry) { const int const int col = col = A.col_entry A.col_entry[entry]; [entry]; for for (int int e= e=0; e < m; ++e) { ; e < m; ++e) { sum[e] += A.values sum[e] += A.values[entry*m + e] * x[col*m + e]; [entry*m + e] * x[col*m + e]; } } for for (int int e= e=0; e < m; ++e) ; e < m; ++e) y[row y[ row*m + e] = sum[e]; *m + e] = sum[e]; } } • Automatically reuse non-sample dependent data • Sparse access latency amortized across ensemble • Math on ensemble naturally maps to vector arithmetic • Communication latency amortized across ensemble
C++ Ensemble Scalar Type // Ensemble scalar type // Ensemble scalar type template <typename template typename U, U, int int m> m> struct struct Ensemble { Ensemble { U val[m]; U val[m]; Ensemble( Ensemble(const const U& U& v) { v) { for for (int int e= e=0; e<m; ++e) val[m] = v; } ; e<m; ++e) val[m] = v; } Ensemble& Ensemble& operator operator=(const const Ensemble& a) { Ensemble& a) { for (int for int e= e=0; e<m; ++e) val[m] ; e<m; ++e) val[m] = = a.val a.val[m]; [m]; return return *this this; } Ensemble& operator Ensemble& operator+=( +=(const const Ensemble& a) { Ensemble& a) { for for (int int e= e=0; e<m; ++e) val[m] += ; e<m; ++e) val[m] += a.val a.val[m]; [m]; return *this return this; } // ... // ... }; }; template template <typename typename U, U, int int m> m> Ensemble<U,m Ensemble< U,m> > operator operator*( *(const const Ensemble< Ensemble<U,m U,m>& a, >& a, const const Ensemble< Ensemble<U,m U,m>& b) { >& b) { Ensemble<U,m Ensemble< U,m> c; > c; for for (int int e= e=0; e<m; ++e) ; e<m; ++e) c.val c.val[e] = [e] = a.val a.val[e]* [e]*b.val b.val[e]; [e]; return return c; c; } // ... // ...
Ensemble Matrix-Vector Product Through Operator Overloading • Original matrix-vector product routine, instantiated with T = Ensemble<double,m> scalar type: // Serial Crs // Serial Crs matrix matrix-vector product for arbitrary floating vector product for arbitrary floating-point type T point type T template template <typename typename T> T> void void crs_mat_vec crs_mat_vec(const const CrsMatrix CrsMatrix<T>& A, <T>& A, const const T *x, T *y) { T *x, T *y) { for for (int int row= row=0; row< ; row<A.num_rows A.num_rows; ++row) { ; ++row) { const const int int entry_begin entry_begin = = A.row_map A.row_map[row]; [row]; const const int int entry_end entry_end = = A.row_map A.row_map[row+ [row+1]; ]; T sum = 0.0 T sum = 0.0; for (int for int entry = entry = entry_begin entry_begin; entry < ; entry < entry_end entry_end; ++entry) { ; ++entry) { const const int int col = col = A.col_entry A.col_entry[entry]; [entry]; sum += A.values sum += A.values[entry] * x[col]; [entry] * x[col]; } y[ y[row row] = sum; ] = sum; } }
Recommend
More recommend