27 Youngjoon Jo An abstract model void main() { � foreach(Point p : points) { � foreach(Node n : p.oracleNodes()) { � update(p, n); � } � } � } � �
28 Youngjoon Jo Iteration space of traversal void main() { � foreach(Point p : points) { � foreach(Node n : p.oracleNodes()) { � update(p, n); � } � } � Nodes } � � Points
29 Youngjoon Jo Iteration space of traversal Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
30 Youngjoon Jo Iteration space of traversal Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
31 Youngjoon Jo How to vectorize? Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
32 Youngjoon Jo Outline • Example & Abstract Model • Point Blocking to Enable SIMD • Traversal Splicing to Enhance Utilization • Automatic Transformation • Evaluation and Conclusion
33 Youngjoon Jo Point blocking [OOPSLA 2011] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
34 Youngjoon Jo Point blocking [OOPSLA 2011] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
35 Youngjoon Jo Point blocked code void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } �
36 Youngjoon Jo Point blocked code void recurse(Block *block, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } �
37 Youngjoon Jo Point blocked code void recurse(Block *block, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � Function update(p, n); � body } else { � recurse(p, n->left); � recurse(p, n->right); � } � } �
38 Youngjoon Jo Point blocked code Loop over points in block Function body void recurse(Block *block, Node *n) { � for (int i = 0; i < block->size; i++) { � Point *p = block->p[i]; � if (truncate(p, n)) continue; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � } �
39 Youngjoon Jo Point blocked code Loop over points in block Function body void recurse(Block *block, Node *n) { � for (int i = 0; i < block->size; i++) { � Point *p = block->p[i]; � if (truncate(p, n)) continue; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � } �
40 Youngjoon Jo Point blocked code Loop over points in block Function body void recurse(Block *block, Node *n) { � Block *nextBlock = // next level block � for (int i = 0; i < block->size; i++) { � Point *p = block->p[i]; � if (truncate(p, n)) continue; � if (n->isLeaf()) { � update(p, n); � } else { � nextBlock->add(p); � } � } � } �
41 Youngjoon Jo Point blocked code Loop over points in block Function body Next block recurses children void recurse(Block *block, Node *n) { � Block *nextBlock = // next level block � for (int i = 0; i < block->size; i++) { � Point *p = block->p[i]; � if (truncate(p, n)) continue; � if (n->isLeaf()) { � update(p, n); � } else { � nextBlock->add(p); � } � } � if (nextBlock->size > 0) { � recurse(nextBlock, n->left); � recurse(nextBlock, n->right); � } � } �
42 Youngjoon Jo Point blocking [OOPSLA 2011] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
43 Youngjoon Jo Point blocking [OOPSLA 2011] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
44 Youngjoon Jo Point blocking [OOPSLA 2011] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
45 Youngjoon Jo Analogous to packet SIMD Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
46 Youngjoon Jo Analogous to packet SIMD Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Breaks down when points diverge
47 Youngjoon Jo Packet SIMD has poor utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E Partial Full F SIMD SIMD G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
48 Youngjoon Jo Packet SIMD has poor utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 Partial Full SIMD SIMD 2 3 4 5 6 7 8 9 10 11 12 13 14 15
49 Youngjoon Jo SIMD utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H Partial SIMD Full SIMD 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 SIMD utilization = Work in full SIMD / Total work
50 Youngjoon Jo SIMD utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H Partial SIMD Full SIMD 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Work in full SIMD / Total work = Circles in blue / Total circles = 24/74 = 0.32
51 Youngjoon Jo Use larger block size Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H Partial SIMD Full SIMD 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Use block size larger than SIMD width and compact points!
52 Youngjoon Jo Use larger block size Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
53 Youngjoon Jo Better utilization with larger block size Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H Partial SIMD 1 Full SIMD 2 3 4 5 6 7 8 9 10 11 12 13 14 15
54 Youngjoon Jo Better utilization with larger block size Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H Partial SIMD Full SIMD 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Circles in blue / Total circles = 64/74 = 0.86
55 Youngjoon Jo SIMD utilization – Block size 1 SIMD Utilization 0.8 Barnes-Hut 0.6 Point Correlation 0.4 Nearest Neighbor 0.2 Vantage Point 0 4 40 400 4000 40000 400000 Photon Mapping Block Size
56 Youngjoon Jo Ideal utilization Ideal Utilization 1 SIMD Utilization 0.8 Barnes-Hut 0.6 Point Correlation 0.4 Nearest Neighbor 0.2 Vantage Point 0 4 40 400 4000 40000 400000 Photon Mapping Block Size Block size equal to total points yields ideal SIMD utilization
57 Youngjoon Jo Use max block! Problem solved? 1 SIMD Utilization 0.8 Barnes-Hut 0.6 Point Correlation 0.4 Nearest Neighbor 0.2 Vantage Point 0 4 40 400 4000 40000 400000 Photon Mapping Block Size
58 Youngjoon Jo Large block has poor locality 1 SIMD Utilization 0.8 Barnes-Hut 0.6 Point Correlation 0.4 Nearest Neighbor 0.2 Vantage Point 0 4 40 400 4000 40000 400000 Photon Mapping Block Size
59 Youngjoon Jo Large block has poor locality 1 SIMD Utilization 0.8 Barnes-Hut 0.6 Point Correlation 0.4 Nearest Neighbor 0.2 Vantage Point 0 4 40 400 4000 40000 400000 Photon Mapping Block Size Need schedule with good utilization and good locality
60 Youngjoon Jo Outline • Example & Abstract Model • Point Blocking to Enable SIMD • Traversal Splicing to Enhance Utilization • Automatic Transformation • Evaluation and Conclusion
61 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
62 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
63 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 4 5 6 7 8 9 10 11 12 13 14 15
64 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 4 5 6 7 8 9 10 11 12 13 14 15
65 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 8 9 10 11 12 13 14 15
66 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 8 9 10 11 12 13 14 15
67 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 4. Repeat 2-3 until finished 8 9 10 11 12 13 14 15
68 Youngjoon Jo Traversal splicing [OOPSLA 2012] Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 4. Repeat 2-3 until finished 8 9 10 11 12 13 14 15
69 Youngjoon Jo Can change order of points Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 4. Repeat 2-3 until finished 8 9 10 11 12 13 14 15 We can change the order of paused points, but how?
70 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Resume at next node 4 5 6 7 4. Repeat 2-3 until finished 8 9 10 11 12 13 14 15 Insight: points which reach same nodes are likely to have similar traversals in future. Dynamic sorting on traversal history.
71 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A B C Points D E F G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 4 5 6 7 8 9 10 11 12 13 14 15
72 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A B B C C Points D D E E F F G G H H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Reorder points at splice node 4 5 6 7 8 9 10 11 12 13 14 15
73 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A B C C E Points D F E H F B G D H G 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Reorder points at splice node 4 5 6 7 8 9 10 11 12 13 14 15
74 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A A B C C C E E Points D F F E H H F B B G D D H G G 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Reorder points at splice node 4 5 6 7 4. Resume at next node 8 9 10 11 12 13 14 15
75 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A E B C B C E D Points D F G E H A F B C G D F H G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Reorder points at splice node 4 5 6 7 4. Resume at next node 5. Repeat 2-4 until finished 8 9 10 11 12 13 14 15
76 Youngjoon Jo Dynamic sorting Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A E B C B C E D Points D F G E H A F B C G D F H G H 1. Designate splice nodes 1 2. Traverse up to splice node 2 3 3. Reorder points at splice node 4 5 6 7 4. Resume at next node 5. Repeat 2-4 until finished 8 9 10 11 12 13 14 15
77 Youngjoon Jo Dynamic sorting enhances utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A E B C B C E D Points D F G E H A Partial F B C SIMD G D F H G H 1 Full SIMD 2 3 4 5 6 7 8 9 10 11 12 13 14 15
78 Youngjoon Jo Dynamic sorting enhances utilization Nodes 1 2 4 8 9 5 10 11 3 6 12 13 7 14 15 A A E B C B C E D Points D F G E H A Partial F B C SIMD G D F H G H 1 Full SIMD 2 3 4 5 6 7 8 9 10 11 12 13 14 15 Circles in blue / Total circles = 48/74 = 0.65
79 Youngjoon Jo SIMD utilization – splice depth 1 N/A SIMD Utilization 0.8 2 0.6 4 0.4 6 0.2 8 0 4 40 400 4000 40000 400000 10 Block Size Nearest Neighbor
80 Youngjoon Jo SIMD utilization – splice depth Block size: 512 Splice depth: 10 1 N/A SIMD Utilization 0.8 2 0.6 4 0.4 Block size: 6 0.2 524288 8 0 4 40 400 4000 40000 400000 10 Block Size Nearest Neighbor
81 Youngjoon Jo SIMD utilization 1 0.9 0.8 Utilization 0.7 0.6 0.5 0.4 0.3 0.2 0.1 0 Baseline A Priori Sort Dynamic Sort Ideal
82 Youngjoon Jo SIMD utilization 1 0.9 0.8 Utilization 0.7 0.6 0.5 0.4 0.3 0.2 0.1 0 Baseline A Priori Sort Dynamic Sort Ideal Dynamic sorting can automatically extract almost the maximum amount of SIMD utilization
83 Youngjoon Jo Outline • Example & Abstract Model • Point Blocking to Enable SIMD • Traversal Splicing to Enhance Utilization • Automatic Transformation • Evaluation and Conclusion
84 Youngjoon Jo Automatic transformation • Point blocking Jo and Kulkarni [OOPSLA 2011] • Traversal splicing Jo and Kulkarni [OOPSLA 2012]
85 Youngjoon Jo Automatic transformation • Our key addition for SIMD: Layout transformation from AoS (array of structures) to SoA (structure of arrays) • + Allows vector load/stores • + Packed data has better spatial locality • - More overhead in moving data AoS (array of structures) x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 SoA (structure of arrays) x1 x2 x3 x4 y1 y2 y3 y4 z1 z2 z3 z4
86 Youngjoon Jo AoS to SoA layout • Whole program AoS to SoA layout transformation difficult to automate with aliasing • Limit scope to traversal code only • Copy in to SoA before traversal • Copy out to AoS after traversal • Inter-procedural, flow-insensitive analysis • Determine which point fields should be SoA • Conservatively ensure correctness
87 Youngjoon Jo AoS to SoA layout � � � void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } �
88 Youngjoon Jo AoS to SoA layout struct Point { float f1, f2, f3; } � � � void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � � �
89 Youngjoon Jo AoS to SoA layout struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � �
90 Youngjoon Jo AoS to SoA layout struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
91 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � void recurse(Point *p, Node *n) { � if (truncate(p, n)) return; � if (n->isLeaf()) { � update(p, n); � } else { � recurse(p, n->left); � recurse(p, n->right); � } � } � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
92 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 � f2 � � f3 � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
93 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ � f2 � � f3 � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
94 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 � � f3 � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
95 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ � � f3 � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
96 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ � � f3 ✓ � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
97 Youngjoon Jo Transforming SoA fields struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ � � f3 ✓ � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � } � �
98 Youngjoon Jo Transforming SoA fields struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ � � f3 ✓ � � bool truncate(Block *block, int bi, Node *n) { � return block->f1[bi] == n->point->f1; � } � � void update(Block *block, int bi, Node *n) { � block->f2[bi] += n->point->f3; � } � �
99 Youngjoon Jo Correctness violation example struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ ✓ � � f3 � � bool truncate(Block *block, int bi, Node *n) { � return block->f1[bi] == n->point->f1; � } � � void update(Block *block, int bi, Node *n) { � block->f2[bi] += n->point->f2; � } � �
100 Youngjoon Jo Ensuring correctness struct Point { float f1, f2, f3; } � struct Node { Node *left, *right; Point *point; } � � � � Point-access Non-point-access � Read Write Read Write � � f1 ✓ ✓ � f2 ✓ ✓ ✓ � � f3 � � bool truncate(Point *p, Node *n) { � return p->f1 == n->point->f1; � } � � void update(Point *p, Node *n) { � p->f2 += n->point->f3; � p->f3 = 1; � } � � Sound analysis conservatively proves SoA transformation correct. Suffices to transform all of our benchmarks.
Recommend
More recommend