Memory: C and x86 assembly 1
Loop Refresher mem ops Optimized or sum: .LFB2: .loc 1 2 0 unoptimized? .LVL0: .loc 1 4 0 eax == s 0 movl $0, %eax .LVL1: ??? 0 int sum(int count) testl %edi, %edi ??? 0 { int s = 0; jle .L4 s = 0 0 int i; movl $0, %eax i = 0 0 for(i = 0; i < count; i++) { movl $0, %edx s+= i; .LVL2: } .L5: return s; .loc 1 5 0 s+=i 0 } addl %edx, %eax .loc 1 4 0 i++ 0 addl $1, %edx i < count 0 cmpl %edi, %edx go again 0 jne .L5 .L4: .LVL3: .loc 1 8 0 Done 0 rep ; ret 2
Loop Refresher mem ops Optimized or sum: .LFB2: .loc 1 2 0 unoptimized? .LVL0: .loc 1 4 0 eax == s 0 movl $0, %eax .LVL1: ??? 0 int sum(int count) testl %edi, %edi ??? 0 { int s = 0; jle .L4 Optimized s = 0 0 int i; movl $0, %eax i = 0 0 for(i = 0; i < count; i++) { movl $0, %edx s+= i; .LVL2: } .L5: return s; .loc 1 5 0 s+=i 0 } addl %edx, %eax .loc 1 4 0 i++ 0 addl $1, %edx i < count 0 cmpl %edi, %edx go again 0 jne .L5 .L4: .LVL3: .loc 1 8 0 Done 0 rep ; ret 2
Array Access in a Loop sum: mem ops .LFB2: int array[10]; .loc 1 5 0 .LVL0: int sum(int count) .loc 1 8 0 s = 0 0 { movl $0, %eax access memory at int s = 0; .LVL1: count <= 0? 0 int i; testl %edi, %edi yes? skip everything 0 for(i = 0; i < count; i++) { jle .L4 array + (long)i * 4 s = 0 0 s+= array[i]; movl $0, %eax i = 0; is in a 32 bit number 0 } movl $0, %ecx return s; .LVL2: t1 = 0, this is a 64-bit 0 } movl $0, %edx version of i, for address calc purposes. .L5: .loc 1 9 0 addl array(,%rdx,4), %eax s += array[i] 1 .loc 1 8 0 i++ 0 addl $1, %ecx t1++ 0 addq $1, %rdx i < count 0 cmpl %edi, %ecx 0 jne .L5 .L4: .LVL3: .loc 1 12 0 0 rep ; ret .LFE2: The array is statically .size sum, .-sum allocate 40 bytes for array .comm array,40,32 aligned at 32 byte boundary declared here 3
Array Access in a Loop sum: mem ops .LFB2: int array[10]; .loc 1 5 0 .LVL0: int sum(int count) .loc 1 8 0 s = 0 0 { movl $0, %eax int s = 0; .LVL1: ??? 0 int i; testl %edi, %edi ??? 0 for(i = 0; i < count; i++) { jle .L4 s = 0 0 s+= array[i]; movl $0, %eax i = 0; is in a 32 bit number 0 } movl $0, %ecx return s; .LVL2: t1 = 0, this is 0 } movl $0, %edx an address (64 bits) .L5: .loc 1 9 0 addl array(,%rdx,4), %eax s += array[i] 1 .loc 1 8 0 i++ 0 addl $1, %ecx t1++ 0 addq $1, %rdx i < count 0 cmpl %edi, %ecx 0 jne .L5 .L4: .LVL3: .loc 1 12 0 Good Spatial 0 rep ; ret .LFE2: .size sum, .-sum Locality allocate 40 .comm array,40,32 bytes for array aligned at 32 byte boundary byte array array +1 ... first access Second access Third access 4
Long long int instead arayLoop2.c .globl sum .type sum, @function sum: .LFB2: .loc 1 5 0 .LVL0: .loc 1 8 0 s = 0 movl $0, %eax .LVL1: ??? long long int array[10]; testl %edi, %edi ??? jle .L4 s = 0 int sum(int count) movl $0, %eax i = 0 { movl $0, %edx int s = 0; .LVL2: cast count long long int i; movslq %edi,%rcx to a long long int for(i = 0; i < count; i++) { .LVL3: s+= array[i]; .L5: } .loc 1 9 0 addl array(,%rdx,8),%eax now x8 1 return s; instead x4 } .loc 1 8 0 i++ addq $1, %rdx i < count cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .LVL5: .loc 1 12 0 rep ; ret .LFE2: .size sum, .-sum 2x the 5 .comm array,80,32 bytes
Structs How big is aStruct? 24 bytes! me struct.c m 0 a ops 1 a First .globl sum 2 a struct aStruct{ .type sum, @function 3 a int a; sum: 4 b int b; .LFB2: 5 b Second 6 b char c; .loc 1 9 0 7 b s == rdi long long int d; .LVL0: 8 c Third }; .loc 1 13 0 9 padding for alignment t = 0; t += s->a 1 movl (%rdi), %eax 10 padding for alignment 11 padding for alignment t += s->b 1 int sum(struct aStruct * s) { addl 4(%rdi), %eax 12 padding for alignment .LVL1: 13 padding for alignment 14 padding for alignment cast s->c to 1 int t = 0; movsbl 8(%rdi),%edx long 15 padding for alignment t+= s->c t += s->a; addl %edx, %eax 16 d 17 d t += s->b; .LVL2: 18 d t+= s->d 1 t += s->c; addl 16(%rdi), %eax 19 d Fourth t += s->d; .loc 1 19 0 20 d 21 d ret 22 d 23 d return t; Spatial locality? } Some good some bad 6
Structs How big is aStruct? 24 bytes! me struct.c m 0 a ops 1 a First .globl sum 2 a struct aStruct{ .type sum, @function 3 a int a; sum: 4 b int b; .LFB2: 5 b Second 6 b char c; .loc 1 9 0 7 b s == rdi long long int d; .LVL0: 8 c Third }; .loc 1 13 0 9 padding for alignment t = 0; t += s->a 1 movl (%rdi), %eax 10 padding for alignment 11 padding for alignment t += s->b 1 int sum(struct aStruct * s) { addl 4(%rdi), %eax 12 padding for alignment .LVL1: 13 padding for alignment 14 padding for alignment cast s->c to 1 int t = 0; movsbl 8(%rdi),%edx long 15 padding for alignment t+= s->c t += s->a; addl %edx, %eax 16 d 17 d t += s->b; .LVL2: 18 d t+= s->d 1 t += s->c; addl 16(%rdi), %eax 19 d Fourth t += s->d; .loc 1 19 0 20 d 21 d ret 22 d 23 d return t; Spatial locality? } Note the usefulness of the Some good some bad immediate for mem ops. 6
2D Array sum: long long int array[10][10]; .LFB2: .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+1)*80 Good Spatial Locality 7
2D Array sum: long long int array[10][10]; .LFB2: .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+1)*80 Good Spatial Locality 7
2D Array sum: long long int array[10][10]; .LFB2: x = (x + 4x)*16 .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d x = 16x+64x long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 x*=80 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax 80 = 10 x sizeof(long long) x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+1)*80 Good Spatial Locality 7
2D Array sum: long long int array[10][10]; .LFB2: x = (x + 4x)*16 .loc 1 5 0 int sum(int x, int count) .LVL0: { .loc 1 8 0 r8 == s int s = 0; movl $0, %r8d x = 16x+64x long long int i; .LVL1: ??? for(i = 0; i < count; i++) { testl %esi, %esi ??? s+= array[x][i]; jle .L4 x*=80 cast x to long } movslq %edi,%rax long x = x + x*4 return s; leaq (%rax,%rax,4), %rax 80 = 10 x sizeof(long long) x *= 16, so x = } salq $4, %rax 16x + x*64 array+x addq $array, %rax s = 0 movl $0, %r8d i = 0 movl $0, %edx .LVL2: cast count to a movslq %esi,%rcx long long int .LVL3: .L5: .loc 1 9 0 s += array[x][i] addl (%rax), %r8d .loc 1 8 0 Step one entry in the array i ++ addq $1, %rdx addr += 8 addq $8, %rax cmpq %rcx, %rdx jne .L5 .LVL4: .L4: .loc 1 12 0 movl %r8d, %eax ret The array is a contiguous .LFE2: .size sum, .-sum .comm array,800,32 chunk of 800 bytes array + x*80 array + (x+1)*80 Good Spatial Locality 7
Recommend
More recommend