13.1 13.2 Compiling with Optimizations • Compilers usually have options to apply optimization • Example: gcc/g++ -O n – -O0 : ____ optimization (the default); generates unoptimized code but has the CS356 Unit 13 __________ compilation time. – -O1 : ___________ optimization; optimizes reasonably well but does not degrade compilation time significantly. – -O2 : ____ optimization; generates highly optimized code and has the slowest compilation time. Performance – -O3 : Full optimization as in -O2; also uses more aggressive automatic _______ of subprograms within a unit and attempts to vectorize loops. – -Os : Optimize space usage (code and data) of resulting program. • However, there are still many things the programmer can do to help https://gcc.gnu.org/onlinedocs/gnat_ugn/Optimization-Levels.html 13.3 13.4 Profiling gprof Output • Rule: Optimize the _________________ • To instrument your code for profiling: – A small optimization in code that accounts for a – $ gcc -pg prog1.c -o prog1 large amount of ________________ is worth far • Run your code void someTask( /* args */ ) more than a _______ optimization in code that { accounts for a small fraction of the execution /* Segment A – sequential code */ – ./prog1 time for(int i=0; i<N; i++){ – This will run the program and generate a file with statistics: gmon.out /* Segment B */ • Q: How do you know where time is being spent? for(int j=0; j<N; j++){ • Process the profiler results /* Segment C */ • A: _________________! } – $ gprof prog1 gmon.out > results.txt } – Instrument your code to take statistics as it runs return 0; – View results.txt and then can show you what percentage of time } each function or even line of code was % cumulative self self total responsible for time seconds seconds calls s/call s/call name Which code segment should you – Common profilers 42.96 4.48 4.48 56091649 0.00 0.00 Board::operator<(Board const&) const likely focus your time optimizing? 
6.43 5.15 0.67 2209524 0.00 0.00 std::_Rb_tree<...>::_M_lower_bound(...) • gprof (usually standard with Unix / Linux installs) and 5.08 5.68 0.53 108211500 0.00 0.00 __gnu_cxx::__normal_iterator<...>::operator+(...) gcc/g++ 4.51 6.15 0.47 4419052 0.00 0.00 Board::Board(Board const&) • Intel VTune 4.32 6.60 0.45 1500793 0.00 0.00 void std::__adjust_heap<...>(...) • MS Visual Studio Profiling Tools 3.84 7.00 0.40 28553646 0.00 0.00 PuzzleMove::operator>(PuzzleMove const&) const
13.5 13.6 Reducing Function Calls #include <stdio.h> int x=0; • Consider the "original" code to the int f1() right { /* Produces & returns an int */ • Can we optimize by converting the ____________________; } original code to the proposed int main() optimized code? { int y = f1() + f1() + f1() + f1(); – _____! printf("%d\n", y); return 0; • Functions may have _____________ } Original Code – What if ____________________ in the OPTIMIZATION BLOCKERS #include <iostream> function using namespace std; ... int main() { int y = 4*f1(); cout << y << endl; return 0; } Proposed Optimization 13.7 13.8 Function Inlining Inlining int x=0; main: ... int f1() movl $0, %eax • Inlining is the process of _________________ the function code { call f1 /* Produces & returns an int */ movl %eax, %ebx into each location where it is called return ++x; movl $0, %eax } call f1 • This avoids the ________________ of a function call at the cost addl %eax, %ebx g++ -O1 … int main() movl $0, %eax of greater ________________________ { call f1 int y = f1() + f1() + f1() + f1(); addl %eax, %ebx – Note: Compiling with optimization levels above -O0 allows the compiler printf("%d\n", y); movl $0, %eax return 0; call f1 to auto-inline functions of its choice (usually small functions) } leal (%rbx,%rax), %edx int x=0; main: int x=0; int x=0; ... int f1() movl x(%rip), %edx # %edx = x int f1() int f1() { leal 4(%rdx), %eax # %eax = 4+x { { /* Produces & returns an int */ movl %eax, x(%rip) # x = 4+x /* Produces & returns an int */ /* Produces & returns an int */ return ++x; g++ -O2 … leal 6(%rdx,%rdx,2), %edx # %edx=3x+6 return ____; return ____; } addl %eax, %edx # %edx=4x+10 } } int main() int main() int main() { { { int y = f1() + f1() + f1() + f1(); int y = f1() + f1() + f1() + f1(); int y = ++x + ++x + ++x + ++x; printf("%d\n", y); printf("%d\n", y); printf("%d\n", y); return 0; return 0; return 0; } } }
13.9 13.10 Inlining Limits of Inlining int f1(vector<int>& v1) _Z2f1RSt6vectorIiSaIiEE: • Inlining can only be done when the definition of the function { .LFB509: int total = 0; .cfi_startproc is in the same __________________________ for(int i=0; i < v1.size(); i++){ movq (%rdi), %rsi total += v1[i]; movq 8(%rdi), %rax – Recall the compiler only sees the code in the current translation unit } subq %rsi, %rax return total; sarq $2, %rax (file) and so won't see the ______________ of f1() in lib.c to be able to } g++ -O1 … movq %rax, %rdi testq %rax, %rax inline it je .L4 movl $0, %ecx lib1.c prog1.c movl $0, %edx movl $0, %eax extern int x; int x=0; .L3: addl (%rsi,%rcx,4), %eax int f1(); int f1() addl $1, %edx { movslq %edx, %rcx int main() /* Produces & returns an int */ cmpq %rdi, %rcx { return ++x; jb .L3 int y = f1() + f1() + f1() + f1(); } rep ret printf("%d\n", y); .L4: return 0; movl $0, %eax } ret Notice there is no call to vector's _______ function. Compiling with optimization levels -O0 would cause it to NOT inline the call prog1.o lib1.o 13.11 13.12 C++ Templates and Inlining Memory Aliasing int twiddle1(long x, long y) { Since .h files are #include'd, any functions defined in the .h file can • • Consider twiddle1 and its x += y; x += y; then be inlined return x; // x + 2*y function to return x + 2y } • This is one reason templates offer some advantage in C++ is • Now suppose we have // Now with pointers because their definition is ALWAYS available int twiddle2a(long* xp, long* yp) { pointers as arguments *xp += *yp; *xp += *yp; prog1.c vec.h return *xp; – We could write twiddle2a (to try } #include "vec.h" template<typename T> to do what twiddle1 did) class vec int twiddle2b(long* xp, long* yp) int main() { { – Is it equivalent to twiddle1? { public: *xp += 2 * (*yp); vec<int> myvec; ... return *xp; for(int i=0; i < myvec.size(); i++){ int size() const; – Is twiddle2b equivalent to } ... private: } int size_; twiddle2a? int ans = 0; ... 
}; void f1(long x, long y) } { • _______! template <typename T> ans = twiddle1(x,y); int vec<T>::size() const ans += twiddle2a(&x,&y); { return size_; } – _____ if xp and yp _________ ans += twiddle2b(&x,&y); } the same value.
13.13 13.14 Memory Aliasing Memory Aliasing int twiddle1(long x, long y) int twiddle1(long x, long y) { { • The compiler must play it safe and • Aliasing may also affect inlining x += y; x += y; x += y; x += y; generate code that would work if both – -O1 does not inline twiddle2a return x; // x + 2*y return x; // x + 2*y } } pointers contain the same address (i.e. – Running -O3 does end up inlining twiddle2a // Now with pointers // Now with pointers reference the same variable)…we call f1: int twiddle2a(long* xp, long* yp) int twiddle2a(long* xp, long* yp) subq $16, %rsp { { this memory aliasing movq %rdi, 8(%rsp) *xp += *yp; *xp += *yp; movq %rsi, (%rsp) *xp += *yp; *xp += *yp; return *xp; return *xp; // Notice the compiler optimized } leaq (%rdi,%rsi,2), %rax } Inlined // to perform x + 2*y movl %eax, ans(%rip) twiddle1: int twiddle2b(long* xp, long* yp) int twiddle2b(long* xp, long* yp) leaq (%rdi,%rsi,2), %rax { movq %rsp, %rsi { gcc -O1 … ret *xp += 2 * (*yp); Not leaq 8(%rsp), %rdi *xp += 2 * (*yp); return *xp; call twiddle2a return *xp; Inlined // But here it left it as two } movq 8(%rsp), %rdx } // separate adds twiddle2a: int ans = 0; movq (%rsp), %rcx int ans = 0; movq (%rsi), %rax void f1(long x, long y) leaq (%rdx,%rcx,2), %rdx void f1(long x, long y) addq (%rdi), %rax { addl ans(%rip), %eax { movq %rax, (%rdi) ans = twiddle1(x,y); addl %edx, %eax ans = twiddle1(x,y); Inlined addq (%rsi), %rax ans += twiddle2a(&x,&y); movl %eax, ans(%rip) ans += twiddle2a(&x,&y); movq %rax, (%rdi) ans += twiddle2b(&x,&y); addq $16, %rsp ans += twiddle2b(&x,&y); ret } ret } 13.15 13.16 Overview • We have seen our processors have great capability to perform many operations in parallel • How can we write our code in such a way as to take advantage of those capabilities? • Are there limits on how much performance we can achieve and how would we know if we are hitting those limits? MAXIMIZING PERFORMANCE • Let's first understand our hardware capabilities