Didem Unat, Xing Cai, Scott B. Baden
dunat@lbl.gov
Lawrence Berkeley National Laboratory, Simula Research Laboratory, University of California, San Diego
Oslo, Jun 07, 2012

HPC milestones
! Power consumption is the main design constraint
! Drastic changes in node architecture [Shalf, VecPar'10]
! More parallelism on the chip
! Software-managed memory / incoherent caches
! Already started seeing concrete instances
[Chart: HPC milestones vs. power draw: 1 Gigaflop/s (1985, 0.20 MW), 1 Teraflop/s (1996, 0.85 MW), 1 Petaflop/s (2008, 2.35 MW), 1 Exaflop/s (2018, 20.0 MW)]

! Graphics Processing Units (GPUs)
  › Massively parallel single-chip processor
  › Low-power cores: trade off single-thread performance
  › Large register file and software-managed memory
! Effective in accelerating certain data-parallel applications
  › Case study: Cardiac Electrophysiology [Unat, PARA'10]
  › Not ideal for others: sorting [Lee, ISCA'10]
! Domain-specific optimizations
! Limits the adoption in scientific computing
We need programming models to master the new technology and make it accessible to computational scientists.

! Important class of applications (one of the motifs)
! Basis for approximating derivatives numerically
! Physical simulations (e.g. turbulent flow, seismic wave propagation)
! Multimedia applications (e.g. image smoothing)
! Nearest neighbor update on a structured grid
! 3D Heat Eqn using fully explicit finite differencing:
  U'(x,y,z) = c0*U(x,y,z) + c1*(U(x,y,z-1) + U(x,y,z+1) + U(x,y-1,z) + U(x,y+1,z) + U(x-1,y,z) + U(x+1,y,z))
! Highly data parallel, memory bandwidth bound
! GPU speedups over multicore (8 cores)
  › 5X for Lattice Boltzmann [Lee, ISCA'11]
  › 4X for Reverse Time Migration [Kruger, SC'11]

! Heterogeneity in compute resources
! Explicit management of data transfer
  › Separate device memory from the host memory
! Reengineering of scientific applications
  › Algorithmic changes to match the hardware capabilities
  › Best performance requires non-trivial knowledge of the architecture
[Figure: host (Main Memory, cores, L2) and accelerator (Device Memory, on-chip memory, vector cores) connected by a bus]

! Explicitly managed memory
  › On-chip memory resources
  › Private and incoherent
  › e.g. __shared__ float A[N];
! Hierarchical thread management
  › Thread, thread groups, thread subgroups
  › Granularity of a thread
[Figure: accelerator memory hierarchy: Device Memory, Shared Memory/L1 cache, Register File]
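To make the memory model concrete, here is a minimal CUDA sketch (not taken from the slides): the host explicitly allocates device memory and copies data across the bus, and each thread block stages its elements into software-managed __shared__ memory before applying a 1D nearest-neighbor (3-point) update. The names blur1d, N, and BLOCK are illustrative, and error checking is omitted.

    #include <cuda_runtime.h>
    #include <stdio.h>

    #define N     1024   /* interior points */
    #define BLOCK 256    /* threads per block */

    /* 1D 3-point stencil: each block stages its points plus a halo cell on
       either side into on-chip __shared__ memory, then averages. */
    __global__ void blur1d(const float *in, float *out)
    {
        __shared__ float s[BLOCK + 2];            /* private to the block, incoherent across blocks */
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        int lid = threadIdx.x + 1;                /* shift by one for the left halo slot */

        s[lid] = in[gid + 1];                                         /* own interior point */
        if (threadIdx.x == 0)         s[0]         = in[gid];         /* left halo */
        if (threadIdx.x == BLOCK - 1) s[BLOCK + 1] = in[gid + 2];     /* right halo */
        __syncthreads();                          /* block-level barrier before reading neighbors */

        out[gid + 1] = (s[lid - 1] + s[lid] + s[lid + 1]) / 3.0f;
    }

    int main(void)
    {
        float h_in[N + 2], h_out[N + 2];
        for (int i = 0; i < N + 2; i++) h_in[i] = (float)i;

        /* Device memory is separate from host memory: allocation and
           transfers are explicit. */
        float *d_in, *d_out;
        cudaMalloc(&d_in,  (N + 2) * sizeof(float));
        cudaMalloc(&d_out, (N + 2) * sizeof(float));
        cudaMemcpy(d_in, h_in, (N + 2) * sizeof(float), cudaMemcpyHostToDevice);

        blur1d<<<N / BLOCK, BLOCK>>>(d_in, d_out);    /* grid of thread blocks */

        cudaMemcpy(h_out, d_out, (N + 2) * sizeof(float), cudaMemcpyDeviceToHost);
        printf("out[1] = %f\n", h_out[1]);
        cudaFree(d_in); cudaFree(d_out);
        return 0;
    }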
! Aims at programmer productivity and high performance
! Simplifies application development
! Based on a modest number of compiler directives
  › Incremental parallelization
! Abstracts away the programmer's view of the hardware
[Figure: target applications: Seismic Modeling, Cardiac Simulation, Turbulent Flow]

! Source-to-source translator for Nvidia GPUs (Mint: C + directives to CUDA)
  › Parallelizes loop nests (#pragma mint for)
  › Relieves the programmer of a variety of tedious tasks
! Motif-specific auto-optimizer
  › Targets stencil methods
  › Incorporates semantic knowledge into compiler analysis
  › Performs data locality optimizations via on-chip memory
  › Compiler flags for performance tuning

[Figure: Mint execution model: the host thread runs serial code and host regions; each data-parallel for loop inside an accelerated region runs as a kernel over thread blocks in device memory, with explicit data transfer and host-device synchronization]

! #pragma mint parallel
  › Indicates the accelerated region
! #pragma mint for
  › Marks the enclosed loop nest for acceleration
  › 3 additional clauses for optimizations
! #pragma mint copy
  › Expresses data transfers between the host and the device
! #pragma mint single
  › Handles serial sections
! #pragma mint barrier
  › Synchronizes host and device threads
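For orientation, the sketch below shows roughly the kind of CUDA kernel a source-to-source translation of a data-parallel stencil loop amounts to: each loop iteration becomes one device thread, indexed by its block and thread IDs, and the 3D array is flattened for the device. This is an illustrative hand-written kernel, not Mint's actual generated code; heat7pt and IDX are names chosen here.

    /* Flattened indexing into the (k+2) x (m+2) x (n+2) padded grid. */
    #define IDX(z, y, x) (((z) * (m + 2) + (y)) * (n + 2) + (x))

    /* One thread per interior grid point (x, y, z); the +1 offsets and the
       bounds check skip the boundary layers. */
    __global__ void heat7pt(const double *U, double *Unew,
                            int n, int m, int k, double c0, double c1)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x + 1;
        int y = blockIdx.y * blockDim.y + threadIdx.y + 1;
        int z = blockIdx.z * blockDim.z + threadIdx.z + 1;

        if (x <= n && y <= m && z <= k)
            Unew[IDX(z, y, x)] = c0 * U[IDX(z, y, x)] +
                c1 * (U[IDX(z, y, x - 1)] + U[IDX(z, y, x + 1)] +
                      U[IDX(z, y - 1, x)] + U[IDX(z, y + 1, x)] +
                      U[IDX(z - 1, y, x)] + U[IDX(z + 1, y, x)]);
    }

    /* Example launch from the host (sizes are illustrative):
         dim3 block(16, 16, 4);
         dim3 grid((n + 15) / 16, (m + 15) / 16, (k + 3) / 4);
         heat7pt<<<grid, block>>>(d_U, d_Unew, n, m, k, c0, c1);
       followed by swapping d_U and d_Unew between iterations. */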
Mint Program for the 3D Heat Eqn.

    #pragma mint copy(U, toDevice, (n+2),(m+2),(k+2))
    #pragma mint copy(Unew, toDevice, (n+2),(m+2),(k+2))
    #pragma mint parallel
    {
      while( t++ < T ){
        #pragma mint for nest(all) tile(16,16,64) chunksize(1,1,64)
        for (int z=1; z<=k; z++)
          for (int y=1; y<=m; y++)
            for (int x=1; x<=n; x++)
              Unew[z][y][x] = c0 * U[z][y][x] +
                              c1 * (U[z][y][x-1] + U[z][y][x+1] +
                                    U[z][y-1][x] + U[z][y+1][x] +
                                    U[z-1][y][x] + U[z+1][y][x]);
        double*** tmp;
        tmp = U; U = Unew; Unew = tmp;
      } //end of while
    } //end of parallel region
    #pragma mint copy(U, fromDevice, (n+2),(m+2),(k+2))

Highlighted in the slides: the accelerated region (#pragma mint parallel), the data-parallel for loop and the depth of loop parallelism (#pragma mint for nest(all)), the host-device data transfers (#pragma mint copy with toDevice/fromDevice), and the performance tuning parameters (the tile and chunksize clauses).

! Performance tuning parameters
! High-level interface to low-level hardware-specific optimizations
1. For-loop clauses
  › handle data decomposition and thread management
  › nest(), tile(), chunksize()
2. Compiler flags for data locality
  › Register: -register
  › Software-managed memory: -shared
  › Cache: -preferL1
[Figure: on-chip resources targeted by the flags: Device Memory, Shared Memory/L1 cache, Register File]
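As a rough illustration of what the tile and chunksize clauses control (this is one reading of the clauses, not Mint's documented code generation): tile(16,16,64) assigns a 16 x 16 x 64 tile of the iteration space to each thread block, and chunksize(1,1,64) has each thread compute a 1 x 1 x 64 column of points, so each block runs 16 x 16 x 1 threads and each thread loops over up to 64 z-values. The sketch reuses the IDX macro and naming from the earlier kernel; heat7pt_chunked is a name chosen here.

    #define IDX(z, y, x) (((z) * (m + 2) + (y)) * (n + 2) + (x))

    /* Each thread covers a 1 x 1 x 64 chunk of the 16 x 16 x 64 tile owned
       by its thread block, i.e. it walks up to 64 consecutive points in z. */
    __global__ void heat7pt_chunked(const double *U, double *Unew,
                                    int n, int m, int k, double c0, double c1)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x + 1;
        int y = blockIdx.y * blockDim.y + threadIdx.y + 1;
        if (x > n || y > m) return;

        int z0   = blockIdx.z * 64 + 1;
        int zEnd = min(z0 + 63, k);
        for (int z = z0; z <= zEnd; z++)
            Unew[IDX(z, y, x)] = c0 * U[IDX(z, y, x)] +
                c1 * (U[IDX(z, y, x - 1)] + U[IDX(z, y, x + 1)] +
                      U[IDX(z, y - 1, x)] + U[IDX(z, y + 1, x)] +
                      U[IDX(z - 1, y, x)] + U[IDX(z + 1, y, x)]);
    }

    /* Launch configuration implied by tile(16,16,64) and chunksize(1,1,64):
       threads per block = tile / chunksize = (16, 16, 1).
         dim3 threads(16, 16, 1);
         dim3 blocks((n + 15) / 16, (m + 15) / 16, (k + 63) / 64);
         heat7pt_chunked<<<blocks, threads>>>(d_U, d_Unew, n, m, k, c0, c1); */

In a chunked kernel like this, values along z can be reused from registers or shared memory between iterations, which is presumably what the -register and -shared flags exploit.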