gradient descent the ultimate optimizer erik meijer
play

Gradient Descent: The Ultimate Optimizer Erik Meijer @headinthebox - PowerPoint PPT Presentation

Gradient Descent: The Ultimate Optimizer Erik Meijer @headinthebox Copenhagen Denmark We all want to write cool apps like this ... Software 1.0 Augustin-Louis Cauchy 1789 -1857 What if we feed the examples/tests to a mathematician


  1. Gradient Descent: The Ultimate Optimizer Erik Meijer @headinthebox Copenhagen Denmark

  2. We all want to write cool apps like this ...

  3. Software 1.0

  4. Augustin-Louis Cauchy 1789 -1857

  5. What if we feed the examples/tests to a mathematician or machine and let it deduce the code for us?

  6. Physicists and Mathematicians have been doing curve fitting and function approximation for centuries. Just saying.

  7. Galileo Galilei (1564–1642), Joseph Fourier (1768–1830), Henri Padé (1863–1953)

  8. “Everything interesting in CS has already been invented by mathematicians at least 100 years ago.” @headinthebox

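  9. $\mathrm{Fourier}(x) = a_0 + \sum_i a_i \cos\left(\frac{i x \pi}{L}\right) + \sum_i b_i \sin\left(\frac{i x \pi}{L}\right)$; $\mathrm{Pade}_{N,M}(x) = \dfrac{\sum_{i \in 0 \dots N} a_i x^i}{1 + \sum_{i \in 1 \dots M} b_i x^i}$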

  10. We’ll jump on the latest Computer Science bandwagon; Deep Learning, using Artificial Neural Networks!!!!!!!!!!!!!!!!!!

  11. George Cybenko , 1989

  12. Activation function; Linear algebra/map-reduce; weights/parameters

  13. One input, one weight, identity

  14. strong assumption

  15. var a: ℝ = … val η: ℝ = … some tiny value of your choosing … fun model(x: ℝ): ℝ = a*x [slide annotation: syntax cheat] fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ, ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ, ℝ>>) { samples.foreach { (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x² - 2*x*y a -= η*de/da } [slide annotation: syntax cheat] }

  16. var a: ℝ = … val η: ℝ = … some tiny value of your choosing … fun model(x: ℝ): ℝ = a*x fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ, ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ, ℝ>>) { samples.foreach { (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x² - 2*x*y a -= η*de/da } }

  17. Differentiable Programming, I told you so!

  18. $f(x) = 3x^2 + 4$, $f'(x) = 6x$

  19. $f(a + b\varepsilon) = f(a) + f'(a)\,b\,\varepsilon$ ⇦ Read that again!

  20. High school math review: Sum Rule The derivative of (u+v) with respect to x

  21. High school math review: Product Rule The derivative of (u*v*w) with respect to x

  22. High school math review: Chain Rule The derivative of (f ∘ g) with respect to x

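  23. Sum Rule: $\left(a + \frac{da}{dx}\varepsilon\right) + \left(c + \frac{dc}{dx}\varepsilon\right)$ = { dual number } $(a + c) + \left(\frac{da}{dx} + \frac{dc}{dx}\right)\varepsilon$ = { sum rule } $(a + c) + \frac{d(a+c)}{dx}\varepsilon$. Product Rule: $\left(a + \frac{da}{dx}\varepsilon\right) \cdot \left(b + \frac{db}{dx}\varepsilon\right)$ = { dual number } $(a \cdot b) + \left(a \cdot \frac{db}{dx} + \frac{da}{dx} \cdot b\right)\varepsilon$ = { product rule } $(a \cdot b) + \frac{d(a \cdot b)}{dx}\varepsilon$. Chain Rule: $f\left(a + \frac{da}{dx}\varepsilon\right)$ = { dual number } $f(a) + \frac{df(a)}{da}\,\frac{da}{dx}\,\varepsilon$ = { chain rule } $f(a) + \frac{df(a)}{dx}\varepsilon$. Your high school education was a total waste of time!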

  24. class 𝔼(val r: ℝ, val ε: ℝ = 1.0) fun sin(x: 𝔼): 𝔼 = 𝔼(r = sin(x.r), ε = cos(x.r)*x.ε) fun cos(x: 𝔼): 𝔼 = 𝔼(r = cos(x.r), ε = -sin(x.r)*x.ε) operator fun 𝔼.times(that: 𝔼): 𝔼 = 𝔼(r = this.r*that.r, ε = this.ε*that.r + this.r*that.ε) operator fun 𝔼.plus(that: 𝔼): 𝔼 = 𝔼(r = this.r+that.r, ε = this.ε+that.ε) operator fun 𝔼.minus(that: 𝔼): 𝔼 = 𝔼(r = this.r-that.r, ε = this.ε-that.ε) operator fun 𝔼.unaryMinus(): 𝔼 = 𝔼(r = -this.r, ε = -this.ε) operator fun 𝔼.div(that: 𝔼): 𝔼 = (this.r/that.r).let { 𝔼(r = it, ε = (this.ε/that.r - it*that.ε/that.r)) }

  25. var a: ℝ = … val η: ℝ = … fun model(x: ℝ): ℝ = a*x fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)² fun train(n: Int, samples: Sequence<Pair<ℝ, ℝ>>) { repeat(n) { epoch(samples) } } fun epoch(samples: Sequence<Pair<ℝ, ℝ>>) { samples.foreach { (x,y) ➝ val e = loss(y, model(x)) val de/da = 2*a*x² - 2*x*y a -= η*de/da } }

  26. var a: 𝔼 = 𝔼(…) val η: ℝ = … def model(x: ℝ): 𝔼 = a*x def loss(y: ℝ, ŷ: 𝔼): 𝔼 = (y-ŷ)² def epoch(samples: List[(ℝ, ℝ)]) { samples.foreach { case (x,y) ➝ { val e = loss(y, model(x)) val de/da: ℝ = e.ε a -= η*(de/da) } } }

  27. val x = ⅅ (3.0) val y = ⅅ (5.0) val z = x*y // dz/dx + dz/dy // 𝔼 (r=15.0, ε=8.0)

  28. operator fun ℝ.times(that: List<ℝ>) = that.map { this*it } operator fun List<ℝ>.times(that: ℝ) = this.map { it*that } operator fun List<ℝ>.unaryMinus() = this.map { -it } operator fun List<ℝ>.plus(that: List<ℝ>) = this.zip(that) { x,y ➝ x+y } operator fun List<ℝ>.minus(that: List<ℝ>) = this.zip(that) { x,y ➝ x-y } operator fun List<ℝ>.div(that: ℝ) = this.map { it/that } class 𝔼(val r: ℝ, val ε: List<ℝ>) [slide annotation: That’s all that needs to change] fun sin(x: 𝔼): 𝔼 = 𝔼(r = sin(x.r), ε = cos(x.r)*x.ε) operator fun 𝔼.times(that: 𝔼): 𝔼 = 𝔼(r = this.r*that.r, ε = this.ε*that.r + this.r*that.ε)

  29. Mathematically, by changing numbers to lists, we upgraded from dual numbers to synthetic differential geometry and deep category theory

  30. val x = ⅅ (3.0, 0. th ) val y = ⅅ (5.0, 1. th ) val z = x*y // [∂z/∂x, ∂z/∂y] // 𝔼 (r=15.0, ε=[5.0, 3.0])

  31. var κ: ⅅ = ⅅ(1e-20, 0.th) var η: ⅅ = ⅅ(1e-20, 1.th) var a = ⅅ(Math.random(), 2.th) val 𝛿 = 1e-80 fun model(x: ℝ): ⅅ = a*x fun loss(y: ℝ, ŷ: ⅅ): ⅅ = (y-ŷ)² fun epoch(samples: List<Pair<ℝ, ℝ>>) { lateinit var e: ⅅ samples.forEach { (x,y) ➝ val (∂e/∂κ, ∂e/∂η, ∂e/∂a) = loss(y, model(x)) κ -= 𝛿 * ∂e/∂κ η -= κ * ∂e/∂η a -= η * ∂e/∂a } }

  32. η Error a κ

  33. Wait what, can’t you do any non-toy examples? Yes we can!

  34. Lower is better Choosing the correct hyper parameter is essential

  35. How do we pick the meta-step size?

  36. Stack a small number of hyper-...-hyper parameters layers and pick a tiny number for the last fixed one.

  37. https://arxiv.org/pdf/1909.13371.pdf

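  38. fun id(x: ⅅ) = ⅅ(r = x.r, ε = 1.0*x.ε) var x = ⅅ(0.0, n.th); repeat(n) { x = id(x) }; x.ε computes $\frac{\partial\,\mathrm{id}(x_n)}{\partial x_n} \cdot \left(\dots \cdot \left(\frac{\partial\,\mathrm{id}(x_n)}{\partial x_n} \cdot \left[\dots, \frac{\partial x_n}{\partial x_n}\right]\right)\dots\right)$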

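  39. Thinking Fast not Slow. $(\dots([\,] \mathbin{+\!\!+} x_1) \mathbin{+\!\!+} \dots) \mathbin{+\!\!+} x_n$ is slow, $O(n^2)$ !%#@&? $x_1 \mathbin{+\!\!+} (\dots \mathbin{+\!\!+} (x_n \mathbin{+\!\!+} [\,])\dots)$ is fast, $O(n)$. Represent lists by functions: $x_1 \mathbin{+\!\!+} (\dots \mathbin{+\!\!+} (x_n \mathbin{+\!\!+} [\,])\dots) = ((x_1 \mathbin{+\!\!+}) \circ \dots \circ (x_n \mathbin{+\!\!+}))\ [\,]$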

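  40. Thinking Fast not Slow. $(\dots([\,] \mathbin{+\!\!+} x_1) \mathbin{+\!\!+} \dots) \mathbin{+\!\!+} x_n$ is slow, $O(n^2)$; $x_1 \mathbin{+\!\!+} (\dots \mathbin{+\!\!+} (x_n \mathbin{+\!\!+} [\,])\dots)$ is fast, $O(n)$. $\frac{\partial\,\mathrm{id}(x_n)}{\partial x_n} \cdot \left(\dots \cdot \left(\frac{\partial\,\mathrm{id}(x_n)}{\partial x_n} \cdot \left[\dots, \frac{\partial x_n}{\partial x_n}\right]\right)\dots\right)$ is slow, $O(n^2)$; $\left(\dots\left(\frac{\partial\,\mathrm{id}(x_n)}{\partial x_n} \cdot \frac{\partial\,\mathrm{id}(x_n)}{\partial x_n}\right) \cdot \dots\right) \cdot \left[\dots, \frac{\partial x_n}{\partial x_n}\right]$ is fast, $O(n)$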

  41. Chain Rule: 〚f’(x.r)*x.ε〛(c) = { 〚a〛(b) = b*a } c*(f’(x.r)*x.ε) = { associativity } (c*f’(x.r))*x.ε = { 〚a〛(b) = b*a } 〚x.ε〛(c*f’(x.r)) = { 〚x.ε〛 = x.ɞ } x.ɞ(c*f’(x.r)) = { abstraction } { x.ɞ(it*f’(x.r)) }(c). Product Rule: 〚this.ε*that.r + this.r*that.ε〛(c) = { commutativity } 〚that.r*this.ε + this.r*that.ε〛(c) = { 〚a〛(b) = b*a } c*(that.r*this.ε + this.r*that.ε) = { distributivity } c*(that.r*this.ε) + c*(this.r*that.ε) = { associativity } (c*that.r)*this.ε + (c*this.r)*that.ε = { definition of 〚〛 } 〚this.ε〛(c*that.r) + 〚that.ε〛(c*this.r) = { 〚x.ε〛 = x.ɞ } this.ɞ(c*that.r) + that.ɞ(c*this.r) = { abstraction } { this.ɞ(it*that.r) + that.ɞ(it*this.r) }(c)

  42. class ⅅ ( val r : ℝ , val ɞ : ( ℝ )-> ℝ = {it} ) /* df(a)/dx = (df(a)/da)*(da/dx) */ fun sin(x: ⅅ ): ⅅ = ⅅ (r = sin (x. r ), ɞ = { x. ɞ ( it * cos (x. r )) } ) /* d(a*b)/dx = (da/dx)*b + a*(db/dx) */ operator fun ⅅ .times(that: ⅅ ): ⅅ = ⅅ (r = this . r * that. r , ɞ = { this . ɞ ( it *that. r ) +that. ɞ ( it * this . r ) } )

  43. repeat(n) { x = x*x }; x. ɞ (1.0)

  44. class 𝔼(val r: ℝ, var ε: ℝ = 0.0, var n: Int = 0, val ɞ: (ℝ) ➝ ℝ = { it }) { fun ɞ(d: ℝ): ℝ { ε += d if (--n == 0) { return ɞ.invoke(ε) } else { return 0.0 } } } fun ⅅ.backward(d: ℝ = 1.0) { this.n++; this.ɞ(d) } fun sin(x: 𝔼): 𝔼 = 𝔼(r = sin(x.r), ɞ = { x.ɞ(it*cos(x.r)) }).also { x.n++ } operator fun 𝔼.times(that: 𝔼): 𝔼 = 𝔼(r = this.r*that.r, ɞ = { this.ɞ(it*that.r); that.ɞ(it*this.r) }).also { this.n++; that.n++; }

Recommend


More recommend