Gradient Descent: The Ultimate Optimizer
Erik Meijer (@headinthebox), Copenhagen, Denmark
We all want to write cool apps like this ...
Software 1.0
Augustin-Louis Cauchy (1789-1857)
What if we feed the examples/tests to a mathematician or machine and let it deduce the code for us?
Physicists and Mathematicians have been doing curve fitting and function approximation for centuries. Just saying.
Galileo Galilei (1564-1642), Joseph Fourier (1768-1830), Henri Padé (1863-1953)
“Everything interesting in CS has already been invented by mathematicians at least 100 years ago.” @headinthebox
Fourier(x) = a₀ + (∑ᵢ aᵢ cos(ixπ/L)) + (∑ᵢ bᵢ sin(ixπ/L))
Padé_N,M(x) = (∑_{i∈0…N} aᵢ xⁱ) / (1 + ∑_{i∈1…M} bᵢ xⁱ)
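To make the Fourier formula concrete, here is a minimal Kotlin sketch of evaluating a truncated Fourier series; the coefficient lists a and b, the period parameter L, and all names are mine, not the talk's:

    import kotlin.math.PI
    import kotlin.math.cos
    import kotlin.math.sin

    // Evaluate a0 + Σ aᵢ·cos(i·x·π/L) + Σ bᵢ·sin(i·x·π/L) for i = 1..a.size
    fun fourier(a0: Double, a: List<Double>, b: List<Double>, L: Double, x: Double): Double =
        a0 + a.mapIndexed { i, ai -> ai * cos((i + 1) * x * PI / L) }.sum() +
             b.mapIndexed { i, bi -> bi * sin((i + 1) * x * PI / L) }.sum()

    fun main() {
        // First term of the classic square-wave series: (4/π)·sin(πx)
        println(fourier(0.0, listOf(0.0), listOf(4 / PI), 1.0, 0.5))   // ≈ 1.273
    }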
We’ll jump on the latest Computer Science bandwagon: Deep Learning, using Artificial Neural Networks!
George Cybenko, 1989 (the universal approximation theorem)
[Diagram of a single neuron: inputs and weights/parameters combined by linear algebra (a map-reduce), then fed through an activation function]
The simplest possible model: one input, one weight, identity activation function (a strong assumption).
var a: ℝ = …
val η: ℝ = …   // some tiny value of your choosing

fun model(x: ℝ): ℝ = a*x
fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)²   // syntax cheat: ² is not legal Kotlin

fun train(n: Int, samples: Sequence<Pair<ℝ,ℝ>>) {
    repeat(n) { epoch(samples) }
}

fun epoch(samples: Sequence<Pair<ℝ,ℝ>>) {
    samples.forEach { (x,y) ➝
        val e = loss(y, model(x))
        val de/da = 2*a*x² - 2*x*y   // syntax cheat: de/da names the hand-derived derivative of e w.r.t. a
        a -= η*de/da
    }
}
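For readers who want to run the loop as-is, here is a plain-Kotlin rendering of the same sketch; the ASCII names and the made-up y = 2x sample data are mine:

    var a = 0.0              // the single weight
    const val eta = 0.01     // learning rate η

    fun model(x: Double) = a * x

    fun epoch(samples: List<Pair<Double, Double>>) {
        samples.forEach { (x, y) ->
            val dEdA = 2 * a * x * x - 2 * x * y   // hand-derived d(y - a·x)²/da
            a -= eta * dEdA
        }
    }

    fun main() {
        val samples = (1..5).map { it.toDouble() to 2.0 * it }   // points on y = 2x
        repeat(100) { epoch(samples) }
        println(a)   // converges to ≈ 2.0
    }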
Differentiable Programming, I told you so!
f(x) = 3x² + 4    f′(x) = 6x
f(a + bε) = f(a) + f′(a)bε   ⇦ Read that again!
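A quick worked check with the f from the previous slide: since ε² = 0,

f(a + bε) = 3(a + bε)² + 4 = (3a² + 4) + 6ab·ε + 3b²ε² = (3a² + 4) + 6ab·ε = f(a) + f′(a)bε

so evaluating f on a dual number computes the function value and the derivative in a single pass.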
High school math review: Sum Rule: the derivative of (u+v) with respect to x is du/dx + dv/dx
High school math review: Product Rule: the derivative of (u*v*w) with respect to x is (du/dx)*v*w + u*(dv/dx)*w + u*v*(dw/dx)
High school math review: Chain Rule: the derivative of (f ∘ g) with respect to x is f′(g(x)) * dg/dx
Sum Rule
(a+(da/dx)ε) + (c+(dc/dx)ε)
= { dual number }
(a+c) + (da/dx + dc/dx)ε
= { sum rule }
(a+c) + (d(a+c)/dx)ε

Product Rule
(a+(da/dx)ε) * (b+(db/dx)ε)
= { dual number, ε² = 0 }
(a*b) + (a*(db/dx) + (da/dx)*b)ε
= { product rule }
(a*b) + (d(a*b)/dx)ε

Chain Rule
f(a+(da/dx)ε)
= { dual number }
f(a) + (df(a)/da)(da/dx)ε
= { chain rule }
f(a) + (df(a)/dx)ε

Your high school education was a total waste of time!
class 𝔼(val r: ℝ, val ε: ℝ = 1.0)

fun sin(x: 𝔼): 𝔼 = 𝔼(r = sin(x.r), ε = cos(x.r)*x.ε)
fun cos(x: 𝔼): 𝔼 = 𝔼(r = cos(x.r), ε = -sin(x.r)*x.ε)

operator fun 𝔼.times(that: 𝔼): 𝔼 = 𝔼(r = this.r*that.r, ε = this.ε*that.r + this.r*that.ε)
operator fun 𝔼.plus(that: 𝔼): 𝔼 = 𝔼(r = this.r+that.r, ε = this.ε+that.ε)
operator fun 𝔼.minus(that: 𝔼): 𝔼 = 𝔼(r = this.r-that.r, ε = this.ε-that.ε)
operator fun 𝔼.unaryMinus(): 𝔼 = 𝔼(r = -this.r, ε = -this.ε)
operator fun 𝔼.div(that: 𝔼): 𝔼 = (this.r/that.r).let { 𝔼(r = it, ε = this.ε/that.r - it*that.ε/that.r) }
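The same idea compiles unchanged in plain Kotlin once ASCII names are substituted; a minimal sketch where Dual stands in for 𝔼 and Double for ℝ:

    data class Dual(val r: Double, val eps: Double = 0.0) {
        operator fun plus(that: Dual) = Dual(r + that.r, eps + that.eps)
        operator fun minus(that: Dual) = Dual(r - that.r, eps - that.eps)
        operator fun times(that: Dual) = Dual(r * that.r, eps * that.r + r * that.eps)
    }

    fun main() {
        // f(x) = 3x² + 4 at x = 5, seeded with eps = 1.0 to request df/dx
        val x = Dual(5.0, 1.0)
        val f = Dual(3.0) * x * x + Dual(4.0)
        println(f)   // Dual(r=79.0, eps=30.0): f(5) = 79, f′(5) = 6·5 = 30
    }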
var a: ℝ = …
val η: ℝ = …

fun model(x: ℝ): ℝ = a*x
fun loss(y: ℝ, ŷ: ℝ): ℝ = (y-ŷ)²

fun train(n: Int, samples: Sequence<Pair<ℝ,ℝ>>) {
    repeat(n) { epoch(samples) }
}

fun epoch(samples: Sequence<Pair<ℝ,ℝ>>) {
    samples.forEach { (x,y) ➝
        val e = loss(y, model(x))
        val de/da = 2*a*x² - 2*x*y   // the derivative is still computed by hand
        a -= η*de/da
    }
}
var a: 𝔼 = 𝔼(…)
val η: ℝ = …

fun model(x: ℝ): 𝔼 = a*x
fun loss(y: ℝ, ŷ: 𝔼): 𝔼 = (y-ŷ)²

fun epoch(samples: List<Pair<ℝ,ℝ>>) {
    samples.forEach { (x,y) ➝
        val e = loss(y, model(x))
        val de/da: ℝ = e.ε   // the derivative now falls out of the dual number
        a -= η*(de/da)
    }
}
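A runnable plain-Kotlin rendering of this version, repeating the Dual class from the earlier sketch so the block stands alone; the y = 2x data is again made up:

    data class Dual(val r: Double, val eps: Double = 0.0) {
        operator fun minus(that: Dual) = Dual(r - that.r, eps - that.eps)
        operator fun times(that: Dual) = Dual(r * that.r, eps * that.r + r * that.eps)
    }

    var a = Dual(0.5, 1.0)   // seed eps = 1 so every loss carries de/da
    const val eta = 0.01

    fun model(x: Double) = a * Dual(x)   // Dual(x) has eps = 0: constants carry no derivative
    fun loss(y: Double, yHat: Dual): Dual { val d = Dual(y) - yHat; return d * d }

    fun epoch(samples: List<Pair<Double, Double>>) {
        samples.forEach { (x, y) ->
            val e = loss(y, model(x))
            a = Dual(a.r - eta * e.eps, 1.0)   // e.eps == de/da; re-seed after the update
        }
    }

    fun main() {
        val samples = (1..5).map { it.toDouble() to 2.0 * it }   // y = 2x
        repeat(100) { epoch(samples) }
        println(a.r)   // ≈ 2.0, with no hand-derived gradient anywhere
    }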
val x = 𝔼(3.0)
val y = 𝔼(5.0)
val z = x*y
// dz/dx + dz/dy: with one scalar ε both partials collapse into a single number
// 𝔼(r=15.0, ε=8.0)
operator fun ℝ.times(that: List<ℝ>) = that.map { this * it }
operator fun List<ℝ>.times(that: ℝ) = this.map { it * that }
operator fun List<ℝ>.unaryMinus() = this.map { -it }
operator fun List<ℝ>.plus(that: List<ℝ>) = this.zip(that) { x,y ➝ x+y }
operator fun List<ℝ>.minus(that: List<ℝ>) = this.zip(that) { x,y ➝ x-y }
operator fun List<ℝ>.div(that: ℝ) = this.map { it / that }

That’s all that needs to change:

class 𝔼(val r: ℝ, val ε: List<ℝ>)

fun sin(x: 𝔼): 𝔼 = 𝔼(r = sin(x.r), ε = cos(x.r)*x.ε)
operator fun 𝔼.times(that: 𝔼): 𝔼 = 𝔼(r = this.r*that.r, ε = this.ε*that.r + this.r*that.ε)
Mathematically, by changing numbers to lists, we upgraded from dual numbers to synthetic differential geometry and deep category theory
val x = 𝔼(3.0, 0.th)   // 0.th: the one-hot basis vector [1.0, 0.0]
val y = 𝔼(5.0, 1.th)
val z = x*y   // ε = [∂z/∂x, ∂z/∂y]
// 𝔼(r=15.0, ε=[5.0, 3.0])
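A minimal runnable counterpart in plain Kotlin; DualV and the explicit one-hot lists stand in for the slide's 𝔼 and 0.th helper, which this sketch assumes rather than reproduces:

    data class DualV(val r: Double, val eps: List<Double>)

    operator fun DualV.times(that: DualV) =
        DualV(r * that.r, eps.zip(that.eps) { a, b -> a * that.r + r * b })

    fun main() {
        val x = DualV(3.0, listOf(1.0, 0.0))   // seed ∂/∂x
        val y = DualV(5.0, listOf(0.0, 1.0))   // seed ∂/∂y
        println(x * y)   // DualV(r=15.0, eps=[5.0, 3.0]) = [∂z/∂x, ∂z/∂y]
    }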
var κ: 𝔼 = 𝔼(1e-20, 0.th)
var η: 𝔼 = 𝔼(1e-20, 1.th)
var a = 𝔼(Math.random(), 2.th)
val 𝛿 = 1e-80   // the one fixed, tiny meta-meta step size

fun model(x: ℝ): 𝔼 = a*x
fun loss(y: ℝ, ŷ: 𝔼): 𝔼 = (y-ŷ)²

fun epoch(samples: List<Pair<ℝ,ℝ>>) {
    samples.forEach { (x,y) ➝
        val (∂e/∂κ, ∂e/∂η, ∂e/∂a) = loss(y, model(x))   // syntax cheat: the three components of ε
        κ -= 𝛿 * ∂e/∂κ   // 𝛿 tunes the hyper-learning rate κ
        η -= κ * ∂e/∂η   // κ tunes the learning rate η
        a -= η * ∂e/∂a   // η tunes the parameter a
    }
}
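As a rough runnable illustration of the stacked updates (my sketch, not the talk's code: DualV is extended with plus/minus, the one-hot seeds are written out by hand, and the constants are chosen conservatively small so the toy y = 2x fit stays stable):

    data class DualV(val r: Double, val eps: List<Double>)

    operator fun DualV.minus(that: DualV) = DualV(r - that.r, eps.zip(that.eps) { a, b -> a - b })
    operator fun DualV.times(that: DualV) =
        DualV(r * that.r, eps.zip(that.eps) { a, b -> a * that.r + r * b })

    fun const(x: Double) = DualV(x, listOf(0.0, 0.0, 0.0))
    fun oneHot(x: Double, i: Int) = DualV(x, List(3) { if (it == i) 1.0 else 0.0 })

    var kappa = oneHot(1e-8, 0)   // hyper-learning rate κ
    var eta = oneHot(1e-3, 1)     // learning rate η
    var a = oneHot(0.5, 2)        // model parameter
    const val delta = 1e-14       // fixed meta-meta step size 𝛿

    fun epoch(samples: List<Pair<Double, Double>>) {
        samples.forEach { (x, y) ->
            val d = const(y) - a * const(x)
            val e = d * d                                       // loss (y - a·x)²
            kappa = oneHot(kappa.r - delta * e.eps[0], 0)
            eta = oneHot(eta.r, 1) - kappa * const(e.eps[1])    // keeps η's dependence on κ in its eps
            a = oneHot(a.r, 2) - eta * const(e.eps[2])          // keeps a's dependence on η in its eps
        }
    }

    fun main() {
        val samples = (1..3).map { it.toDouble() to 2.0 * it }
        repeat(500) { epoch(samples) }
        println("a = ${a.r}, eta = ${eta.r}")   // a converges to ≈ 2.0 while η creeps upward
    }

The key design point: after each update the freshly assigned dual keeps a nonzero ε entry for the quantity that updated it, so the next loss carries ∂e/∂η and ∂e/∂κ as well as ∂e/∂a.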
[Surface plot: the training error as a function of the parameter a, the learning rate η, and the hyper-learning rate κ]
Wait, what? Can’t you do any non-toy examples? Yes we can!
[Chart: lower is better]
Choosing the correct hyperparameter is essential.
How do we pick the meta-step size?
Stack a small number of hyper-…-hyperparameter layers, and pick a tiny fixed number for the topmost one.
https://arxiv.org/pdf/1909.13371.pdf
fun id(x: 𝔼) = 𝔼(r = x.r, ε = 1.0*x.ε)

var x = 𝔼(0.0, n.th)
repeat(n) { x = id(x) }
x.ε

// the ε that comes out is built as n nested scalar-times-list products:
∂id(x)/∂x * (… * (∂id(x)/∂x * […, ∂x/∂x]) …)
// each multiplication walks the whole list, so the backward ε costs O(n²)
Thinking Fast, not Slow: represent lists by functions
(…(([] ++ x₁) ++ …) ++ xₙ is slow: O(n²)
x₁ ++ (… ++ (xₙ ++ []) …) is fast: O(n), = ((x₁ ++) ∘ … ∘ (xₙ ++)) []
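The same difference-list trick in Kotlin, as a minimal sketch; the DList alias and helper names are mine, not the talk's:

    // A list represented as "the function that prepends this list onto whatever follows".
    typealias DList<T> = (List<T>) -> List<T>

    fun <T> single(x: T): DList<T> = { rest -> listOf(x) + rest }

    // Appending difference lists is just function composition: O(1) per append.
    infix fun <T> DList<T>.append(that: DList<T>): DList<T> = { rest -> this(that(rest)) }

    fun <T> DList<T>.materialize(): List<T> = this(emptyList())

    fun main() {
        val xs = single(1) append single(2) append single(3)
        println(xs.materialize())   // [1, 2, 3], built in a single O(n) pass at the end
    }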
Thinking Fast, not Slow
(…(([] ++ x₁) ++ …) ++ xₙ is slow: O(n²)
x₁ ++ (… ++ (xₙ ++ []) …) is fast: O(n)
∂id(x)/∂x * (… * (∂id(x)/∂x * […, ∂x/∂x]) …) is slow: O(n²)
(…(∂id(x)/∂x * ∂id(x)/∂x) * …) * […, ∂x/∂x] is fast: O(n)
Chain Rule
〚f′(x.r)*x.ε〛(c)
= { 〚a〛(b) = b*a }
c*(f′(x.r)*x.ε)
= { associativity }
(c*f′(x.r))*x.ε
= { 〚a〛(b) = b*a }
〚x.ε〛(c*f′(x.r))
= { 〚x.ε〛 = x.ɞ }
x.ɞ(c*f′(x.r))
= { abstraction }
{ x.ɞ(it*f′(x.r)) }(c)

Product Rule
〚this.ε*that.r + this.r*that.ε〛(c)
= { commutativity }
〚that.r*this.ε + this.r*that.ε〛(c)
= { 〚a〛(b) = b*a }
c*(that.r*this.ε + this.r*that.ε)
= { distributivity }
c*(that.r*this.ε) + c*(this.r*that.ε)
= { associativity }
(c*that.r)*this.ε + (c*this.r)*that.ε
= { definition of 〚〛 }
〚this.ε〛(c*that.r) + 〚that.ε〛(c*this.r)
= { 〚x.ε〛 = x.ɞ }
this.ɞ(c*that.r) + that.ɞ(c*this.r)
= { abstraction }
{ this.ɞ(it*that.r) + that.ɞ(it*this.r) }(c)
class ⅅ(val r: ℝ, val ɞ: (ℝ) -> ℝ = { it })

/* df(a)/dx = (df(a)/da)*(da/dx) */
fun sin(x: ⅅ): ⅅ = ⅅ(r = sin(x.r), ɞ = { x.ɞ(it*cos(x.r)) })

/* d(a*b)/dx = (da/dx)*b + a*(db/dx) */
operator fun ⅅ.times(that: ⅅ): ⅅ =
    ⅅ(r = this.r*that.r, ɞ = { this.ɞ(it*that.r) + that.ɞ(it*this.r) })
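A runnable plain-Kotlin sketch of the continuation trick; RevD stands in for ⅅ and the names are mine:

    // Each value carries a backpropagator: "given d(output)/d(me), push it to my inputs".
    class RevD(val r: Double, val back: (Double) -> Double = { it })

    operator fun RevD.times(that: RevD): RevD =
        RevD(this.r * that.r, { d -> this.back(d * that.r) + that.back(d * this.r) })

    fun main() {
        val x = RevD(3.0)
        val y = RevD(5.0)
        val z = x * y
        println(z.back(1.0))   // 5.0 + 3.0 = 8.0: with scalar seeds, ∂z/∂x + ∂z/∂y come out summed
    }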
repeat(n) { x = x*x }; x.ɞ(1.0)   // x is shared by both factors, so each * calls x.ɞ twice: 2ⁿ calls in the backward pass
class ⅅ(val r: ℝ, var ε: ℝ = 0.0, var n: Int = 0, val ɞ: (ℝ) ➝ ℝ = { it }) {
    // Kotlin keeps the property ɞ and the member function ɞ in separate namespaces:
    // accumulate incoming gradients, and only fire the stored continuation
    // once every consumer has reported in.
    fun ɞ(d: ℝ): ℝ {
        ε += d
        return if (--n == 0) ɞ.invoke(ε) else 0.0
    }
}

fun ⅅ.backward(d: ℝ = 1.0) { this.n++; this.ɞ(d) }

fun sin(x: ⅅ): ⅅ = ⅅ(r = sin(x.r), ɞ = { x.ɞ(it*cos(x.r)) }).also { x.n++ }

operator fun ⅅ.times(that: ⅅ): ⅅ =
    ⅅ(r = this.r*that.r, ɞ = { this.ɞ(it*that.r); that.ɞ(it*this.r) })
        .also { this.n++; that.n++ }
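A runnable sketch of the counting scheme with ASCII names (RevD again; backprop plays the role of the member ɞ):

    class RevD(
        val r: Double,
        var eps: Double = 0.0,
        var n: Int = 0,                          // how many consumers still owe us a gradient
        val back: (Double) -> Double = { it }    // continuation toward our inputs
    ) {
        fun backprop(d: Double): Double {
            eps += d
            return if (--n == 0) back(eps) else 0.0   // fire once, with the summed gradient
        }
    }

    fun RevD.backward(d: Double = 1.0) { n++; backprop(d) }

    operator fun RevD.times(that: RevD): RevD =
        RevD(this.r * that.r, back = { d -> this.backprop(d * that.r); that.backprop(d * this.r) })
            .also { this.n++; that.n++ }

    fun main() {
        val x = RevD(2.0)
        val y = x * x        // x is shared: x.n == 2
        y.backward()         // x.backprop runs twice, but x's continuation fires only once
        println(x.eps)       // d(x·x)/dx = 2x = 4.0
    }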