[Optimizers] Added support for the AMSGrad optimizer. (#314)

eaplatanios · web-flow · commit d61d029aedc3 · 2019-06-29T21:21:28.000-04:00
diff --git a/Sources/TensorFlow/Optimizers/MomentumBased.swift b/Sources/TensorFlow/Optimizers/MomentumBased.swift
@@ -336,3 +336,93 @@ public class AdaMax<Model: Differentiable & KeyPathIterable>: Optimizer
         model.move(along: -stepSize * firstMoments ./ denominator)
     }
 }
+
+/// AMSGrad optimizer.
+///
+/// This algorithm is a modification of Adam with better convergence properties when close to local
+/// optima.
+///
+/// Reference: ["On the Convergence of Adam and Beyond"](
+/// https://openreview.net/pdf?id=ryQu7f-RZ)
+public class AMSGrad<Model: Differentiable & KeyPathIterable>: Optimizer
+    where Model.TangentVector: VectorProtocol & PointwiseMultiplicative & ElementaryFunctions,
+          Model.TangentVector.VectorSpaceScalar == Float,
+          Model.AllDifferentiableVariables: KeyPathIterable,
+          Model.AllDifferentiableVariables == Model.TangentVector {
+    public typealias Model = Model
+    /// The learning rate.
+    public var learningRate: Float
+    /// A coefficient used to calculate the first and second moments of the gradients.
+    public var beta1: Float
+    /// A coefficient used to calculate the first and second moments of the gradients.
+    public var beta2: Float
+    /// A small scalar added to the denominator to improve numerical stability.
+    public var epsilon: Float
+    /// The learning rate decay.
+    public var decay: Float
+    /// The current step.
+    public var step: Int = 0
+    /// The first moments of the weights.
+    public var firstMoments: Model.TangentVector = .zero
+    /// The second moments of the weights.
+    public var secondMoments: Model.TangentVector = .zero
+    /// The maximum of the second moments of the weights.
+    public var secondMomentsMax: Model.TangentVector = .zero
+
+    public init(
+        for model: __shared Model,
+        learningRate: Float = 1e-3,
+        beta1: Float = 0.9,
+        beta2: Float = 0.999,
+        epsilon: Float = 1e-8,
+        decay: Float = 0
+    ) {
+        precondition(learningRate >= 0, "Learning rate must be non-negative")
+        precondition(0 <= beta1 && beta1 <= 1, "Beta parameter must be between 0 and 1")
+        precondition(0 <= beta2 && beta2 <= 1, "Beta parameter must be between 0 and 1")
+        precondition(decay >= 0, "Learning rate decay must be non-negative")
+
+        self.learningRate = learningRate
+        self.beta1 = beta1
+        self.beta2 = beta2
+        self.epsilon = epsilon
+        self.decay = decay
+    }
+
+    public func update(_ model: inout Model, along direction: Model.TangentVector) {
+        update(&model.allDifferentiableVariables, along: direction)
+    }
+
+    // TODO: Deprecate this when `Differentiable.AllDifferentiableVariables` is removed.
+    public func update(
+        _ model: inout Model.AllDifferentiableVariables,
+        along direction: Model.TangentVector
+    ) {
+        self.step += 1
+        let step = Float(self.step)
+        let beta1Power = pow(beta1, step)
+        let beta2Power = pow(beta2, step)
+        let learningRate = self.learningRate * 1 / (1 + decay * step)
+        // Note: `stepSize` and `secondMoments` are split into two lines to avoid the "compiler is 
+        // unable to type-check this expression in reasonable time" error.
+        var stepSize = learningRate * sqrt(1 - pow(beta2Power, step))
+        stepSize = stepSize / (1 - pow(beta1Power, step))
+        firstMoments = firstMoments * beta1 + direction * (1 - beta1)
+        secondMoments = secondMoments * beta2
+        secondMoments += direction .* direction * (1 - beta2)
+
+        // Update `secondMomentsMax` using a key path approach because `max(_:_:)` cannot be 
+        // currently applied in a simpler manner.
+        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Float>.self) {
+            secondMomentsMax[keyPath: kp] = max(
+                secondMomentsMax[keyPath: kp], secondMoments[keyPath: kp])
+        }
+        for kp in model.recursivelyAllWritableKeyPaths(to: Tensor<Double>.self) {
+            secondMomentsMax[keyPath: kp] = max(
+                secondMomentsMax[keyPath: kp], secondMoments[keyPath: kp])
+        }
+
+        let denominator = Model.TangentVector.sqrt(secondMomentsMax) + epsilon
+        model.move(along: -stepSize * firstMoments ./ denominator)
+    }
+}
diff --git a/Tests/TensorFlowTests/SequentialTests.swift b/Tests/TensorFlowTests/SequentialTests.swift
@@ -33,6 +33,7 @@ final class SequentialTests: XCTestCase {
         let rmsprop = RMSProp(for: model, learningRate: 0.02)
         let adam = Adam(for: model, learningRate: 0.02)
         let adamax = AdaMax(for: model, learningRate: 0.02)
+        let amsgrad = AMSGrad(for: model, learningRate: 0.02)
         let adagrad = AdaGrad(for: model, learningRate: 0.02)
         let adadelta = AdaDelta(for: model, learningRate: 0.02)
         let x: Tensor<Float> = [[0, 0], [0, 1], [1, 0], [1, 1]]
@@ -51,13 +52,15 @@ final class SequentialTests: XCTestCase {
             adam.update(&model.allDifferentiableVariables, along: 𝛁model)
             adamax.update(&model, along: 𝛁model)
             adamax.update(&model.allDifferentiableVariables, along: 𝛁model)
+            amsgrad.update(&model, along: 𝛁model)
+            amsgrad.update(&model.allDifferentiableVariables, along: 𝛁model)
             adagrad.update(&model, along: 𝛁model)
             adagrad.update(&model.allDifferentiableVariables, along: 𝛁model)
             adadelta.update(&model, along: 𝛁model)
             adadelta.update(&model.allDifferentiableVariables, along: 𝛁model)
         }
         XCTAssertEqual(model.inferring(from: [[0, 0], [0, 1], [1, 0], [1, 1]]),
-                       [[0.47620448], [0.47620448], [0.47620448], [0.47620448]])
+                       [[0.52508783], [0.52508783], [0.52508783], [0.52508783]])
     }
 
     static var allTests = [