/*
 * Copyright (c) 2015-2020, Oracle and/or its affiliates. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.tribuo.math.optimisers;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.math.Parameters;
import org.tribuo.math.StochasticGradientOptimiser;
import org.tribuo.math.la.DenseMatrix;
import org.tribuo.math.la.DenseVector;
import org.tribuo.math.la.Matrix;
import org.tribuo.math.la.MatrixIterator;
import org.tribuo.math.la.MatrixTuple;
import org.tribuo.math.la.SGDVector;
import org.tribuo.math.la.Tensor;
import org.tribuo.math.la.VectorIterator;
import org.tribuo.math.la.VectorTuple;

import java.util.Arrays;
import java.util.function.DoubleUnaryOperator;
import java.util.logging.Logger;

/**
 * An implementation of the AdaGrad gradient optimiser with regularized dual averaging.
 * <p>
 * This gradient optimiser rewrites all the {@link Tensor}s in the {@link Parameters}
 * with {@link AdaGradRDATensor}s. Each such tensor stores a different value internally
 * to the one produced when you call get(), which allows it to correctly apply
 * regularisation to the parameters. When {@link AdaGradRDA#finalise()} is called it
 * rewrites the {@link Parameters} with standard dense {@link Tensor}s.
 * Follows the implementation in Factorie.
 * <p>
 * See:
 * <pre>
 * Duchi, J., Hazan, E., and Singer, Y.
 * "Adaptive Subgradient Methods for Online Learning and Stochastic Optimization"
 * Journal of Machine Learning Research, 12:2121-2159, 2011.
 * </pre>
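 * <p>
 * A minimal usage sketch (illustrative, not from the source; the training loop is
 * normally driven by an SGD-based trainer, which supplies the {@code parameters},
 * {@code gradients} and {@code weight} used below):
 * <pre>{@code
 * AdaGradRDA optimiser = new AdaGradRDA(0.1, 1e-6, 0.01, 0.0, numExamples);
 * optimiser.initialise(parameters);   // wraps each Tensor in an RDA view
 * for (int epoch = 0; epoch < maxEpochs; epoch++) {
 *     // for each example: compute the gradients, then apply them
 *     parameters.update(optimiser.step(gradients, weight));
 * }
 * optimiser.finalise();               // rewrites the Parameters with dense Tensors
 * }</pre>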
 */
public class AdaGradRDA implements StochasticGradientOptimiser {
    private static final Logger logger = Logger.getLogger(AdaGradRDA.class.getName());

    @Config(mandatory = true, description = "Initial learning rate used to scale the gradients.")
    private double initialLearningRate;

    @Config(description = "Epsilon for numerical stability around zero.")
    private double epsilon = 1e-6;

    @Config(description = "l1 regularization penalty.")
    private double l1 = 0;

    @Config(description = "l2 regularization penalty.")
    private double l2 = 0;

    @Config(description = "Number of examples to scale the l1 and l2 penalties by.")
    private int numExamples = 1;

    private Parameters parameters;

    /**
     * Constructs an AdaGradRDA optimiser with the specified learning rate, epsilon and regularisation penalties.
     * @param initialLearningRate The initial learning rate.
     * @param epsilon The epsilon for numerical stability around zero.
     * @param l1 The l1 regularisation penalty.
     * @param l2 The l2 regularisation penalty.
     * @param numExamples The number of examples to scale the l1 and l2 penalties by.
     */
    public AdaGradRDA(double initialLearningRate, double epsilon, double l1, double l2, int numExamples) {
        this.initialLearningRate = initialLearningRate;
        this.epsilon = epsilon;
        this.l1 = l1;
        this.l2 = l2;
        this.numExamples = numExamples;
    }

    /**
     * Constructs an AdaGradRDA optimiser with the specified learning rate and epsilon, without regularisation.
     * @param initialLearningRate The initial learning rate.
     * @param epsilon The epsilon for numerical stability around zero.
     */
    public AdaGradRDA(double initialLearningRate, double epsilon) {
        this(initialLearningRate, epsilon, 0, 0, 1);
    }

    /**
     * For olcut.
     */
    private AdaGradRDA() { }

    @Override
    public void initialise(Parameters parameters) {
        this.parameters = parameters;
        Tensor[] curParams = parameters.get();
        Tensor[] newParams = new Tensor[curParams.length];
        for (int i = 0; i < newParams.length; i++) {
            if (curParams[i] instanceof DenseVector) {
                newParams[i] = new AdaGradRDAVector(((DenseVector) curParams[i]), initialLearningRate, epsilon, l1 / numExamples, l2 / numExamples);
            } else if (curParams[i] instanceof DenseMatrix) {
                newParams[i] = new AdaGradRDAMatrix(((DenseMatrix) curParams[i]), initialLearningRate, epsilon, l1 / numExamples, l2 / numExamples);
            } else {
                throw new IllegalStateException("Unknown Tensor subclass");
            }
        }
        parameters.set(newParams);
    }

    @Override
    public Tensor[] step(Tensor[] updates, double weight) {
        for (Tensor update : updates) {
            update.scaleInPlace(weight);
        }

        return updates;
    }

    @Override
    public void finalise() {
        Tensor[] curParams = parameters.get();
        Tensor[] newParams = new Tensor[curParams.length];
        for (int i = 0; i < newParams.length; i++) {
            if (curParams[i] instanceof AdaGradRDATensor) {
                newParams[i] = ((AdaGradRDATensor) curParams[i]).convertToDense();
            } else {
                throw new IllegalStateException("Finalising a Parameters which wasn't initialised with AdaGradRDA");
            }
        }
        parameters.set(newParams);
    }

    @Override
    public String toString() {
        return "AdaGradRDA(initialLearningRate=" + initialLearningRate + ",epsilon=" + epsilon + ",l1=" + l1 + ",l2=" + l2 + ")";
    }

    @Override
    public void reset() { }

    @Override
    public AdaGradRDA copy() {
        return new AdaGradRDA(initialLearningRate, epsilon, l1, l2, numExamples);
    }

    @Override
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl(this, "StochasticGradientOptimiser");
    }

    /**
     * An interface which tags a {@link Tensor} with a convertToDense method.
     */
    private interface AdaGradRDATensor {
        Tensor convertToDense();

        /**
         * Soft-thresholds the input: shrinks it towards zero by threshold,
         * returning 0.0 when it lies within [-threshold, threshold].
         * @param input The value to threshold.
         * @param threshold The threshold (assumed non-negative).
         * @return The soft-thresholded value.
         */
        static double truncate(double input, double threshold) {
            if (input > threshold) {
                return input - threshold;
            } else if (input < -threshold) {
                return input + threshold;
            } else {
                return 0.0;
            }
        }

    }
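    // Both RDA views below store the accumulated (unregularised) gradient sums in their
    // backing arrays and decode the visible parameter value lazily in get():
    //     w = truncate(sum, t * l1) / ((sqrt(gradSquare) + epsilon) / learningRate + t * l2)
    // where sum is the accumulated gradient, gradSquare the accumulated squared gradient
    // and t the iteration count. As a worked example (illustrative, not from the source):
    // with learningRate=1, epsilon=0, l1=0.1, l2=0, after t=2 updates each of gradient 0.5,
    // sum = 1.0 and gradSquare = 0.5, so w = truncate(1.0, 0.2) / sqrt(0.5) ~= 1.13.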
    /**
     * A subclass of {@link DenseVector} which uses {@link AdaGradRDATensor#truncate(double, double)} to
     * produce the values.
     * <p>
     * Be careful when modifying this or {@link DenseVector}.
     */
    private static class AdaGradRDAVector extends DenseVector implements AdaGradRDATensor {
        private final double learningRate;
        private final double epsilon;
        private final double l1;
        private final double l2;
        private final double[] gradSquares;
        private int iteration;

        public AdaGradRDAVector(DenseVector v, double learningRate, double epsilon, double l1, double l2) {
            super(v);
            this.learningRate = learningRate;
            this.epsilon = epsilon;
            this.l1 = l1;
            this.l2 = l2;
            this.gradSquares = new double[v.size()];
            this.iteration = 0;
        }

        private AdaGradRDAVector(double[] values, double learningRate, double epsilon, double l1, double l2, double[] gradSquares, int iteration) {
            super(values);
            this.learningRate = learningRate;
            this.epsilon = epsilon;
            this.l1 = l1;
            this.l2 = l2;
            this.gradSquares = gradSquares;
            this.iteration = iteration;
        }

        @Override
        public DenseVector convertToDense() {
            return DenseVector.createDenseVector(toArray());
        }

        @Override
        public AdaGradRDAVector copy() {
            return new AdaGradRDAVector(Arrays.copyOf(elements, elements.length), learningRate, epsilon, l1, l2, Arrays.copyOf(gradSquares, gradSquares.length), iteration);
        }

        @Override
        public double[] toArray() {
            double[] newValues = new double[elements.length];
            for (int i = 0; i < newValues.length; i++) {
                newValues[i] = get(i);
            }
            return newValues;
        }

        @Override
        public double get(int index) {
            if (gradSquares[index] == 0.0) {
                return elements[index];
            } else {
                double h = ((Math.sqrt(gradSquares[index]) + epsilon) / learningRate) + iteration * l2;
                //double h = (1.0/learningRate) * (Math.sqrt(gradSquares[index]) + epsilon) + iteration*l2;
                double rate = 1.0 / h;
                return rate * AdaGradRDATensor.truncate(elements[index], iteration * l1);
            }
        }

        @Override
        public double sum() {
            double sum = 0.0;
            for (int i = 0; i < elements.length; i++) {
                sum += get(i);
            }
            return sum;
        }

        @Override
        public void intersectAndAddInPlace(Tensor other, DoubleUnaryOperator f) {
            iteration++;
            SGDVector otherVec = (SGDVector) other;
            for (VectorTuple tuple : otherVec) {
                double update = f.applyAsDouble(tuple.value);
                elements[tuple.index] += update;
                gradSquares[tuple.index] += update * update;
            }
        }

        @Override
        public int indexOfMax() {
            int index = 0;
            double value = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < elements.length; i++) {
                double tmp = get(i);
                if (tmp > value) {
                    index = i;
                    value = tmp;
                }
            }
            return index;
        }

        @Override
        public double maxValue() {
            double value = Double.NEGATIVE_INFINITY;
            for (int i = 0; i < elements.length; i++) {
                double tmp = get(i);
                if (tmp > value) {
                    value = tmp;
                }
            }
            return value;
        }

        @Override
        public double minValue() {
            double value = Double.POSITIVE_INFINITY;
            for (int i = 0; i < elements.length; i++) {
                double tmp = get(i);
                if (tmp < value) {
                    value = tmp;
                }
            }
            return value;
        }

        @Override
        public double dot(SGDVector other) {
            double score = 0.0;

            for (VectorTuple tuple : other) {
                score += get(tuple.index) * tuple.value;
            }

            return score;
        }

        @Override
        public VectorIterator iterator() {
            return new RDAVectorIterator(this);
        }
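        // The iterator reports the decoded (regularised) values via get(), so anything
        // consuming it sees the same view of the vector as toArray() and dot().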
        private static class RDAVectorIterator implements VectorIterator {
            private final AdaGradRDAVector vector;
            private final VectorTuple tuple;
            private int index;

            public RDAVectorIterator(AdaGradRDAVector vector) {
                this.vector = vector;
                this.tuple = new VectorTuple();
                this.index = 0;
            }

            @Override
            public boolean hasNext() {
                return index < vector.size();
            }

            @Override
            public VectorTuple next() {
                tuple.index = index;
                tuple.value = vector.get(index);
                index++;
                return tuple;
            }

            @Override
            public VectorTuple getReference() {
                return tuple;
            }
        }
    }

    /**
     * A subclass of {@link DenseMatrix} which uses {@link AdaGradRDATensor#truncate(double, double)} to
     * produce the values.
     * <p>
     * Be careful when modifying this or {@link DenseMatrix}.
     */
    private static class AdaGradRDAMatrix extends DenseMatrix implements AdaGradRDATensor {
        private final double learningRate;
        private final double epsilon;
        private final double l1;
        private final double l2;
        private final double[][] gradSquares;
        private int iteration;

        public AdaGradRDAMatrix(DenseMatrix v, double learningRate, double epsilon, double l1, double l2) {
            super(v);
            this.learningRate = learningRate;
            this.epsilon = epsilon;
            this.l1 = l1;
            this.l2 = l2;
            this.gradSquares = new double[v.getDimension1Size()][v.getDimension2Size()];
            this.iteration = 0;
        }

        @Override
        public DenseMatrix convertToDense() {
            return new DenseMatrix(this);
        }

        @Override
        public DenseVector leftMultiply(SGDVector input) {
            if (input.size() == dim2) {
                double[] output = new double[dim1];
                // Decodes each weight via get(i,j), so predictions use the regularised values.
                for (VectorTuple tuple : input) {
                    for (int i = 0; i < output.length; i++) {
                        output[i] += get(i, tuple.index) * tuple.value;
                    }
                }

                return DenseVector.createDenseVector(output);
            } else {
                throw new IllegalArgumentException("input.size() != dim2, input.size() = " + input.size() + ", dim2 = " + dim2);
            }
        }

        @Override
        public void intersectAndAddInPlace(Tensor other, DoubleUnaryOperator f) {
            if (other instanceof Matrix) {
                Matrix otherMat = (Matrix) other;
                if ((dim1 == otherMat.getDimension1Size()) && (dim2 == otherMat.getDimension2Size())) {
                    for (MatrixTuple tuple : otherMat) {
                        double update = f.applyAsDouble(tuple.value);
                        values[tuple.i][tuple.j] += update;
                        gradSquares[tuple.i][tuple.j] += update * update;
                    }
                } else {
                    throw new IllegalStateException("Matrices are not the same size, this(" + dim1 + "," + dim2 + "), other(" + otherMat.getDimension1Size() + "," + otherMat.getDimension2Size() + ")");
                }
            } else {
                throw new IllegalStateException("Adding a non-Matrix to a Matrix");
            }
        }

        @Override
        public double get(int i, int j) {
            if (gradSquares[i][j] == 0.0) {
                return values[i][j];
            } else {
                double h = ((Math.sqrt(gradSquares[i][j]) + epsilon) / learningRate) + iteration * l2;
                //double h = (1.0/learningRate) * (Math.sqrt(gradSquares[i][j]) + epsilon) + iteration*l2;
                double rate = 1.0 / h;
                return rate * AdaGradRDATensor.truncate(values[i][j], iteration * l1);
            }
        }

        @Override
        public MatrixIterator iterator() {
            return new RDAMatrixIterator(this);
        }
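        // Iterates the matrix in row-major order, decoding each entry through get(i,j)
        // so consumers see the regularised values rather than the raw gradient sums.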
        private static class RDAMatrixIterator implements MatrixIterator {
            private final AdaGradRDAMatrix matrix;
            private final MatrixTuple tuple;
            private final int dim2;
            private int i;
            private int j;

            public RDAMatrixIterator(AdaGradRDAMatrix matrix) {
                this.matrix = matrix;
                this.tuple = new MatrixTuple();
                this.dim2 = matrix.dim2;
                this.i = 0;
                this.j = 0;
            }

            @Override
            public MatrixTuple getReference() {
                return tuple;
            }

            @Override
            public boolean hasNext() {
                return (i < matrix.dim1) && (j < matrix.dim2);
            }

            @Override
            public MatrixTuple next() {
                tuple.i = i;
                tuple.j = j;
                tuple.value = matrix.get(i, j);
                if (j < dim2 - 1) {
                    j++;
                } else {
                    //Reached the end of the current row, move to the next one
                    i++;
                    j = 0;
                }
                return tuple;
            }
        }

    }
}