# -*- coding: utf-8 -*-
# File: optimizer.py
from contextlib import contextmanager
import tensorflow as tf
from ..tfutils.common import get_tf_version_tuple
from ..compat import tfv1
from ..utils.develop import HIDE_DOC
from .gradproc import FilterNoneGrad, GradientProcessor
__all__ = ['apply_grad_processors', 'ProxyOptimizer',
'PostProcessOptimizer', 'VariableAssignmentOptimizer',
'AccumGradOptimizer']
class ProxyOptimizer(tfv1.train.Optimizer):
"""
    A transparent proxy which delegates all methods of :class:`tf.train.Optimizer` to the wrapped optimizer.
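
    Example (a minimal sketch of a subclass that intercepts :meth:`apply_gradients`;
    ``LoggingOptimizer`` is purely illustrative):

    .. code-block:: python

        class LoggingOptimizer(ProxyOptimizer):
            def apply_gradients(self, grads_and_vars, global_step=None, name=None):
                # illustrative: inspect the gradients before delegating to the wrapped optimizer
                print("applying {} gradients".format(len(grads_and_vars)))
                return self._opt.apply_gradients(grads_and_vars, global_step, name)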
"""
def __init__(self, opt, name='ProxyOptimizer'):
assert isinstance(opt, tfv1.train.Optimizer), opt
super(ProxyOptimizer, self).__init__(False, name)
self._opt = opt
@HIDE_DOC
def compute_gradients(self, *args, **kwargs):
return self._opt.compute_gradients(*args, **kwargs)
@HIDE_DOC
def get_slot(self, *args, **kwargs):
return self._opt.get_slot(*args, **kwargs)
@HIDE_DOC
def get_slot_names(self, *args, **kwargs):
return self._opt.get_slot_names(*args, **kwargs)
@HIDE_DOC
def apply_gradients(self, *args, **kwargs):
return self._opt.apply_gradients(*args, **kwargs)
def apply_grad_processors(opt, gradprocs):
"""
Wrapper around optimizers to apply gradient processors.
Args:
        opt (tf.train.Optimizer): the optimizer to wrap.
gradprocs (list[GradientProcessor]): gradient processors to add to the
optimizer.
Returns:
a :class:`tf.train.Optimizer` instance which runs the gradient
processors before updating the variables.
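
    Example (a minimal sketch; ``GlobalNormClip`` is one of the processors provided in
    :mod:`tensorpack.tfutils.gradproc`, and the numbers are illustrative):

    .. code-block:: python

        from tensorpack.tfutils import gradproc, optimizer

        opt = tf.train.MomentumOptimizer(0.01, 0.9)
        # clip the global gradient norm to 5 before the variables are updated
        opt = optimizer.apply_grad_processors(opt, [gradproc.GlobalNormClip(5)])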
"""
assert isinstance(gradprocs, (list, tuple)), gradprocs
for gp in gradprocs:
assert isinstance(gp, GradientProcessor), gp
class _ApplyGradientProcessor(ProxyOptimizer):
def __init__(self, opt, gradprocs):
self._gradprocs = gradprocs[:]
super(_ApplyGradientProcessor, self).__init__(opt)
def apply_gradients(self, grads_and_vars,
global_step=None, name=None):
g = self._apply(grads_and_vars)
return self._opt.apply_gradients(g, global_step, name)
def _apply(self, g):
for proc in self._gradprocs:
g = proc.process(g)
return g
return _ApplyGradientProcessor(opt, gradprocs)
class PostProcessOptimizer(ProxyOptimizer):
"""
An optimizer which applies some "post-processing operation" per variable
(e.g. clipping, quantization) after the gradient update.
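
    Example (an illustrative sketch; ``clip_weights`` is a user-defined function,
    not part of the library):

    .. code-block:: python

        def clip_weights(v):
            # keep every weight inside [-0.01, 0.01] after each gradient step
            return tf.assign(v, tf.clip_by_value(v, -0.01, 0.01)).op

        opt = PostProcessOptimizer(tf.train.AdamOptimizer(1e-3), clip_weights)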
"""
    def __init__(self, opt, func, colocate=True):
"""
Args:
            opt (tf.train.Optimizer): the optimizer to wrap.
            func (tf.Variable -> tf.Operation or None): the operation to perform
                on this variable after the gradient update.
colocate (boolean): colocate the function with the variable. No effect since TF 1.13.
"""
super(PostProcessOptimizer, self).__init__(opt)
self._func = func
self._colocate = colocate
@HIDE_DOC
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
update_op = super(PostProcessOptimizer, self).apply_gradients(
grads_and_vars, global_step)
ops = []
with tf.control_dependencies([update_op]):
for _, var in grads_and_vars:
with self._maybe_colocate(var):
op = self._func(var)
if op is not None:
assert isinstance(op, tf.Operation), op
ops.append(op)
update_op = tf.group(update_op, *ops, name=name)
return update_op
@contextmanager
def _maybe_colocate(self, var):
G = tf.get_default_graph()
if self._colocate and get_tf_version_tuple() <= (1, 12):
with G.colocate_with(var):
yield
else:
yield
class VariableAssignmentOptimizer(PostProcessOptimizer):
"""
An optimizer which assigns each variable a new value (e.g. clipping,
quantization) after the gradient update.
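
    Example (an illustrative sketch; unlike :class:`PostProcessOptimizer`, the function
    returns the new value and the assignment op is created for you):

    .. code-block:: python

        # WGAN-style weight clipping after every gradient step (values are illustrative)
        opt = VariableAssignmentOptimizer(
            tf.train.RMSPropOptimizer(5e-5),
            lambda v: tf.clip_by_value(v, -0.01, 0.01))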
"""
    def __init__(self, opt, func):
"""
Args:
            opt (tf.train.Optimizer): the optimizer to wrap.
func (tf.Variable -> tf.Tensor or None): the new value to be
assigned to this variable after the gradient update.
"""
def f(v):
t = func(v)
if t is None:
return t
return tfv1.assign(v, t, use_locking=False).op
super(VariableAssignmentOptimizer, self).__init__(opt, f)
class AccumGradOptimizer(ProxyOptimizer):
"""
    An optimizer which accumulates gradients across :math:`k` :meth:`minimize` executions,
    and applies them together on every :math:`k`-th :meth:`minimize` execution.
This is roughly the same as using a :math:`k` times larger batch size plus a
:math:`k` times larger learning rate, but uses much less memory.
This optimizer can be used in any TensorFlow code (with or without tensorpack).
    Example:

    .. code-block:: python

        from tensorpack.tfutils.optimizer import AccumGradOptimizer
myopt = tf.train.GradientDescentOptimizer(0.01)
myopt = AccumGradOptimizer(myopt, niter=5)
train_op = myopt.minimize(loss)
"""
    def __init__(self, opt, niter):
"""
Args:
opt (tf.train.Optimizer): the underlying sub-optimizer.
niter (int): number of iterations to accumulate gradients.
"""
super(AccumGradOptimizer, self).__init__(opt, 'AccumGrad')
self._niter = int(niter)
def _create_accum_slots(self, var_list):
slots = []
for v in var_list:
# TODO an option to not colocate the accumulators with variables (to save more memory)
s = self._zeros_slot(v, "accum", self._name)
slots.append(s)
return slots
@HIDE_DOC
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
grads_and_vars = FilterNoneGrad().process(grads_and_vars)
vs = []
for g, v in grads_and_vars:
assert isinstance(g, (tf.Tensor, tf.IndexedSlices)) and isinstance(v, tf.Variable), \
"AccumGradOptimizer does not work for the gradient of {}! " \
"Types of v and g are {} and {}".format(v.op.name, type(v), type(g))
vs.append(v)
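        # Create one accumulator slot per variable, outside any surrounding control-dependency context.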
with tf.control_dependencies(None):
slots = self._create_accum_slots(vs)
slots_and_vars = [(s, gv[1]) for s, gv in zip(slots, grads_and_vars)]
with tfv1.variable_scope(self._name), tf.device('/cpu:0'):
counter = tf.Variable(
0, name="counter", trainable=False, dtype=tf.int32)
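        # Add the new gradients into the accumulators and count how many times this op has run.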
with tf.name_scope('AccumGradOptimizer'):
ops = []
for s, gv in zip(slots, grads_and_vars):
g, v = gv
ops.append(s.assign_add(g))
update_counter = tfv1.assign_add(counter, 1, name='update_counter')
update_slot_op = tf.group(update_counter, *ops, name='update_slot')
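
            # Every `niter` runs: apply the accumulated gradients with the wrapped
            # optimizer, then reset the accumulators to zero.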
def update_grad():
update_op = self._opt.apply_gradients(slots_and_vars)
with tf.control_dependencies([update_op]):
clear_ops = [tfv1.assign(s, tf.zeros_like(s)) for s in slots]
return tf.group(*clear_ops, name='update_grad')
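
            # Only apply the accumulated gradients when the counter is a multiple of `niter`.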
pred = tf.equal(tfv1.mod(counter, self._niter), 0)
with tf.control_dependencies([update_slot_op]):
if name is None:
name = 'cond_update_grad'
op = tf.cond(pred, update_grad, tf.no_op)
if global_step is not None:
# Tensorpack maintains global_step by other means,
# so this option is useless in tensorpack trainers.
# But we include the implementation here for completeness
global_step_increment = tfv1.assign_add(global_step, 1)
op = tf.group(op, global_step_increment, name=name)
else:
op = tf.identity(op, name=name).op
return op
if __name__ == '__main__':
# run it with "python -m tensorpack.tfutils.optimizer"
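    # With niter=5, the 20 minimize() calls below should update the variables only 4 times.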
x = tfv1.get_variable('x', shape=[6])
cost = tf.reduce_sum(tf.abs(x), name='cost')
    opt = tfv1.train.GradientDescentOptimizer(0.01)
opt = AccumGradOptimizer(opt, 5)
    min_op = opt.minimize(cost, global_step=tfv1.train.get_or_create_global_step())
    sess = tfv1.Session()
    sess.run(tfv1.global_variables_initializer())
with sess.as_default():
for _ in range(20):
min_op.run()
print(x.eval())
        print(tfv1.train.get_or_create_global_step().eval())