Step size¶

This subpackage contains step size selection methods like Barzilai-Borwein and Polyak's step size.

AdGD ¶

Bases: torchzero.core.transform.TensorTransform

AdGD and AdGD-2 (https://arxiv.org/abs/2308.02261)

Source code in torchzero/modules/step_size/adaptive.py

class AdGD(TensorTransform):
    """AdGD and AdGD-2 (https://arxiv.org/abs/2308.02261)"""
    def __init__(self, variant:Literal[1,2]=2, alpha_0:float = 1e-7, sqrt:bool=True, use_grad=True, inner: Chainable | None = None,):
        defaults = dict(variant=variant, alpha_0=alpha_0, sqrt=sqrt)
        super().__init__(defaults, uses_grad=use_grad, inner=inner,)

    def reset_for_online(self):
        super().reset_for_online()
        self.clear_state_keys('prev_g')
        self.global_state['reset'] = True

    @torch.no_grad
    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        variant = settings[0]['variant']
        theta_0 = 0 if variant == 1 else 1/3
        theta = self.global_state.get('theta', theta_0)

        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        p = TensorList(params)
        g = grads if self._uses_grad else tensors
        assert g is not None
        g = TensorList(g)

        prev_p, prev_g = unpack_states(states, tensors, 'prev_p', 'prev_g', cls=TensorList)

        # online
        if self.global_state.get('reset', False):
            del self.global_state['reset']
            prev_p.copy_(p)
            prev_g.copy_(g)
            return

        if step == 0:
            alpha_0 = settings[0]['alpha_0']
            if alpha_0 is None: alpha_0 = epsilon_step_size(g)
            self.global_state['alpha']  = alpha_0
            prev_p.copy_(p)
            prev_g.copy_(g)
            return

        sqrt = settings[0]['sqrt']
        alpha = self.global_state.get('alpha', math.inf)
        L = (g - prev_g).global_vector_norm() / (p - prev_p).global_vector_norm()
        eps = torch.finfo(L.dtype).tiny * 2

        if variant == 1:
            a1 = math.sqrt(1 + theta)*alpha
            val = math.sqrt(2) if sqrt else 2
            if L > eps: a2 = 1 / (val*L)
            else: a2 = math.inf

        elif variant == 2:
            a1 = math.sqrt(2/3 + theta)*alpha
            a2 = alpha / math.sqrt(max(eps, 2 * alpha**2 * L**2 - 1))

        else:
            raise ValueError(variant)

        alpha_new = min(a1, a2)
        if alpha_new < 0: alpha_new = max(a1, a2)
        if alpha_new > eps:
            self.global_state['theta'] = alpha_new/alpha
            self.global_state['alpha'] = alpha_new

        prev_p.copy_(p)
        prev_g.copy_(g)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        alpha = self.global_state.get('alpha', None)

        if not _acceptable_alpha(alpha, tensors[0]):
            # alpha isn't None on 1st step
            self.state.clear()
            self.global_state.clear()
            alpha = epsilon_step_size(TensorList(tensors), settings[0]['alpha_0'])

        torch._foreach_mul_(tensors, alpha)
        return tensors

    def get_H(self, objective):
        return _get_scaled_identity_H(self, objective)

BBStab ¶

Bases: torchzero.core.transform.TensorTransform

Stabilized Barzilai-Borwein method (https://arxiv.org/abs/1907.06409).

This clips the norm of the Barzilai-Borwein update by delta, where delta can be adaptive if c is specified.

Parameters:

c (float, default: 0.2 ) –

adaptive delta parameter. If delta is set to None, first inf_iters updates are performed with non-stabilized Barzilai-Borwein step size. Then delta is set to norm of the update that had the smallest norm, and multiplied by c. Defaults to 0.2.
delta (float | None, default: None ) –

Barzilai-Borwein update is clipped to this value. Set to None to use an adaptive choice. Defaults to None.
type (str, default: 'geom' ) –

one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long. Defaults to "geom". Note that "long" corresponds to BB1stab and "short" to BB2stab, however I found that "geom" works really well.
inner (Chainable | None, default: None ) –

step size will be applied to outputs of this module. Defaults to None.

Source code in torchzero/modules/step_size/adaptive.py

class BBStab(TensorTransform):
    """Stabilized Barzilai-Borwein method (https://arxiv.org/abs/1907.06409).

    This clips the norm of the Barzilai-Borwein update by ``delta``, where ``delta`` can be adaptive if ``c`` is specified.

    Args:
        c (float, optional):
            adaptive delta parameter. If ``delta`` is set to None, first ``inf_iters`` updates are performed
            with non-stabilized Barzilai-Borwein step size. Then delta is set to norm of
            the update that had the smallest norm, and multiplied by ``c``. Defaults to 0.2.
        delta (float | None, optional):
            Barzilai-Borwein update is clipped to this value. Set to ``None`` to use an adaptive choice. Defaults to None.
        type (str, optional):
            one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long.
            Defaults to "geom". Note that "long" corresponds to BB1stab and "short" to BB2stab,
            however I found that "geom" works really well.
        inner (Chainable | None, optional):
            step size will be applied to outputs of this module. Defaults to None.

    """
    def __init__(
        self,
        c=0.2,
        delta:float | None = None,
        type: Literal["long", "short", "geom", "geom-fallback"] = "geom",
        alpha_0: float = 1e-7,
        use_grad=True,
        inf_iters: int = 3,
        inner: Chainable | None = None,
    ):
        defaults = dict(type=type,alpha_0=alpha_0, c=c, delta=delta, inf_iters=inf_iters)
        super().__init__(defaults, uses_grad=use_grad, inner=inner)

    def reset_for_online(self):
        super().reset_for_online()
        self.clear_state_keys('prev_g')
        self.global_state['reset'] = True

    @torch.no_grad
    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        prev_p, prev_g = unpack_states(states, tensors, 'prev_p', 'prev_g', cls=TensorList)
        type = self.defaults['type']
        c = self.defaults['c']
        delta = self.defaults['delta']
        inf_iters = self.defaults['inf_iters']

        g = grads if self._uses_grad else tensors
        assert g is not None
        g = TensorList(g)

        reset = self.global_state.get('reset', False)
        self.global_state.pop('reset', None)

        if step != 0 and not reset:
            s = params-prev_p
            y = g-prev_g
            sy = s.dot(y)
            eps = torch.finfo(sy.dtype).tiny

            if type == 'short': alpha = _bb_short(s, y, sy, eps)
            elif type == 'long': alpha = _bb_long(s, y, sy, eps)
            elif type == 'geom': alpha = _bb_geom(s, y, sy, eps, fallback=False)
            elif type == 'geom-fallback': alpha = _bb_geom(s, y, sy, eps, fallback=True)
            else: raise ValueError(type)

            if alpha is not None:

                # adaptive delta
                if delta is None:
                    niters = self.global_state.get('niters', 0) # this accounts for skipped negative curvature steps
                    self.global_state['niters'] = niters + 1


                    if niters == 0: pass # 1st iteration is scaled GD step, shouldn't be used to find s_norm_min
                    elif niters <= inf_iters:
                        s_norm_min = self.global_state.get('s_norm_min', None)
                        if s_norm_min is None: s_norm_min = s.global_vector_norm()
                        else: s_norm_min = min(s_norm_min, s.global_vector_norm())
                        self.global_state['s_norm_min'] = s_norm_min
                        # first few steps use delta=inf, so delta remains None

                    else:
                        delta = c * self.global_state['s_norm_min']

                if delta is None: # delta is inf for first few steps
                    self.global_state['alpha'] = alpha

                # BBStab step size
                else:
                    a_stab = delta / g.global_vector_norm()
                    self.global_state['alpha'] = min(alpha, a_stab)

        prev_p.copy_(params)
        prev_g.copy_(g)

    def get_H(self, objective):
        return _get_scaled_identity_H(self, objective)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        alpha = self.global_state.get('alpha', None)

        if not _acceptable_alpha(alpha, tensors[0]):
            alpha = epsilon_step_size(TensorList(tensors), settings[0]['alpha_0'])

        torch._foreach_mul_(tensors, alpha)
        return tensors

BarzilaiBorwein ¶

Bases: torchzero.core.transform.TensorTransform

Barzilai-Borwein step size method.

Parameters:

type (str, default: 'geom' ) –

one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long. Defaults to "geom".
fallback (float) –

step size when denominator is less than 0 (will happen on negative curvature). Defaults to 1e-3.
inner (Chainable | None, default: None ) –

step size will be applied to outputs of this module. Defaults to None.

Source code in torchzero/modules/step_size/adaptive.py

class BarzilaiBorwein(TensorTransform):
    """Barzilai-Borwein step size method.

    Args:
        type (str, optional):
            one of "short" with formula sᵀy/yᵀy, "long" with formula sᵀs/sᵀy, or "geom" to use geometric mean of short and long.
            Defaults to "geom".
        fallback (float, optional): step size when denominator is less than 0 (will happen on negative curvature). Defaults to 1e-3.
        inner (Chainable | None, optional):
            step size will be applied to outputs of this module. Defaults to None.
    """

    def __init__(
        self,
        type: Literal["long", "short", "geom", "geom-fallback"] = "geom",
        alpha_0: float = 1e-7,
        use_grad=True,
        inner: Chainable | None = None,
    ):
        defaults = dict(type=type, alpha_0=alpha_0)
        super().__init__(defaults, uses_grad=use_grad, inner=inner)

    def reset_for_online(self):
        super().reset_for_online()
        self.clear_state_keys('prev_g')
        self.global_state['reset'] = True

    @torch.no_grad
    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        step = self.global_state.get('step', 0)
        self.global_state['step'] = step + 1

        prev_p, prev_g = unpack_states(states, tensors, 'prev_p', 'prev_g', cls=TensorList)
        type = self.defaults['type']

        g = grads if self._uses_grad else tensors
        assert g is not None

        reset = self.global_state.get('reset', False)
        self.global_state.pop('reset', None)

        if step != 0 and not reset:
            s = params-prev_p
            y = g-prev_g
            sy = s.dot(y)
            eps = torch.finfo(sy.dtype).tiny * 2

            if type == 'short': alpha = _bb_short(s, y, sy, eps)
            elif type == 'long': alpha = _bb_long(s, y, sy, eps)
            elif type == 'geom': alpha = _bb_geom(s, y, sy, eps, fallback=False)
            elif type == 'geom-fallback': alpha = _bb_geom(s, y, sy, eps, fallback=True)
            else: raise ValueError(type)

            # if alpha is not None:
            self.global_state['alpha'] = alpha

        prev_p.copy_(params)
        prev_g.copy_(g)

    def get_H(self, objective):
        return _get_scaled_identity_H(self, objective)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        alpha = self.global_state.get('alpha', None)

        if not _acceptable_alpha(alpha, tensors[0]):
            alpha = epsilon_step_size(TensorList(tensors), settings[0]['alpha_0'])

        torch._foreach_mul_(tensors, alpha)
        return tensors

BoldDriver ¶

Bases: torchzero.core.transform.TensorTransform

Multiplies step size by nplus if loss decreased compared to last iteration, otherwise multiplies by nminus.

Source code in torchzero/modules/step_size/adaptive.py

class BoldDriver(TensorTransform):
    """Multiplies step size by ``nplus`` if loss decreased compared to last iteration, otherwise multiplies by ``nminus``."""
    def __init__(self, a_init=1e-3, nplus=1.1, nminus=0.1, inner: Chainable | None = None):
        defaults = dict(a_init=a_init, nplus=nplus, nminus=nminus)
        super().__init__(defaults, uses_loss=True, inner=inner)
        self.global_state["alpha"] = a_init

    def reset_for_online(self):
        super().reset_for_online()
        self.clear_state_keys('f_prev')

    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        fs = settings[0]
        if "f_prev" not in self.global_state:
            self.global_state["f_prev"] = tofloat(loss)
            return

        if self.global_state["f_prev"] <= loss:
            self.global_state["alpha"] *= fs["nminus"]

        else:
            self.global_state["alpha"] *= fs["nplus"]

        self.global_state["f_prev"] = tofloat(loss)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        alpha = self.global_state.get('alpha', None)

        if not _acceptable_alpha(alpha, tensors[0]):
            self.state.clear()
            self.global_state.clear()
            self.global_state["alpha"] = settings[0]["a_init"]
            alpha = epsilon_step_size(TensorList(tensors), 1e-7)

        torch._foreach_mul_(tensors, alpha)
        return tensors

    def get_H(self, objective):
        return _get_scaled_identity_H(self, objective)

LR ¶

Bases: torchzero.core.transform.TensorTransform

Learning rate. Adding this module also adds support for LR schedulers.

Source code in torchzero/modules/step_size/lr.py

class LR(TensorTransform):
    """Learning rate. Adding this module also adds support for LR schedulers."""
    def __init__(self, lr: float):
        defaults=dict(lr=lr)
        super().__init__(defaults)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        return lazy_lr(TensorList(tensors), lr=[s['lr'] for s in settings], inplace=True)

PolyakStepSize ¶

Bases: torchzero.core.transform.TensorTransform

Polyak's subgradient method with known or unknown f*.

Parameters:

f_star (float | Mone, default: 0 ) –

minimal possible value of the objective function. If not known, set to None. Defaults to 0.
y (float, default: 1 ) –

when f_star is set to None, it is calculated as f_best - y.
y_decay (float, default: 0.001 ) –

y is multiplied by (1 - y_decay) after each step. Defaults to 1e-3.
max (float | None, default: None ) –

maximum possible step size. Defaults to None.
use_grad (bool, default: True ) –

if True, uses dot product of update and gradient to compute the step size. Otherwise, dot product of update with itself is used.
alpha (float, default: 1 ) –

multiplier to Polyak step-size. Defaults to 1.

Source code in torchzero/modules/step_size/adaptive.py

class PolyakStepSize(TensorTransform):
    """Polyak's subgradient method with known or unknown f*.

    Args:
        f_star (float | Mone, optional):
            minimal possible value of the objective function. If not known, set to ``None``. Defaults to 0.
        y (float, optional):
            when ``f_star`` is set to None, it is calculated as ``f_best - y``.
        y_decay (float, optional):
            ``y`` is multiplied by ``(1 - y_decay)`` after each step. Defaults to 1e-3.
        max (float | None, optional): maximum possible step size. Defaults to None.
        use_grad (bool, optional):
            if True, uses dot product of update and gradient to compute the step size.
            Otherwise, dot product of update with itself is used.
        alpha (float, optional): multiplier to Polyak step-size. Defaults to 1.
    """
    def __init__(self, f_star: float | None = 0, y: float = 1, y_decay: float = 1e-3, max: float | None = None, use_grad=True, alpha: float = 1, inner: Chainable | None = None):

        defaults = dict(alpha=alpha, max=max, f_star=f_star, y=y, y_decay=y_decay)
        super().__init__(defaults, uses_grad=use_grad, uses_loss=True, inner=inner)

    @torch.no_grad
    def multi_tensor_update(self, tensors, params, grads, loss, states, settings):
        assert grads is not None and loss is not None
        tensors = TensorList(tensors)
        grads = TensorList(grads)

        # load variables
        max, f_star, y, y_decay = itemgetter('max', 'f_star', 'y', 'y_decay')(settings[0])
        y_val = self.global_state.get('y_val', y)
        f_best = self.global_state.get('f_best', None)

        # gg
        if self._uses_grad: gg = tensors.dot(grads)
        else: gg = tensors.dot(tensors)

        # store loss
        if f_best is None or loss < f_best: f_best = tofloat(loss)
        if f_star is None: f_star = f_best - y_val

        # calculate the step size
        if gg <= torch.finfo(gg.dtype).tiny * 2: alpha = 0 # converged
        else: alpha = (loss - f_star) / gg

        # clip
        if max is not None:
            if alpha > max: alpha = max

        # store state
        self.global_state['f_best'] = f_best
        self.global_state['y_val'] = y_val * (1 - y_decay)
        self.global_state['alpha'] = alpha

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        alpha = self.global_state.get('alpha', 1)
        if not _acceptable_alpha(alpha, tensors[0]): alpha = epsilon_step_size(TensorList(tensors))

        torch._foreach_mul_(tensors, alpha * unpack_dicts(settings, 'alpha', cls=NumberList))
        return tensors

    def get_H(self, objective):
        return _get_scaled_identity_H(self, objective)

RandomStepSize ¶

Bases: torchzero.core.transform.TensorTransform

Uses random global or layer-wise step size from low to high.

Parameters:

low (float, default: 0 ) –

minimum learning rate. Defaults to 0.
high (float, default: 1 ) –

maximum learning rate. Defaults to 1.
parameterwise (bool, default: False ) –

if True, generate random step size for each parameter separately, if False generate one global random step size. Defaults to False.

Source code in torchzero/modules/step_size/lr.py

class RandomStepSize(TensorTransform):
    """Uses random global or layer-wise step size from ``low`` to ``high``.

    Args:
        low (float, optional): minimum learning rate. Defaults to 0.
        high (float, optional): maximum learning rate. Defaults to 1.
        parameterwise (bool, optional):
            if True, generate random step size for each parameter separately,
            if False generate one global random step size. Defaults to False.
    """
    def __init__(self, low: float = 0, high: float = 1, parameterwise=False, seed:int|None=None):
        defaults = dict(low=low, high=high, parameterwise=parameterwise,seed=seed)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        s = settings[0]
        parameterwise = s['parameterwise']

        seed = s['seed']
        if 'generator' not in self.global_state:
            self.global_state['generator'] = random.Random(seed)
        generator: random.Random = self.global_state['generator']

        if parameterwise:
            low, high = unpack_dicts(settings, 'low', 'high')
            lr = [generator.uniform(l, h) for l, h in zip(low, high)]
        else:
            low = s['low']
            high = s['high']
            lr = generator.uniform(low, high)

        torch._foreach_mul_(tensors, lr)
        return tensors

StepSize ¶

Bases: torchzero.core.transform.TensorTransform

this is exactly the same as LR, except the lr parameter can be renamed to any other name to avoid clashes

Source code in torchzero/modules/step_size/lr.py

class StepSize(TensorTransform):
    """this is exactly the same as LR, except the `lr` parameter can be renamed to any other name to avoid clashes"""
    def __init__(self, step_size: float, key = 'step_size'):
        defaults={"key": key, key: step_size}
        super().__init__(defaults)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        return lazy_lr(TensorList(tensors), lr=[s[s['key']] for s in settings], inplace=True)

Warmup ¶

Bases: torchzero.core.transform.TensorTransform

Learning rate warmup, linearly increases learning rate multiplier from start_lr to end_lr over steps steps.

Parameters:

steps (int, default: 100 ) –

number of steps to perform warmup for. Defaults to 100.
start_lr (_type_, default: 1e-05 ) –

initial learning rate multiplier on first step. Defaults to 1e-5.
end_lr (float, default: 1 ) –

learning rate multiplier at the end and after warmup. Defaults to 1.

Example

Adam with 1000 steps warmup

.. code-block:: python

opt = tz.Optimizer(
    model.parameters(),
    tz.m.Adam(),
    tz.m.LR(1e-2),
    tz.m.Warmup(steps=1000)
)

Source code in torchzero/modules/step_size/lr.py

class Warmup(TensorTransform):
    """Learning rate warmup, linearly increases learning rate multiplier from ``start_lr`` to ``end_lr`` over ``steps`` steps.

    Args:
        steps (int, optional): number of steps to perform warmup for. Defaults to 100.
        start_lr (_type_, optional): initial learning rate multiplier on first step. Defaults to 1e-5.
        end_lr (float, optional): learning rate multiplier at the end and after warmup. Defaults to 1.

    Example:
        Adam with 1000 steps warmup

        .. code-block:: python

            opt = tz.Optimizer(
                model.parameters(),
                tz.m.Adam(),
                tz.m.LR(1e-2),
                tz.m.Warmup(steps=1000)
            )

    """
    def __init__(self, steps = 100, start_lr = 1e-5, end_lr:float = 1):
        defaults = dict(start_lr=start_lr,end_lr=end_lr, steps=steps)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        start_lr, end_lr = unpack_dicts(settings, 'start_lr', 'end_lr', cls = NumberList)
        num_steps = settings[0]['steps']
        step = self.global_state.get('step', 0)

        tensors = lazy_lr(
            TensorList(tensors),
            lr=_warmup_lr(step=step, start_lr=start_lr, end_lr=end_lr, steps=num_steps),
            inplace=True
        )
        self.global_state['step'] = step + 1
        return tensors

WarmupNormClip ¶

Bases: torchzero.core.transform.TensorTransform

Warmup via clipping of the update norm.

Parameters:

start_norm (_type_, default: 1e-05 ) –

maximal norm on the first step. Defaults to 1e-5.
end_norm (float, default: 1 ) –

maximal norm on the last step. After that, norm clipping is disabled. Defaults to 1.
steps (int, default: 100 ) –

number of steps to perform warmup for. Defaults to 100.

Example

Adam with 1000 steps norm clip warmup

.. code-block:: python

opt = tz.Optimizer(
    model.parameters(),
    tz.m.Adam(),
    tz.m.WarmupNormClip(steps=1000)
    tz.m.LR(1e-2),
)

Source code in torchzero/modules/step_size/lr.py

class WarmupNormClip(TensorTransform):
    """Warmup via clipping of the update norm.

    Args:
        start_norm (_type_, optional): maximal norm on the first step. Defaults to 1e-5.
        end_norm (float, optional): maximal norm on the last step. After that, norm clipping is disabled. Defaults to 1.
        steps (int, optional): number of steps to perform warmup for. Defaults to 100.

    Example:
        Adam with 1000 steps norm clip warmup

        .. code-block:: python

            opt = tz.Optimizer(
                model.parameters(),
                tz.m.Adam(),
                tz.m.WarmupNormClip(steps=1000)
                tz.m.LR(1e-2),
            )
    """
    def __init__(self, steps = 100, start_norm = 1e-5, end_norm:float = 1):
        defaults = dict(start_norm=start_norm,end_norm=end_norm, steps=steps)
        super().__init__(defaults, uses_grad=False)

    @torch.no_grad
    def multi_tensor_apply(self, tensors, params, grads, loss, states, settings):
        start_norm, end_norm = unpack_dicts(settings, 'start_norm', 'end_norm', cls = NumberList)
        num_steps = settings[0]['steps']
        step = self.global_state.get('step', 0)
        if step > num_steps: return tensors

        tensors = TensorList(tensors)
        norm = tensors.global_vector_norm()
        current_max_norm = _warmup_lr(step, start_norm[0], end_norm[0], num_steps)
        if norm > current_max_norm:
            tensors.mul_(current_max_norm / norm)

        self.global_state['step'] = step + 1
        return tensors

Step size¶

See also¶

AdGD ¶

BBStab ¶

BarzilaiBorwein ¶

BoldDriver ¶

LR ¶

PolyakStepSize ¶

RandomStepSize ¶

StepSize ¶

Warmup ¶

WarmupNormClip ¶