Optimization in many dimensions¶
Preamble: Run the cells below to import the necessary Python packages
This notebook was created by William Gilpin. Consult the course website for all content and GitHub repository for raw files and runnable online code.
import numpy as np
from IPython.display import Image, display
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2
## Set nicer colors
plt.rcParams['image.cmap'] = 'PuBu'
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=[[1.0, .3882, .2784]])
plt.rcParams['lines.markersize'] = 10
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
Optimization in a complex landscape¶
Previously, we considered optimization of functions that depend on a single variable. In this notebook, we will consider optimization of functions that depend on many variables.
Our multivariate landscape is a sum of Gaussian wells, $$ \mathcal{L} = \sum_{p=1}^P \mathcal{L}_p = -\sum_{p=1}^P A_p \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) $$ where $\mathbf{x} \in \mathbb{R}^D$ is the position in the landscape, $\boldsymbol{\mu}_p \in \mathbb{R}^D$ is the location of the $p^{th}$ well, and $\sigma \in \mathbb{R}$ is the width of the wells (in the implementation below, each well gets its own width $\sigma_p$). The weights $A_p \geq 0$, normalized so that $\sum_p A_p = 1$, are the relative depths of the wells.
Recall that, during optimization, we usually think of our "cost" as the number of calls to the function we are trying to optimize. Here, to implement this landscape, we will make calculating its value at a single point as fast as possible using vectorization. Since the well positions are all fixed, and the landscape is a sum over them, we can vectorize the sum over wells. Additionally, if we want to calculate the landscape value at many points at once, we can use broadcasting, or "batching," because the different evaluations are independent of each other.
class RandomLossLandscapeWithoutGradient:
    """
    Creates a random d-dimensional loss landscape with multiple circular gaussian wells

    Args:
        d (int): number of dimensions for the loss landscape
        n_wells (int): number of gaussian wells
        random_state (int): random seed
    """
    def __init__(self, d=2, n_wells=3, random_state=None):
        # Fix the random seed
        self.random_state = random_state
        np.random.seed(random_state)
        # Select random well weights, locations, and widths
        self.coeffs = np.random.random(n_wells)
        self.coeffs /= np.sum(self.coeffs)
        self.locs = np.random.randn(n_wells, d)
        self.widths = np.random.rand(n_wells)[None, :]

    def _gaussian_well(self, X, width=1):
        """A single gaussian well centered at 0 with width `width`"""
        return -np.exp(-np.sum((X / width) ** 2, axis=1))

    def loss(self, X):
        """
        Compute the loss landscape at points X

        Args:
            X (np.ndarray): points at which to compute the loss landscape. This should
                be of shape (n_batch, n_dim)

        Returns:
            np.ndarray: loss landscape at points X

        Notes:
            The loss landscape is computed as the weighted sum of the individual
            gaussian wells. The argument to np.sum has shape (n_batch, n_wells)
        """
        # Broadcast to shape (n_batch, n_dim, n_wells), evaluate each well, then
        # take the weighted sum over wells
        return np.sum(
            self._gaussian_well(X[..., None] - self.locs.T[None, :], self.widths) * self.coeffs,
            axis=1
        )

    def __call__(self, X):
        return self.loss(X)
loss = RandomLossLandscapeWithoutGradient(random_state=0, n_wells=8)
## We have to make a list of points at which to plot the landscape. NumPy's meshgrid is
## a built-in utility for this purpose.
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
print(X.shape)
Z = loss(X) # same as loss.loss(X) because class is callable
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(xx.ravel()[Z.argmin()], yy.ravel()[Z.argmin()], '*', markersize=20)
plt.axis('off')
(10000, 2)
(-3.3, 3.3, -3.3, 3.3)
Recall that many optimization routines requre the gradient of the landscape. We can analytically calculate the gradient of this landscape
$$ \mathcal{L} = \sum_{p=1}^P \mathcal{L}_p = -\sum_{p=1}^P \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) $$
$$ \nabla \mathcal{L} = \sum_{p=1}^P \frac{(\mathbf{x} - \boldsymbol{\mu}_{p})}{\sigma^2} \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) $$
We can implement both the landscape and its gradient within a single loss object. As we saw in univariate optimization, we will likely find ourselves calling the loss function frequently, and so we will try to make our implementation as efficient as possible. In our implementation, we vectorize the computation of the loss and gradient over the $P$ wells using np.einsum
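The cell defining this combined object does not appear above, but later cells subclass RandomLossLandscape and call loss.grad, so here is a minimal sketch consistent with that usage and with the gradient formula. The einsum subscripts, the factor of 2 (which compensates for the missing 1/2 in the exponent of _gaussian_well), and the re-instantiation of loss are our reconstruction:
class RandomLossLandscape(RandomLossLandscapeWithoutGradient):
    """Adds an analytic gradient to the random gaussian loss landscape"""

    def grad(self, X):
        """
        Compute the gradient of the loss at points X

        Args:
            X (np.ndarray): points of shape (n_batch, n_dim)

        Returns:
            np.ndarray: gradient of shape (n_batch, n_dim)
        """
        diffs = X[..., None] - self.locs.T[None, :]      # (n_batch, n_dim, n_wells)
        wells = self._gaussian_well(diffs, self.widths)  # (n_batch, n_wells)
        # d/du of -exp(-(u / w)^2) is -(2 u / w^2) * well(u); contract over wells
        return np.einsum(
            'bdw,bw,w->bd', -2 * diffs / self.widths ** 2, wells, self.coeffs
        )

loss = RandomLossLandscape(random_state=0, n_wells=8)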
Some considerations when performing multivariate optimization¶
Having a smooth landscape is important for many optimization algorithms to work well, because most local search methods require gradient information.
Even on a smooth landscape, nearly flat regions pose challenges because they provide almost no gradient signal. In our Gaussian landscape, the regions far away from the $P$ centers are flat, and so we expect optimizers initialized in these regions to struggle on this problem.
Computing analytic gradients is hard¶
- Computing analytic gradients is hard, and often impossible. We can use finite differences to approximate the gradient along each dimension:
$$ \frac{\partial f}{\partial x_i} \approx \frac{f(x_1, \ldots, x_i + \epsilon, \ldots, x_D) - f(x_1, \ldots, x_i - \epsilon, \ldots, x_D)}{2\epsilon} $$
- Finite difference gradients are computationally expensive: In $d$ dimensions, we need to evaluate the function $\mathcal{O}(d)$ times to compute the gradient. This is impractical for high-dimensional problems, but it is acceptable for our 2D landscape.
- Symbolic gradients are useful, but prone to errors
- We use a gradcheck function to mitigate the risk of errors in our gradients.
- (Later in the course): Faster gradient estimation for more complex loss functions with automatic differentiation (the symbolic chain rule for arbitrary compositions of primitive functions).
def gradcheck(loss, x, eps=1e-9):
    """
    This function computes the exact gradient and an approximate gradient using
    central finite differences

    Args:
        loss (callable): loss function. Must have a method `grad` that returns the
            analytic gradient
        x (np.ndarray): input to the loss function
        eps (float): step size for finite differences

    Returns:
        grad (np.ndarray): analytic gradient
        grad_num (np.ndarray): numerical gradient
    """
    x = np.array(x)
    grad = np.squeeze(loss.grad(x))
    grad_num = list()
    for i in range(x.shape[0]):
        x1, x2 = x.copy(), x.copy()
        x1[i] += eps
        x2[i] -= eps
        grad_num.append((loss(x1) - loss(x2)) / (2 * eps))
    # Squeeze so that both gradients have the matching shape (n_dim,)
    grad_num = np.squeeze(np.array(grad_num))
    return grad, grad_num
grad, grad_num = gradcheck(loss, np.random.randn(2))
print(f"True gradient: {grad.squeeze()}\nApproximate gradient: {grad_num.squeeze()}")
print(np.allclose(grad, grad_num, atol=1e-1))
True gradient: [ 0.00512768 -0.0059314 ] Approximate gradient: [ 0.00512768 -0.0059314 ] True
Why not use exhaustive global search?¶
Why bother using gradient descent when we can just check every point in our system's domain? That's literally how we generated the plot above.
For a continuous field, we need to pick some minimum length scale over which we don't expect the objective to change rapidly. For discrete optimization, we just try all combinations.
Assume we need $N$ points to sample along any linear dimension. The number of queries to the objective function scales as $N^d$, where $d$ is the dimensionality
- For the example we just used, we sampled $N = 100$ points along each dimension, resulting in $100^2 = 10^4$ function calls.
- Assuming $100$ points per axis, when $d = 3$ we have $10^6$ while for $d = 7$ it takes $10^{14}$, etc
- Let's assume it takes $10$ $\mu s$ per evaluation. The 2D global search takes $0.1$ s, the 3D search takes $10$ s, the $d = 5$ search takes $1.15$ days, and the $d = 7$ search takes $32$ years
What is a typical value of $d$? OpenAI's DallE image generation model contains $12$ billion parameters ($d \approx 10^{10}$).
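As a quick sanity check of the numbers above (a back-of-the-envelope sketch; the 100 points per axis and 10 $\mu s$ per evaluation are the assumptions stated in the list):
# Rough cost of an exhaustive grid search, assuming 100 points per axis
# and 10 microseconds per function evaluation
n_per_axis, t_eval = 100, 1e-5
for d in [2, 3, 5, 7]:
    total_seconds = n_per_axis**d * t_eval
    print(f"d = {d}: {n_per_axis**d:.0e} calls, {total_seconds / 86400:.2g} days")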
x = np.linspace(-3, 3, 10)
y = np.linspace(-3, 3, 10)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss(X) # same as loss.loss(X) because class is callable
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z, s=2000)
plt.plot(xx.ravel()[Z.argmin()], yy.ravel()[Z.argmin()], '*r', markersize=10)
plt.axis('off')
(-3.3, 3.3, -3.3, 3.3)
Optimizing in many dimensions¶
- We will now implement a series of optimization algorithms to find the minima of our Gaussian landscape
- We will first define a base class for our optimizers, which will contain the common functionality of all optimizers. Many optimization algorithms are iterative, meaning that they start out at some (usually randomly-chosen) point on the landscape, and then incrementally update their position in order to reach the minimum.
- The constructor will take a loss function itself, a learning rate that controls the rate of iterative updating, a maximum number of iterations, a tolerance that determines how close the optimization needs to get to a minimum to stop, a random seed, and a flag to store the history of steps taken during optimization
The base class will contain a fit method that performs the optimization, and an update method that will be implemented by each subclass
class BaseOptimizer:
    """A base class for multivariate iterative optimizers

    Args:
        loss (callable): loss function. Must have a method `grad` that returns the
            analytic gradient
        lr (float): learning rate
        max_iter (int): maximum number of iterations
        tol (float): tolerance for stopping criterion
        random_state (int): random seed
        store_history (bool): whether to store the optimization trajectory
    """
    def __init__(self, loss, lr=0.1, max_iter=1000, tol=1e-6, random_state=None, store_history=False):
        self.loss = loss
        self.lr = lr
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        np.random.seed(random_state)
        self.store_history = store_history
        if self.store_history:
            self.Xs = []
            self.losses = []

    def fit(self, X):
        """Fit the optimizer to the data

        Args:
            X (np.ndarray): initial guess for the optimizer. Shape (n_dim,) or
                (n_batch, n_dim) if we have multiple initial points

        Returns:
            self
        """
        self.X = X
        if self.store_history:
            self.Xs = [self.X.copy()]
            self.losses = [self.loss(X)]
        for i in range(self.max_iter):
            self.X = self.update(self.X)
            if self.store_history:
                self.Xs.append(self.X.copy())
                self.losses.append(self.loss(self.X))
            # Stop early once the gradient norm, aggregated across all batch
            # elements, falls below the tolerance
            if np.linalg.norm(self.loss.grad(self.X)) < self.tol:
                break
        return self

    def update(self, X):
        raise NotImplementedError("Implement this method in a subclass")
Multivariate gradient descent¶
- A first-order optimization method because it only requires the first partial derivatives of the objective function
- If we assume that our current position is $\mathbf{x}$, then the gradient of the objective function at this point is $\nabla \mathcal{L}(\mathbf{x})$
The gradient descent update rule becomes $$\mathbf{x} \leftarrow \mathbf{x} - \eta \nabla \mathcal{L}(\mathbf{x})$$ where $\eta$ is a step size parameter we call the "learning rate" that determines the size of the step we take along the negative gradient direction
class GradientDescent(BaseOptimizer):
    """A Multivariate Gradient Descent Optimizer"""
    def update(self, X):
        return X - self.lr * self.loss.grad(X)
# Initialize optimizer
optimizer = GradientDescent(loss, lr=4.3/10000, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize many starting points
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.GradientDescent at 0x17fc7b210>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*optimizer.X.T, '.')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
We next plot the loss versus time for each of the starting points.
plt.figure()
plt.plot(optimizer.losses, color=(0.7, 0.7, 0.7), lw=1, alpha=0.2)
plt.plot(np.mean(optimizer.losses, axis=1), 'k', lw=3)
plt.xlabel('Iteration')
plt.ylabel('Loss')
Text(0, 0.5, 'Loss')
Xs = np.array(optimizer.Xs)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], '-');
# plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.');
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.')
    plt.plot(Xs[:i, :, 0], Xs[:i, :, 1])
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.b')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
Hyperparameter tuning: What is the correct learning rate?¶
Hyperparameter tuning is a big deal in machine learning; we will revisit this topic later in the course
For now, we will use intuition to see how the learning rate and other hyperparameters affect the dynamics of the system
Momentum and stochasticity¶
We can think of gradient descent as the dynamics of a first-order overdamped system:
For a particle of mass $m$ in a potential $\mathcal{U}(\mathbf{x})$, the forces acting on the particle are given by Newton's second law: $$ \mathbf{F} = -\nabla \mathcal{U}(\mathbf{x}) $$
$$ \mathbf{a} = \frac{\mathbf{F}}{m} $$
If we assume linear damping with a damping coefficient $\gamma$, then the dynamics of the particle are given by the kinematic equation:
$$ m \ddot{\mathbf{x}} = -\gamma \dot{\mathbf{x}} - \nabla \mathcal{U}(\mathbf{x}) $$
If we assume $\gamma \gg m$, then the overdamped dynamics are given by the first-order equation of motion: $$ \dot{\mathbf{x}} = -\frac{1}{\gamma}\nabla \mathcal{U}(\mathbf{x}) $$
In discrete time, the derivatives $\dot{\mathbf{x}}$ can be approximated using finite differences: $$ \dot{\mathbf{x}} \approx \frac{\mathbf{x}_{t+1} - \mathbf{x}_t}{\Delta t} $$
In discrete time, this equation corresponds to the update rule: $$ \mathbf{x}_{t+1} = \mathbf{x}_t - \eta \nabla \mathcal{U}(\mathbf{x}_t) $$ where $\eta = \Delta t / \gamma$ is the learning rate.
Momentum¶
- What if we want to take larger steps in the direction of the gradient?
In the overdamped limit, we can think of the gradient descent update rule as a first-order system with a damping coefficient $\gamma$. What if we are not in the overdamped limit?
The kinematic equation remains: $$ m \ddot{\mathbf{x}} = -\gamma \dot{\mathbf{x}} - \nabla \mathcal{U}(\mathbf{x}) $$
In discrete time, the derivatives $\dot{\mathbf{x}}$ and $\ddot{\mathbf{x}}$ can be approximated using finite differences: $$ \dot{\mathbf{x}} \approx \frac{\mathbf{x}_{t+1} - \mathbf{x}_t}{\Delta t} $$ $$ \ddot{\mathbf{x}} \approx \frac{\mathbf{x}_{t+1} - 2\mathbf{x}_t + \mathbf{x}_{t-1}}{\Delta t^2} $$
We can therefore add a momentum term to the update rule: $$ \mathbf{x}_{t+1} = \mathbf{x}_t - \eta \nabla \mathcal{U}(\mathbf{x}_t) + \beta \left(\mathbf{x}_{t} - \mathbf{x}_{t-1}\right) $$ where $\beta$ is a momentum parameter proportional to mass.
class GradientDescentMomentum(BaseOptimizer):
    """A multivariate gradient descent optimizer with momentum"""
    def __init__(self, loss, momentum=0.9, **kwargs):
        super().__init__(loss, **kwargs)
        self.momentum = momentum
        self.v = None

    def update(self, X):
        if self.v is None:
            self.v = np.zeros_like(X)
        # Accumulate a velocity that mixes the previous step with the new gradient
        self.v = self.momentum * self.v - self.lr * self.loss.grad(X)
        return X + self.v
# Initialize optimizer
optimizer = GradientDescentMomentum(loss, lr=0.1, momentum=0.9*5, max_iter=2000, tol=1e-6,
random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.GradientDescentMomentum at 0x29ef64610>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*optimizer.X.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
plt.figure()
plt.plot(optimizer.losses, color=(0.7, 0.7, 0.7), lw=1, alpha=0.2)
plt.plot(np.mean(optimizer.losses, axis=1), 'k', lw=3)
plt.xlabel('Iteration')
plt.ylabel('Loss')
Text(0, 0.5, 'Loss')
Xs = np.array(optimizer.Xs)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(Xs[0, :, 0], Xs[0, :, 1]);
plt.plot(Xs[:, :, 0], Xs[:, :, 1]);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.b');
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.k')
    plt.plot(Xs[:i, :, 0], Xs[:i, :, 1], 'r')
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.b')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
Stochastic Gradient Descent¶
We can also add stochasticity to the gradient descent update rule
$$ \mathbf{x} \leftarrow \mathbf{x} - \eta \nabla \mathcal{L}(\mathbf{x}) + \boldsymbol{\epsilon_t} $$ where $\epsilon_t$ is a random noise term
Dynamical interpretation:¶
We can add a time-dependent noise term to our overdamped dynamical system for gradient descent. $$ \dot{\mathbf{x}} = -\frac{1}{\gamma}\nabla \mathcal{U}(\mathbf{x}) + \boldsymbol{\xi_t} $$
Usually we assume that the noise term is a Langevin process, such as the random "kicks" encountered during the Brownian motion of a particle in a fluid. For each component of the vector $\boldsymbol{\xi_t}$, $$ \langle \xi_t \xi_s \rangle = 2 \gamma \delta(t-s) $$ $$ \langle \xi_t \rangle = 0 $$ Over discrete time steps, this corresponds to the noise vector having a Gaussian distribution with zero mean. We also assume that the different dimensions of the noise force are uncorrelated (the covariance matrix is diagonal).
class StochasticGradientDescent(BaseOptimizer):
    """A multivariate gradient descent optimizer with gradient noise"""
    def __init__(self, loss, noise, **kwargs):
        super().__init__(loss, **kwargs)
        self.noise = noise

    def update(self, X):
        grad = self.loss.grad(X)
        # Perturb the gradient with isotropic gaussian noise
        noisy_grad = grad + self.noise * np.random.randn(*grad.shape)
        return X - self.lr * noisy_grad
A Caveat: SGD is different in machine learning¶
- Our stochastic gradient descent update rule adds random noise directly to the gradient at each step
- The stochastic gradient descent rules often encountered in machine learning models randomly sample a minibatch of training data points. Effectively, the loss landscape of ML models is a noisy version of the loss landscape we have been considering so far, and it is approximated by the training data---so randomly subsampling data is equivalent to randomly sampling the true loss landscape
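To make the distinction concrete, here is a minimal sketch of minibatch SGD on a hypothetical least-squares problem (the data and names here are illustrative, not part of the notebook's landscape). The gradient of a mean-over-data loss is estimated from a random minibatch, so the noise enters through subsampling rather than through an explicit noise term:
# Minibatch SGD on a hypothetical least-squares problem: fit y = w * x
rng = np.random.default_rng(0)
x_data = rng.uniform(-1, 1, 500)
y_data = 2.0 * x_data + 0.1 * rng.standard_normal(500)
w = 0.0
for step in range(200):
    idx = rng.choice(500, size=32, replace=False)  # random minibatch
    # Unbiased (but noisy) estimate of the full gradient of the mean squared error
    grad_w = np.mean(2 * (w * x_data[idx] - y_data[idx]) * x_data[idx])
    w -= 0.1 * grad_w
print(w)  # converges near the true slope of 2.0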
# Initialize optimizer
optimizer = StochasticGradientDescent(loss, lr=0.1, noise=0.2, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.StochasticGradientDescent at 0x12b4df710>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*optimizer.X.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
plt.figure()
plt.plot(optimizer.losses, color=(0.7, 0.7, 0.7), lw=1, alpha=0.2)
plt.plot(np.mean(optimizer.losses, axis=1), 'k', lw=3)
plt.xlabel('Iteration')
plt.ylabel('Loss')
Text(0, 0.5, 'Loss')
Xs = np.array(optimizer.Xs)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1]);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.b');
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.k')
    plt.plot(Xs[:i, :, 0], Xs[:i, :, 1], 'r')
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.b')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
Constrained optimization with projected gradient descent¶
- So far, the individual components of our vector $\mathbf{x}$ have been updated independently of each other. They only interact in the sense that together they determine the value of the loss function $\mathcal{L}(\mathbf{x})$, from which we compute the gradient.
- What if we want to impose constraints on the components of $\mathbf{x}$?
Example: The squared elements of $\mathbf{x}$ must sum to one. This is a common constraint when we are optimizing over unit directions in a space. (For probability distributions, the analogous constraint is $\sum_i x_i = 1$ with $x_i \geq 0$.) $$ \sum_i x_i^2 = 1 $$
If we denote the constraint set as $\mathcal{S} = \{ \mathbf{x} \in \mathbb{R}^n : \sum_i x_i^2 = 1 \}$, then we can project a vector $\mathbf{x}$ onto the set by solving the following optimization problem: $$ \text{Proj}_{\mathcal{S}}(\mathbf{x}) = \arg \min_{\mathbf{y} \in \mathcal{S}} \|\mathbf{y} - \mathbf{x}\|^2 $$ For the unit-norm set, this projection has the closed form $\text{Proj}_{\mathcal{S}}(\mathbf{x}) = \mathbf{x}/\|\mathbf{x}\|$.
Projected gradient descent¶
We enforce the constraint by projecting the output of the gradient descent update onto the set at each step $$ \mathbf{x} \leftarrow \mathbf{x} - \eta \nabla \mathcal{L}(\mathbf{x}) $$ followed by the projection $$ \mathbf{x} \leftarrow \text{Proj}_{\mathcal{S}}(\mathbf{x}) $$
class ProjectedGradientDescent(BaseOptimizer):
    """A multivariate gradient descent optimizer with projection onto the unit sphere"""
    def __init__(self, loss, **kwargs):
        super().__init__(loss, **kwargs)

    def update(self, X):
        # Project onto the constraint set, take a gradient step at the projected
        # point, then project the result back onto the set
        X = self.project(X)
        grad = self.loss.grad(X)
        return self.project(X - self.lr * grad)

    def project(self, X):
        """Project onto the set of points with unit Euclidean norm"""
        X = X.copy()
        X /= np.linalg.norm(X, axis=1, keepdims=True)
        return X
# Initialize optimizer
optimizer = ProjectedGradientDescent(loss, lr=0.1, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.ProjectedGradientDescent at 0x17a48a110>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.k')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
Xs = np.array(optimizer.Xs)
plt.scatter(X[:, 0], X[:, 1], c=Z, zorder=-2)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.k');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], 'r', zorder=-1);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.b', markersize=10);
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.r')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
What about inequality constraints?¶
- What if we don't strictly require that the individual elements $x_i$ add up to something, but instead only that they are bounded?
- For example, $\sum_i x_i^2 \leq 1$ or $0 \leq x_i \leq 1$?
- Here we are constrained to a subdomain, not a lower-dimensional subspace. This is the setting for interior point methods; for simple bounds, projection still works, as sketched after this list
- See the textbook by Boyd and Vandenberghe for discussion of these methods
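For simple bound constraints, projection remains easy because it reduces to clipping each coordinate. A minimal sketch of one projected step under the hypothetical box constraint $0 \leq x_i \leq 1$ (this is a contrast to interior point methods, not an implementation of them):
# One projected gradient step under box constraints (clipping is the projection)
X_box = np.random.random(size=(100, 2))   # start inside the box
X_step = X_box - 0.1 * loss.grad(X_box)   # plain gradient step
X_proj = np.clip(X_step, 0.0, 1.0)        # project back into [0, 1]^2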
Equality constrained optimization with Lagrange multipliers¶
We can also enforce equality constraints by adding a Lagrange multiplier term to the loss function. For example, for our unit norm constraint, we augment the loss function to the following, $$ \mathcal{L}^{(a)}(\mathbf{x}, \lambda) = \mathcal{L}(\mathbf{x}) + \lambda \left( \mathbf{x}^T \mathbf{x} - 1 \right) $$ where $\lambda$ is a Lagrange multiplier.
We can then solve the following optimization problem: $$ \mathbf{x}^* = \arg \min_{\mathbf{x}} \mathcal{L}^{(a)}(\mathbf{x}, \lambda) $$
- The solution to this problem is given by the following update rule: $$ \mathbf{x} \leftarrow \mathbf{x} - \eta \nabla \mathcal{L}^{(a)}(\mathbf{x}, \lambda) $$ As well as an update rule for the Lagrange multiplier: $$ \lambda \leftarrow \lambda + \eta \nabla_\lambda \mathcal{L}^{(a)}(\mathbf{x}, \lambda) $$ Note that we are doing gradient ascent on the Lagrange multiplier: the point we seek is a saddle of $\mathcal{L}^{(a)}$, minimized over $\mathbf{x}$ but maximized over $\lambda$. This algorithm represents a version of Lagrangian duality.
For our example, we write the gradient of the augmented loss function as $$ \nabla \mathcal{L}^{(a)}(\mathbf{x}, \lambda) = \nabla \mathcal{L}(\mathbf{x}) + 2 \lambda \mathbf{x} $$ and the gradient with respect to the Lagrange multiplier is $$ \nabla_\lambda \mathcal{L}^{(a)}(\mathbf{x}, \lambda) = \mathbf{x}^T \mathbf{x} - 1 $$ In the implementation below, the constant factor of $2$ is absorbed into $\lambda$.
class GradientDescentLagrange(BaseOptimizer):
    """A multivariate gradient descent optimizer with Lagrange multipliers"""
    def __init__(self, loss, lam, **kwargs):
        super().__init__(loss, **kwargs)
        self.lam = lam  # initial value of the Lagrange multiplier

    def update(self, X):
        grad = self.loss.grad(X)
        # Gradient ascent on the multiplier: its gradient is the constraint violation
        self.lam = self.lam + self.lr * (np.einsum('ij,ij->i', X, X) - 1)
        # Gradient descent on x (the factor of 2 is absorbed into lam)
        return X - self.lr * (grad + self.lam[..., None] * X)
# Initialize optimizer
optimizer = GradientDescentLagrange(loss, lr=0.1, lam=0.5, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.GradientDescentLagrange at 0x17f44e6d0>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
Xs = np.array(optimizer.Xs)
plt.scatter(X[:, 0], X[:, 1], c=Z, zorder=-2)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], zorder=-1);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.b', markersize=10);
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.')
    # Optional overlay of the unit-circle constraint set:
    # unit_circle = np.random.random((2000, 2)) - 0.5
    # unit_circle = unit_circle / np.linalg.norm(unit_circle, axis=1, keepdims=True)
    # plt.plot(*unit_circle.T, '.k', markersize=0.5)
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
First-order methods widely-used today¶
- Stochastic gradient descent remains surprisingly effective. Remember that the stochasticity in ML comes from approximating our loss landscape from a subset of the data, not from the noise term in the update rule
- Adam, Adagrad, RMSprop, etc. are all variants of stochastic gradient descent. We track gradients over time, and use them to adapt the learning rate and gradient in various ways.
Second order methods¶
- So far, we have only been using methods in which we take the first derivative of the loss function.
- As we saw in the 1D case, we can converge much faster by using information from the second derivative of the loss function to adjust our learning rate
For a loss function $\mathcal{L}(\mathbf{x}) \in \mathbb{R}$ computed at location $\mathbf{x} \in \mathbb{R}^d$, the second derivative is given by the Hessian matrix, $\mathbf{H} = \nabla^2 \mathcal{L}(\mathbf{x}) \in \mathbb{R}^{d \times d}$
Writing this out in matrix form, we have $$ \mathbf{H} = \begin{bmatrix} \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_1^2} & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_1 \partial x_2} & \cdots & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_1 \partial x_d} \\ \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_2 \partial x_1} & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_2^2} & \cdots & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_2 \partial x_d} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_d \partial x_1} & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_d \partial x_2} & \cdots & \frac{\partial^2 \mathcal{L}(\mathbf{x})}{\partial x_d^2} \end{bmatrix} $$
Recalling our intuition from the 1D case, we can see that the Hessian matrix is a measure of how steeply curved the loss function appears at a given point. If the Hessian is positive definite, then the loss function is locally convex, and we can use the second derivative to adjust our learning rate.
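For instance, a quick local-convexity test is to check the signs of the Hessian eigenvalues. A small illustrative example (this matrix is made up, not taken from the landscape):
# A point is in a locally convex region when all Hessian eigenvalues are positive
H_example = np.array([[2.0, 0.5],
                      [0.5, 1.0]])
print(np.linalg.eigvalsh(H_example))                # both positive
print(np.all(np.linalg.eigvalsh(H_example) > 0))    # True -> locally convex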
The Hessian of the Random Gaussian well landscape¶
Recall that our test landscape is a sum of Gaussian wells, $$ \mathcal{L} = \sum_{p=1}^P \mathcal{L}_p = -\sum_{p=1}^P \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) $$ where $\mathbf{x} \in \mathbb{R}^d$ is the location in the landscape, $\boldsymbol{\mu}_p \in \mathbb{R}^d$ is the center of the $p$th well, and $\sigma$ is the width of the well.
The gradient of the loss function is given by $$ \nabla \mathcal{L} = \sum_{p=1}^P \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) \frac{(\mathbf{x} - \boldsymbol{\mu}_{p})}{\sigma^2} $$
The Hessian matrix is given by $$ \mathbf{H} = \sum_{p=1}^P \exp\left(-\frac{(\mathbf{x} - \boldsymbol{\mu}_{p})^2}{2\sigma^2}\right) \left(\frac{1}{\sigma^2} \mathbf{I} - \frac{(\mathbf{x} - \boldsymbol{\mu}_{p})(\mathbf{x} - \boldsymbol{\mu}_{p})^T}{\sigma^4}\right) $$ Note the order of the last product, which is an outer product and not a contracting inner product.
We add our derived implementation of the Hessian matrix to our loss object using subclassing.
class RandomLossLandscapeWithHessian(RandomLossLandscape):
    """
    Subclass of the Random Gaussian Loss Landscape that adds an analytic Hessian calculation
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Everything is passed to the RandomLossLandscape constructor

    def _hessian_gaussian_well(self, X, width=1):
        """Hessian of a single gaussian well, for displacements X of shape (n, d, n_wells)"""
        oprod = np.einsum('ijk,imk->ijmk', X, X)     # outer products, (n, d, d, n_wells)
        iden = np.eye(X.shape[1])[None, ..., None]   # identity, (1, d, d, 1)
        # The factors of 2 and 4 come from differentiating exp(-(u / w)^2) twice,
        # consistent with the convention used in _gaussian_well
        return (-2 * iden + 4 * oprod / width**2) / width**2 * self._gaussian_well(X, width)[:, None, None, :]

    def hessian(self, X):
        """Weighted sum of the per-well Hessians, shape (n, d, d)"""
        return np.einsum(
            '...i,i->...',
            self._hessian_gaussian_well(X[..., None] - self.locs.T[None, :], self.widths),
            self.coeffs
        )
loss = RandomLossLandscapeWithHessian(random_state=0, n_wells=8)
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss(X) # same as loss.loss(X) because class is callable
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(xx.ravel()[Z.argmin()], yy.ravel()[Z.argmin()], '*r', markersize=10)
plt.axis('off')
plt.figure(figsize=(8, 8))
eig_dirs = np.linalg.eigvalsh(loss.hessian(X))
signed_log = lambda x: np.sign(x) * np.log(1 + np.abs(x))
plt.scatter(X[:, 0], X[:, 1], c=np.log10(np.abs(np.mean(eig_dirs, axis=1))))
plt.plot(xx.ravel()[Z.argmin()], yy.ravel()[Z.argmin()], '*r', markersize=10)
plt.axis('off')
plt.figure(figsize=(8, 8))
x = np.linspace(-3, 3, 15)
y = np.linspace(-3, 3, 15)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.quiver(X[:, 0], X[:, 1], loss.grad(X)[:, 0], loss.grad(X)[:, 1], Z, scale=1e0)
# plt.streamplot(x, y, loss.grad(X)[:, 0].reshape(100, 100), loss.grad(X)[:, 1].reshape(100, 100), color=Z.reshape(100, 100))
plt.axis('off')
(-3.3, 3.3, -3.3, 3.3)
The multivariate Newton's method¶
Newton's method is a second order method for unconstrained optimization
The update rule is given by $$ \mathbf{x} \leftarrow \mathbf{x} - \mathbf{H}^{-1} \nabla \mathcal{L}(\mathbf{x}) $$ where $\mathbf{H}$ is the Hessian evaluated at the current point $\mathbf{x}$
We can interpret this as scaling the size of a gradient descent step by the curvature of the loss function, and rotating the step to align with the eigenvectors of the Hessian matrix. Aligning with the eigenvectors of the Hessian results in straighter paths to the minimum.
class MultivariateNewtonsMethod(BaseOptimizer):
    """A multivariate Newton's method optimizer"""
    def update(self, X):
        grad = self.loss.grad(X)
        # Adding a small ridge to the diagonal of the Hessian helps with
        # numerical stability when inverting
        hess = self.loss.hessian(X) + 1e-12 * np.eye(X.shape[1])[None, ...]
        # Invert the Hessian of each batch element
        ihess = np.linalg.inv(hess)
        # Because we want to operate on the last axis, we use einsum for the
        # batched matrix-vector product (note that Newton's method takes the
        # full step and does not use the learning rate)
        return X - np.einsum('ink,ik->in', ihess, grad, optimize=True)
# Initialize optimizer
optimizer = MultivariateNewtonsMethod(loss, max_iter=1000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# X0 = np.random.random(size=(100, 2))
# Fit optimizer
optimizer.fit(X0.copy())
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
Xs = np.array(optimizer.Xs)
plt.scatter(X[:, 0], X[:, 1], c=Z, zorder=-2)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], zorder=-1);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.b', markersize=10);
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
plt.figure()
plt.plot(optimizer.losses, color=(0.7, 0.7, 0.7), lw=1, alpha=0.2)
plt.plot(np.mean(optimizer.losses, axis=1), 'k', lw=3)
plt.xlabel('Iteration')
plt.ylabel('Loss')
Text(0, 0.5, 'Loss')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=929), Output()), _…
<function __main__.plotter(i)>
The results are underwhelming¶
Newton's method behaves erratically here because our objective function is non-convex, so convergence is not guaranteed. Notice that points close to the deepest well had no problem; it's the points in between wells that struggle, because these are nearly flat regions with extremely low curvature.
The Hessian matrix is not positive definite in some regions, so Newton's method can overshoot or even step uphill there. We can use quasi-Newton methods, but they are likewise not guaranteed to converge to the global minimum.
We can try adding momentum to the update rule
The spectrum of the Hessian¶
- The eigenvalues of the Hessian matrix are a measure of how steeply the loss function changes in each direction
- For a harmonic potential, they give us the stiffness of the potential in each direction
Taylor expand the potential around a nearby point $\mathbf{x}'$ $$ \mathcal{L}(\mathbf{x}) \approx \mathcal{L}(\mathbf{x'}) + \nabla \mathcal{L}(\mathbf{x'})^T (\mathbf{x} - \mathbf{x'}) + \frac{1}{2} (\mathbf{x} - \mathbf{x'})^T \mathbf{H}(\mathbf{x'}) (\mathbf{x} - \mathbf{x'}) + ... $$ where $\mathbf{x} - \mathbf{x'} \in \mathbb{R}^d$ is a small displacement from the expansion point.
Imagine that $\mathbf{x} - \mathbf{x'}$ is a small displacement along one of the eigenvector axes of a multivariate harmonic potential. The size of the second-order term is directly proportional to the eigenvalue associated with that direction.
But different directions have different eigenvalues, so the size of the second-order term depends on the direction of the displacement.
For very oblong potentials, the difference along various directions can be dramatic. This can be quantified by the condition number, the ratio of the largest to smallest eigenvalue magnitudes: $$ \kappa(\mathbf{H}) = \frac{\left|\lambda_{\text{max}}(\mathbf{H})\right|}{\left|\lambda_{\text{min}}(\mathbf{H})\right|} $$
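We can evaluate this directly from our landscape's batched Hessian. A short sketch (the two probe points are arbitrary choices):
# Condition number of the Hessian at a couple of probe points
X_probe = np.array([[0.0, 0.0], [2.0, 2.0]])
eigs = np.linalg.eigvalsh(loss.hessian(X_probe))  # shape (2, 2), ascending order
kappa = np.abs(eigs).max(axis=1) / np.abs(eigs).min(axis=1)
print(kappa)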
Optimally, we'd have a separate learning rate for each direction in space, but this becomes computationally expensive in high dimensions. Instead, we can use the condition number to adapt the learning rate.
One option is to adjust the learning rate based on the condition number
$$ \mathbf{x} \leftarrow \mathbf{x} - \frac{\eta}{\kappa(\mathbf{H}) + \epsilon} \nabla \mathcal{L}(\mathbf{x}) $$ where $\eta$ is the initial learning rate, $\kappa(\mathbf{H})$ is the condition number of the Hessian matrix, and $\epsilon$ is a small constant to prevent division by zero.
Another option is to adjust the learning rate based on only the largest Hessian eigenvalue (we can use the power method to find this quickly) $$ \mathbf{x} \leftarrow \mathbf{x} - \frac{\eta}{\lambda_{\text{max}}} \nabla \mathcal{L}(\mathbf{x}) $$ where $\lambda_{\text{max}}$ is the largest eigenvalue of the Hessian matrix
Another option is to scale using the smallest eigenvalue as well: $$ \mathbf{x} \leftarrow \mathbf{x} - \frac{\eta}{|\lambda_{\text{max}}| + |\lambda_{\text{min}}|} \nabla \mathcal{L}(\mathbf{x}) $$ where $\lambda_{\text{min}}$ is the smallest eigenvalue of the Hessian matrix
class GradientDescentAdaptive(BaseOptimizer):
    """A gradient descent optimizer that scales the learning rate by the local curvature"""
    def __init__(self, loss, **kwargs):
        super().__init__(loss, **kwargs)

    def update(self, X):
        grad = self.loss.grad(X)
        hess = self.loss.hessian(X) + 1e-12 * np.eye(X.shape[1])[None, ...]
        # Largest absolute Hessian eigenvalue for each batch element, (n_batch, 1);
        # the small constant prevents division by zero in flat regions
        eigvals = np.linalg.eigvalsh(hess)
        scale = np.abs(eigvals).max(axis=1, keepdims=True) + 1e-12
        return X - self.lr * grad / scale
# Initialize optimizer
optimizer = GradientDescentAdaptive(loss, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer"
optimizer.fit(X0.copy())
<__main__.GradientDescentAdaptive at 0x17a746a10>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
Xs = np.array(optimizer.Xs)
plt.scatter(X[:, 0], X[:, 1], c=Z, zorder=-2)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], zorder=-1);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.r', markersize=10);
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
## Make an interactive video
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
Xs = np.array(optimizer.Xs)
def plotter(i):
    plt.figure(figsize=(6, 6))
    plt.scatter(X[:, 0], X[:, 1], c=Z)
    plt.plot(Xs[i, :, 0], Xs[i, :, 1], '.')
    plt.xlim([-3, 3])
    plt.ylim([-3, 3])
    plt.axis('off')
    plt.show()
interact(
plotter,
i=widgets.IntSlider(0, 0, Xs.shape[0] - 1, 1, layout=Layout(width='800px'))
)
interactive(children=(IntSlider(value=0, description='i', layout=Layout(width='800px'), max=2000), Output()), …
<function __main__.plotter(i)>
Quasi-Newton methods¶
Computing the Hessian matrix is computationally expensive in high dimensions, and computing its inverse is even more so
Recall that solving a linear system took $\mathcal{O}(N^3)$ operations due to matrix inversion
We can reduce this cost by approximating the Hessian matrix with a low-rank approximation
A popular algorithm in this class is L-BFGS (Limited-memory Broyden–Fletcher–Goldfarb–Shanno algorithm). This uses an iterative method to approximate the inverse Hessian at each step
This requires an interior loop within our optimization, so we hope that the inner approximation loop converges quickly and terminates after a few steps
class LBFGS(BaseOptimizer):
    """A limited-memory quasi-Newton optimizer. For clarity, we form each batch
    element's dense approximate inverse Hessian from a short history of steps and
    gradient changes; production L-BFGS instead uses a two-loop recursion that
    never forms the matrix explicitly.
    """
    def __init__(self, loss, memory=10, **kwargs):
        super().__init__(loss, **kwargs)
        self.memory = memory
        self.grad_hist = []  # history of gradient changes y_k
        self.s_hist = []     # history of steps s_k

    def update(self, X):
        grad = self.loss.grad(X)
        if hasattr(self, 'X_prev'):
            s = X - self.X_prev        # step taken, (n_batch, n_dim)
            y = grad - self.grad_prev  # change in gradient, (n_batch, n_dim)
            if len(self.grad_hist) >= self.memory:
                self.grad_hist.pop(0)
                self.s_hist.pop(0)
            self.grad_hist.append(y)
            self.s_hist.append(s)
        Hk = self.get_inverse_hessian_approximation(X)
        pk = -np.einsum('bij,bj->bi', Hk, grad)  # batched matrix-vector product
        self.X_prev, self.grad_prev = X.copy(), grad.copy()
        return X + self.lr * pk

    def get_inverse_hessian_approximation(self, X):
        """
        Compute the approximate inverse Hessian for each batch element by applying
        the BFGS update formula over the stored (step, gradient-change) history
        """
        n_batch, d = X.shape
        H = np.tile(np.eye(d), (n_batch, 1, 1))  # initial approximation (identity)
        if len(self.s_hist) > 0:
            # Scale the initial approximation using the most recent curvature pair
            s, y = self.s_hist[-1], self.grad_hist[-1]
            gamma = np.einsum('bi,bi->b', s, y) / (np.einsum('bi,bi->b', y, y) + 1e-12)
            H = H * np.abs(gamma)[:, None, None]
        for s, y in zip(self.s_hist, self.grad_hist):  # oldest to newest
            sy = np.einsum('bi,bi->b', s, y)
            # Guard against tiny curvature (our landscape is not convex)
            rho = 1.0 / np.where(np.abs(sy) > 1e-12, sy, 1e-12)
            # BFGS update: H <- (I - rho s y^T) H (I - rho y s^T) + rho s s^T
            V = np.eye(d)[None, ...] - rho[:, None, None] * np.einsum('bi,bj->bij', y, s)
            H = np.transpose(V, (0, 2, 1)) @ H @ V + rho[:, None, None] * np.einsum('bi,bj->bij', s, s)
        return H
loss = RandomLossLandscapeWithHessian(random_state=0, n_wells=8)
# Initialize optimizer
optimizer = LBFGS(loss, lr=0.1, max_iter=2000, tol=1e-6, random_state=0, store_history=True)
# Initialize starting point
X0 = 6 * np.random.random(size=(100, 2)) - 3
# Fit optimizer
optimizer.fit(X0.copy())
<__main__.LBFGS at 0x29c759810>
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
xx, yy = np.meshgrid(x, y)
X = np.array([xx.ravel(), yy.ravel()]).T
Z = loss.loss(X)
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], c=Z)
plt.plot(*X0.T, '.')
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Initial Guesses')
plt.figure(figsize=(8, 8))
Xs = np.array(optimizer.Xs)
plt.scatter(X[:, 0], X[:, 1], c=Z, zorder=-2)
plt.plot(Xs[0, :, 0], Xs[0, :, 1], '.');
plt.plot(Xs[:, :, 0], Xs[:, :, 1], zorder=-1);
plt.plot(Xs[-1, :, 0], Xs[-1, :, 1], '.r', markersize=10);
plt.xlim([-3, 3])
plt.ylim([-3, 3])
plt.axis('off')
plt.title('Final Guesses')
Text(0.5, 1.0, 'Final Guesses')
Bakeoff: Which method is best?¶
We can compare the performance of the different methods by running them on the same problem
We compute two metrics: the final loss value, and the total amount of walltime required to reach that loss value
A more accurate measure of performance might count individual operations and memory usage, rather than aggregate walltime, but this should give us an idea of the tradeoffs involved
import time
loss = RandomLossLandscapeWithHessian(random_state=0, n_wells=8)
X0 = 6 * np.random.random(size=(500, 2)) - 3
optimizer_kwargs = {"max_iter": 5000, "lr": 0.1, "tol": 1e-6, "random_state": 0, "store_history": True}
optimizer_list = [
GradientDescent(loss, **optimizer_kwargs),
GradientDescentMomentum(loss, **optimizer_kwargs),
StochasticGradientDescent(loss, noise=0.2, **optimizer_kwargs),
# ProjectedGradientDescent(loss, **optimizer_kwargs),
# GradientDescentLagrange(loss, lam=0.5, **optimizer_kwargs),
MultivariateNewtonsMethod(loss, **optimizer_kwargs),
GradientDescentAdaptive(loss, **optimizer_kwargs),
LBFGS(loss, **optimizer_kwargs),
]
all_losses = []
all_walltimes = []
for optimizer in optimizer_list:
    start_time = time.time()
    optimizer.fit(X0.copy())
    stop_time = time.time()
    final_loss = np.mean(optimizer.losses, axis=1)[-1]
    all_losses.append(final_loss)
    all_walltimes.append(stop_time - start_time)
all_losses = np.array(all_losses)
all_walltimes = np.array(all_walltimes)
optimizer_names = np.array([type(o).__name__ for o in optimizer_list])
plt.figure(figsize=(6, 6))
sorted_losses = np.argsort(all_losses)
plt.barh(optimizer_names[sorted_losses], all_losses[sorted_losses])
plt.xlabel('Final Loss')
plt.ylabel('Optimizer')
plt.figure(figsize=(6, 6))
sorted_times = np.argsort(all_walltimes)
plt.barh(optimizer_names[sorted_times], all_walltimes[sorted_times])
plt.xlabel('Wall Time')
plt.ylabel('Optimizer')
plt.figure(figsize=(6, 6))
plt.scatter(all_losses, all_walltimes)
## label optimizers
for i in range(len(optimizer_names)):
    plt.annotate(optimizer_names[i], (all_losses[i], all_walltimes[i]))
plt.xlabel('Final Loss')
plt.ylabel('Wall Time')
Text(0, 0.5, 'Wall Time')
What to do without any derivative functions?¶
What about calculations with intermediate variables?¶
- Our loss function required a sum over Gaussian wells, and so we had to write elaborate matrix-vector products by hand to compute the gradient and Hessian
Finite difference derivatives¶
- Symbolic derivatives are much more practical than finite differences
- Computing them by hand is usually possible, but tedious
- Finite differences scale unfavorably in high dimensions
Any tricks to do it faster?¶
- Computational tricks for fast multivariate chain rule: Backpropagation, automatic differentiation
- Instead of writing down the full input-output derivative by hand, we can record the sequence of matrix-vector products in a computation graph, and then traverse that graph in reverse to apply the chain rule.
- Exact like symbolic derivatives, but not explicit---we just store the order of the chain rule
- Most complex functions are compositions of matrix-vector products and unary functions like $\sin$, $\tanh$, etc
- Basic idea: the multivariate chain rule can be written as a graph; traversing this graph can be optimized to reduce calls to nodes by caching intermediate results on the forward pass (see the sketch after this list)
- We'll look at tools implementing these ideas behind the scenes soon, but we won't go over the mathematics until the deep learning section of the course.
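As a teaser, here is a minimal sketch of the idea for a single composition (the function and names are illustrative): the forward pass evaluates and caches intermediates, and the backward pass multiplies the local derivatives in reverse order.
# Reverse-mode differentiation of f(x) = sin(x)**2 as a two-node chain
def f_and_grad(x):
    # Forward pass: evaluate and cache intermediate results
    a = np.sin(x)  # node 1
    y = a ** 2     # node 2
    # Backward pass: apply the chain rule in reverse order
    dy_da = 2 * a
    da_dx = np.cos(x)
    return y, dy_da * da_dx

y_val, g = f_and_grad(1.0)
print(g, np.sin(2.0))  # both equal sin(2x) at x = 1, since 2 sin(x) cos(x) = sin(2x)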