Convex Optimization¶
Preamble: Run the cells below to import the necessary Python packages
This notebook was created by William Gilpin. Consult the course website for all content, and the GitHub repository for raw files and runnable online code.
import numpy as np
from IPython.display import Image, display
import matplotlib.pyplot as plt
%matplotlib inline
Optimization¶
- The cost function, fitness, or empirical risk is usually well-defined
- Error of a neural network on training data
- The fitness of a genotype
- The stability of a folded protein configuration
- Whether a given configuration is a true ground state or metastable
- Loss landscapes in ML are approximated by finite data (empirical risk); ideally with more data the empirical risk converges to the true risk
- If we can query every point in input space, we can guarantee that we've found a global minimum
- If the function is convex, we can step towards minima in a principled way
- If the function is non-convex, we either have to perform many queries, or use external information to refine our search. Non-convex optimization over binary variables is NP-hard (difficulty scales exponentially with problem size)
- That means it's in the same difficulty class as Donkey Kong Country
xx = np.linspace(-1, 1, 500)
f_con = lambda x: 0.5 * x**2 - 0.2
f_noncon = lambda x : x**2 + 0.5 * np.sin(30 * x) - 0.2
plt.figure()
plt.plot(xx, f_con(xx))
plt.title('A convex optimization problem')
plt.xlabel('x')
plt.ylabel('f(x)')
plt.figure()
plt.plot(xx, f_noncon(xx))
plt.title('A non-convex optimization problem')
plt.xlabel('x')
plt.ylabel('f(x)')
We've already seen an example of this: in the Conjugate Gradient Method, we took steps towards a fixed point representing the least-squares solution of an unconstrained linear problem.
One dimensional optimization: fixed point methods and rootfinding¶
- I'm given some function $f(x)$, and I want to calculate the global minimum (if convex), or at least the nearest local minimum to a starting point (if non-convex)
- If we use our standard approach of finding the nearest point at which $f' = 0$, we reduce the calculation to rootfinding:
$$ x^* = \arg \min_x f(x) \implies f'(x^*) = 0 $$
There are a few considerations when performing rootfinding:
- If the function $f(x)$ has a complex form, we can't necessarily find the roots analytically
- If we have a polynomial with several roots, we can find all of them simultaneously using the Durand-Kerner method, sketched below
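As a concrete illustration, here is a minimal sketch of the Durand-Kerner iteration, assuming a monic polynomial with distinct roots; the function name, complex seed, and tolerance below are illustrative choices, not part of the course code.
def durand_kerner(coeffs, max_iter=100, tol=1e-10):
    """Find all roots of a monic polynomial simultaneously.

    coeffs: coefficients in descending order, e.g. [1, -3, 3, -5]
        for x^3 - 3x^2 + 3x - 5 (the leading coefficient must be 1)
    """
    n = len(coeffs) - 1
    ## Seed the estimates with powers of a complex number that is neither
    ## real nor a root of unity (a standard choice for this method)
    roots = (0.4 + 0.9j) ** np.arange(n)
    for _ in range(max_iter):
        new_roots = roots.copy()
        for i in range(n):
            others = np.delete(roots, i)
            ## Weierstrass correction: divide p(z_i) by the product of
            ## distances to all other current root estimates
            new_roots[i] = roots[i] - np.polyval(coeffs, roots[i]) / np.prod(roots[i] - others)
        if np.max(np.abs(new_roots - roots)) < tol:
            return new_roots
        roots = new_roots
    return roots

durand_kerner([1, -3, 3, -5])  # one real root near 2.587, plus a conjugate pair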
Gradient descent¶
- A first-order method
- Rolling down a hill
- Requires an estimate of the derivative of the loss function. Minimizing the loss is equivalent to finding the root of its derivative
- Requires local information only, rather than global information (querying points all over the landscape)
Starting from an arbitrary point on the landscape, take steps along the direction of steepest descent
$$ x \leftarrow x - \eta \dfrac{df}{dx} $$
The learning rate $\eta$ is a "hyperparameter" that we choose based on problem knowledge. Too large, and we can get stuck in oscillating solutions around the optimum. Too small, and the system takes a long time to converge.
class GradientDescentOptimizer:

    def __init__(self, learning_rate=0.1, max_iter=1000, tolerance=1e-6, store_history=False):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.store_history = store_history
        if self.store_history:
            self.history = list()

    def optimize(self, df, x0):
        """Minimize a function, given a callable df for its gradient and a starting point x0."""
        x = x0
        for i in range(self.max_iter):
            ## Step along the negative gradient, scaled by the learning rate
            x_new = x - self.learning_rate * df(x)
            ## Stop once successive iterates are closer than the tolerance
            if np.linalg.norm(x_new - x) < self.tolerance:
                break
            x = x_new
            if self.store_history:
                self.history.append(x_new)
        return x
## A simple convex loss (commented out; try swapping it in)
# loss = lambda x: 0.5 * x**2 - 0.2
# loss_grad = lambda x: x

## A non-convex loss and its gradient
loss = lambda x: x**2 + 0.05 * np.sin(30 * x) - 0.2
loss_grad = lambda x: 2 * x + 0.05 * 30 * np.cos(30 * x)
## A large learning rate oscillates around the optimum; try the commented-out value
# optimizer = GradientDescentOptimizer(store_history=True, learning_rate=2.0)
optimizer = GradientDescentOptimizer(store_history=True, learning_rate=0.005)
x_opt = optimizer.optimize(loss_grad, 0.5)
plt.figure()
plt.plot(xx, loss(xx))
plt.plot(x_opt, loss(x_opt), 'ro')
plt.xlabel('x')
plt.ylabel("Loss")
plt.figure()
plt.plot(optimizer.history)
plt.xlabel('Iteration')
plt.ylabel('x')
loss = lambda x: x**2 + 0.05 * np.sin(30 * x) - 0.2
loss_grad = lambda x: 2 * x + 0.05 * 30 * np.cos(30 * x)
## Solve optimization
optimizer = GradientDescentOptimizer(learning_rate=0.02, store_history=True)
x_opt = optimizer.optimize(loss_grad, 0.4)
## Plot loss
xx = np.linspace(-1, 1, 500)
plt.figure()
plt.plot(xx, loss(xx))
plt.plot(x_opt, loss(x_opt), 'ro')
plt.xlabel('x')
plt.ylabel("Loss")
plt.figure()
plt.plot(optimizer.history)
plt.xlabel('Iteration')
plt.ylabel('x')
## Solve optimization, this time from a different starting point
optimizer = GradientDescentOptimizer(learning_rate=0.02, store_history=True)
x_opt = optimizer.optimize(loss_grad, 0.7)
## Plot loss
xx = np.linspace(-1, 1, 500)
plt.figure()
plt.plot(xx, loss(xx))
plt.plot(x_opt, loss(x_opt), 'ro')
plt.xlabel('x')
plt.ylabel("Loss")
plt.figure()
plt.plot(optimizer.history)
plt.xlabel('Iteration')
plt.ylabel('x')
Modifying gradient descent for non-convex functions¶
We can add momentum to the optimizer, so that it retains a memory of the size and direction of previous steps
$$ v \leftarrow \eta \dfrac{df}{dx} + \alpha\, v $$
$$ x \leftarrow x - v $$
In the units on numerical integration, we will revisit the idea of optimization as solving an initial value problem
class GradientDescentMomentumOptimizer:

    def __init__(self, learning_rate=0.1, alpha=0.9, max_iter=1000, tolerance=1e-6, store_history=False):
        self.learning_rate = learning_rate
        self.alpha = alpha  # momentum coefficient; alpha = 0 recovers plain gradient descent
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.store_history = store_history
        if self.store_history:
            self.history = list()

    def optimize(self, df, x0):
        """Minimize a function, given a callable df for its gradient and a starting point x0."""
        x = x0
        v = 0
        for i in range(self.max_iter):
            ## Blend the previous velocity with the current gradient step
            v = self.alpha * v + self.learning_rate * df(x)
            x_new = x - v
            if np.linalg.norm(x_new - x) < self.tolerance:
                break
            x = x_new
            if self.store_history:
                self.history.append(x_new)
        return x
## A simple convex loss (commented out; try swapping it in)
# loss = lambda x: 0.5 * x**2 - 0.2
# loss_grad = lambda x: x

## A non-convex loss and its gradient
loss = lambda x: x**2 + 0.05 * np.sin(30 * x) - 0.2
loss_grad = lambda x: 2 * x + 0.05 * 30 * np.cos(30 * x)
## Solve optimization
optimizer = GradientDescentMomentumOptimizer(learning_rate=0.3, alpha=0.2, store_history=True)
x_opt = optimizer.optimize(loss_grad, 0.5)
## Plot loss
xx = np.linspace(-1, 1, 500)
plt.figure()
plt.plot(xx, loss(xx))
plt.plot(x_opt, loss(x_opt), 'ro')
plt.xlabel('x')
plt.ylabel("Loss")
plt.figure()
plt.plot(optimizer.history)
plt.xlabel('Iteration')
plt.ylabel('x')
Second-order methods and Newton's method¶
Can we improve convergence using information about the local landscape geometry?
Newton's method¶
Suppose that our initial guess is $x_k$ and our objective function is $f(x)$. We perform a second-order Taylor expansion around the guess, $$ f(x_k + h) \approx f(x_k) + f'(x_k)\, h + \frac{1}{2} f''(x_k)\, h^2. $$ We want to minimize this approximated local function in $h$, $$ 0 = \frac{\mathrm{d}}{\mathrm{d}h}\left( f(x_k) + f'(x_k)\, h + \frac{1}{2} f''(x_k)\, h^2 \right) = f'(x_k) + f''(x_k)\, h. $$ Solving this equation, we arrive at $$ h^* = -\frac{f'(x_k)}{f''(x_k)}. $$
Having optimized the approximant, we now update the position of $x$, $$ x_{k+1} = x_k + h^* = x_k - \frac{f'(x_k)}{f''(x_k)}. $$
In a single step of Newton's method, we approximate the function locally as a parabola, and we then jump to the optimum of that parabolic approximant. Newton's method therefore consists of optimizing a global function by optimizing a series of local approximants.
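Here is a minimal sketch of that update loop, assuming we can evaluate $f'$ and $f''$ directly; the function name and stopping criterion are illustrative.
def newton_minimize(df, d2f, x0, max_iter=100, tol=1e-10):
    """Minimize a function via Newton's method, given callables for f' and f''."""
    x = x0
    for _ in range(max_iter):
        ## Jump to the minimum of the local parabolic approximant
        x_new = x - df(x) / d2f(x)
        if abs(x_new - x) < tol:
            return x_new
        x = x_new
    return x

## On the convex loss from earlier, f(x) = 0.5 x^2 - 0.2, the parabolic
## approximant is exact, so Newton's method lands on the minimum in one step
newton_minimize(lambda x: x, lambda x: 1.0, x0=0.5)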
What if we don't know the derivatives of $f(x)$?¶
- Replacing the exact derivatives with approximations built from function evaluations gives what is sometimes called the "secant" method
- In many types of problems, we are best served by having a loss function with a differentiable form
- As a fallback, we can use finite difference operators to approximate $f'$ and $f''$
$$ f'(x) \approx \dfrac{f(x + \Delta x / 2) - f(x - \Delta x / 2)}{\Delta x} $$ $$ f''(x) \approx \dfrac{f(x + \Delta x) - 2 f(x) + f(x - \Delta x)}{\Delta x^2} $$ where $\Delta x / x \ll 1$
Here we have used central finite differences, but we could also use forward finite differences $$ f'(x) \approx \dfrac{f(x + \Delta x) - f(x)}{\Delta x} $$ $$ f''(x) \approx \dfrac{f(x + 2 \Delta x) - 2 f(x + \Delta x) + f(x)}{\Delta x^2} $$ Or, analogously, backwards finite differences.
There also exist higher-order finite difference approximations, as well as multivariate generalizations. Notice that the stencil coefficients for the first and second derivatives are binomial coefficients, i.e. rows of Pascal's triangle (with alternating signs).
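Combining these stencils with Newton's method gives a derivative-free variant. The sketch below uses the central-difference formulas above; the function name and step size $\Delta x$ are illustrative choices.
def newton_minimize_fd(f, x0, dx=1e-5, max_iter=100, tol=1e-8):
    """Newton's method with central finite differences standing in for f' and f''."""
    x = x0
    for _ in range(max_iter):
        d1 = (f(x + dx / 2) - f(x - dx / 2)) / dx         # central difference for f'
        d2 = (f(x + dx) - 2 * f(x) + f(x - dx)) / dx**2   # central difference for f''
        x_new = x - d1 / d2  # Newton step with approximate derivatives
        if abs(x_new - x) < tol:
            return x_new
        x = x_new
    return x

## Note the five evaluations of f per iteration, even in 1D
newton_minimize_fd(lambda x: 0.5 * x**2 - 0.2, x0=0.5)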
Why not always do this?¶
- Finite difference approximations are not exact, and they tend to fail when we need them most (rapidly varying functions)
- Finite difference approximations become computationally expensive as we increase dimensionality. Even in 1D, each Newton step requires five calls to the objective function just to build first-order finite difference estimates of the first and second derivatives
Image("../resources/newton.png", width=800)
# Source: https://jermwatt.github.io/machine_learning_refined/notes/4_Second_order_methods/4_4_Newtons.html