Overfitting, the Bias-Variance tradeoff, Regularization, and Double Descent¶
Preamble: Run the cells below to import the necessary Python packages
These notes are adapted and extended from notes by Volodymyr Kuleshov at Cornell Tech. They have been modified from the originals to segue into the physics course and to motivate double descent.
## Preamble / required packages
import numpy as np
np.random.seed(0)
## Import local plotting functions and in-notebook display functions
import matplotlib.pyplot as plt
from IPython.display import Image, display
%matplotlib inline
import warnings
## Comment this out to activate warnings
warnings.filterwarnings('ignore')
Overfitting and Underfitting¶
We have seen a number of supervised learning algorithms.
Next, let's look at why they work (and why sometimes they don't).
Polynomial Regression¶
Recall that in supervised learning approaches we fit a linear model to a dataset $X \in \mathbb{R}^{N_\text{data} \times N_\text{features}}$ and a vector of labels $y \in \mathbb{R}^{N_\text{data}}$
We will study this problem in more detail in this notebook, but we will focus on a minimal one-dimensional dataset for simplicity, in which case $X \in \mathbb{R}^{N_\text{data} \times 1}$ and $y \in \mathbb{R}^{N_\text{data}}$
true_fn = lambda X: np.cos(1.5 * np.pi * X)
np.random.seed(0)
n_samples = 30
X_train = np.sort(np.random.rand(n_samples))
y_train = true_fn(X_train) + np.random.randn(n_samples) * 0.1
X_test = np.linspace(0, 1, 100)
y_test = true_fn(X_test)
plt.plot(X_test, true_fn(X_test), 'k', label="True Values")
plt.plot(X_train, y_train, '.b', markersize=10, label="Observed Data Values")
plt.legend(loc="best")
plt.xlabel("X")
plt.ylabel("y")
One option to find the relationship between the features and the labels is to fit a linear model of the form
$$ \mathbf{y} = \boldsymbol{\theta}^\top \mathbf{X} $$
where $\boldsymbol{\theta} \in \mathbb{R}^{1}$ is a scalar, because each data point has only one feature, and we want to map it to one regression target value.
Bias as a feature in linear regression¶
You might recall that univariate linear regression models can be written as
$$ \mathbf{y} = \theta \mathbf{X} + b $$
where $\mathbf{X}, \mathbf{y} \in \mathbb{R}^{N_\text{data}}$ and all model weights are scalars. We can "lift" the dimensionality of the feature space by adding a constant feature to the data, so that the model becomes
$$ \mathbf{y} = \boldsymbol{\theta}^\top \mathbf{X} $$
where $\boldsymbol{\theta} \in \mathbb{R}^{2}$ and $\mathbf{X} \in \mathbb{R}^{N_\text{data} \times 2}$, with the first column of $\mathbf{X}$ being all ones.
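As a sanity check, here is a minimal sketch (using the X_train and y_train arrays defined above) that lifts the data by hand with a column of ones and solves the least-squares problem directly with np.linalg.lstsq; the intercept and slope it recovers should match what scikit-learn's LinearRegression finds in the next cell.
## Lift the 1-D feature by prepending a constant column of ones
Phi_train = np.column_stack([np.ones_like(X_train), X_train])
## Solve the least-squares problem Phi @ theta = y in the least-squares sense
theta, *_ = np.linalg.lstsq(Phi_train, y_train, rcond=None)
print("Intercept and slope:", theta)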
from sklearn.linear_model import LinearRegression
# Create a linear regression model
model = LinearRegression()
# Train the model
model.fit(X_train[:, None], y_train)
# Predict the values on train and test
y_pred_train = model.predict(X_train[:, None])
y_pred_test = model.predict(X_test[:, None])
plt.plot(X_train, y_train, '.b', label="Train Data Values")
plt.plot(X_test, y_test, 'k', label="Test Data Values")
plt.plot(X_train, y_pred_train, '.r', label="Predicted Values (Train)")
plt.plot(X_test, y_pred_test, 'r', label="Predicted Values (Test)")
plt.legend(loc="best")
Nonlinear features are more expressive¶
What if we wanted to fit a polynomial instead? We could solve a polynomial regression problem of the form
$$ \mathbf{y} = \boldsymbol{\theta}^\top \Phi(\mathbf{X}) $$
where $\Phi(\mathbf{X})$ is a matrix of features that are polynomial transformations of the original features, $$ \Phi(\mathbf{X}) = \begin{bmatrix} 1 & x_{11} & x_{12} & \dots & x_{1d} & x_{11}^2 & x_{12}^2 & \dots & x_{1d}^2 & \dots \\ 1 & x_{21} & x_{22} & \dots & x_{2d} & x_{21}^2 & x_{22}^2 & \dots & x_{2d}^2 & \dots \\ \vdots & \vdots & \vdots & \ddots & \vdots & \vdots & \vdots & \ddots & \vdots & \ddots \\ 1 & x_{n1} & x_{n2} & \dots & x_{nd} & x_{n1}^2 & x_{n2}^2 & \dots & x_{nd}^2 & \dots \\ \end{bmatrix} $$
Thus, in this lifted feature space the labels remain the same $\mathbf{y} \in \mathbb{R}^{N_\text{data}}$, but the transformed features $\Phi(\mathbf{X}) \in \mathbb{R}^{N_\text{data} \times N_\text{features}}$ where $N_\text{features} > 1$. As a result, the model weights $\boldsymbol{\theta} \in \mathbb{R}^{N_\text{features}}$.
Feature-wise transformations are sometimes called basis functions or kernels. For a single datapoint, the resulting regression problem has the form
$$ f_\theta(x) := \theta^\top \phi(x) = \sum_{j=0}^p \theta_j x^j $$ that is linear in $\theta$ but non-linear in $x$ because the features $$\phi(x) = [1\; x\; \ldots\; x^p]$$ are non-linear. Using these features, we can fit any polynomial of degree $p$ by first transforming the features and then fitting a linear model using standard methods
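As a concrete sketch (reusing X_train, y_train, X_test, and y_test from above, with an illustrative degree $p=3$), we can build these features by hand with np.vander and fit the linear-in-$\theta$ model by ordinary least squares; the next section does the same thing with scikit-learn utilities.
p = 3  # illustrative polynomial degree

# Build phi(x) = [1, x, ..., x^p] for every training point
Phi_train = np.vander(X_train, p + 1, increasing=True)

# The model is linear in theta, so ordinary least squares applies
theta, *_ = np.linalg.lstsq(Phi_train, y_train, rcond=None)
print("Fitted coefficients theta_0 ... theta_p:", theta)

# Evaluate the fitted polynomial on the test grid
y_fit_test = np.vander(X_test, p + 1, increasing=True) @ theta
print("Mean squared error on the test grid:", np.mean((y_test - y_fit_test) ** 2))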
Polynomial Regression¶
- Instead of directly performing nonlinear transformations of the input features, we can use the `PolynomialFeatures` class from `sklearn.preprocessing` to compute the lifted feature matrix for us. Because it's built into `scikit-learn`, it automatically has the right API to be used within a standard machine learning model.
- After transforming the input data, we will pass the transformed data to scikit-learn's `LinearRegression` or `Ridge` classes to fit a linear model to the transformed data.
- `scikit-learn` allows us to combine these operations into a single combined model with the `Pipeline` class.
- We will score the quality of the model using the $R^2$ score on both the training and test data.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
degrees = [1, 2, 3]
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
ax = plt.subplot(1, len(degrees), i + 1)
polynomial_features = PolynomialFeatures(degree=degrees[i])
linear_regression = LinearRegression()
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
ax.plot(X_test, y_test, color='k', label="Test Values")
ax.plot(X_test, pipeline.predict(X_test[:, None]), label="Test Predictions")
ax.plot(X_train, y_train, '.b', markersize=10, label="Train Values")
print(f"Train R2 Score (Degree {degrees[i]}): {pipeline.score(X_train[:, None], y_train)}")
print(f"Test R2 Score (Degree {degrees[i]}): {pipeline.score(X_test[:, None], y_test)}")
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc="best")
ax.set_title(f"Polynomial Degree {degrees[i]}")
Train R2 Score (Degree 1): 0.473679628749485
Test R2 Score (Degree 1): 0.5796351746300188
Train R2 Score (Degree 2): 0.93080078598832
Test R2 Score (Degree 2): 0.9189645762039556
Train R2 Score (Degree 3): 0.9709160115202458
Test R2 Score (Degree 3): 0.9911295021744566
Why not always lift the features?¶
Diminishing returns of the Vandermonde matrix¶
The Vandermonde matrix $\Phi \in \mathbb{R}^{N \times (p+1)}$ is a matrix whose rows are the features $\phi(x_i)$ for a set of points $x_1,\ldots,x_n$. For example, if we have $n=3$ points $x_1=1$, $x_2=2$, $x_3=3$, and $p=2$, then the Vandermonde matrix is $$ \begin{bmatrix} 1 & 1 & 1 \\ 1 & 2 & 4 \\ 1 & 3 & 9 \end{bmatrix} $$
Below, we print the Vandermonde matrix for $N_\text{data}=3$ points and polynomial degrees $p=1,2,4,9$
xx = np.array([1, 2, 3])
print(np.vander(xx, 2, increasing=True), end='\n\n')
print(np.vander(xx, 3, increasing=True), end='\n\n')
print(np.vander(xx, 5, increasing=True), end='\n\n')
print(np.vander(xx, 10, increasing=True), end='\n\n')
[[1 1]
 [1 2]
 [1 3]]

[[1 1 1]
 [1 2 4]
 [1 3 9]]

[[ 1  1  1  1  1]
 [ 1  2  4  8 16]
 [ 1  3  9 27 81]]

[[    1     1     1     1     1     1     1     1     1     1]
 [    1     2     4     8    16    32    64   128   256   512]
 [    1     3     9    27    81   243   729  2187  6561 19683]]
Solving yet another least-squares problem¶
In principle, we can simply fit a polynomial of degree $p$ to a set of points $(x_1,y_1),\ldots,(x_n,y_n)$ by solving the linear system $\Phi\theta = y$.
$$ \begin{bmatrix} 1 & x_1 & x_1^2 & \ldots & x_1^p \\ 1 & x_2 & x_2^2 & \ldots & x_2^p \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & x_n & x_n^2 & \ldots & x_n^p \end{bmatrix} \begin{bmatrix} \theta_0 \\ \theta_1 \\ \vdots \\ \theta_p \end{bmatrix} = \begin{bmatrix} y_1 \\ y_2 \\ \vdots \\ y_n \end{bmatrix} $$
Notice how we "linearized" our problem by applying nonlinear functions to our features. This is the basis of many "kernel methods" in machine learning.
Properties of the Vandermonde matrix¶
The Vandermonde matrix $\Phi$ has size $n \times (p+1)$, where $n$ is the number of datapoints and $p$ is the degree of the polynomial that we want to fit. In this sense, the "features" of the model are just higher powers of the original feature, and so they add no genuinely new information about each datapoint.
If $p + 1 = n$, then the Vandermonde matrix is square and, provided the points $x_1,\ldots,x_n$ are all distinct, it has full rank, so the linear system has a unique solution via $\Phi^{-1}$. The matrix becomes singular if any of the datapoints are repeated.
If $p + 1 > n$, the Vandermonde matrix is wide and the linear system is underdetermined: it no longer has a unique solution, since many polynomials pass exactly through the data. Conversely, if $p + 1 < n$, the system is overdetermined and an exact solution is not guaranteed to exist. In this case, we can find the least-squares solution using the Moore-Penrose pseudoinverse $(\Phi^\top\Phi)^{-1}\Phi^\top$.
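As a quick numerical check on these claims (with hypothetical toy numbers), the sketch below verifies that a square Vandermonde matrix on distinct points has full rank, that a repeated point makes it singular, and that in the overdetermined case the pseudoinverse reproduces the least-squares solution from np.linalg.lstsq.
x_distinct = np.array([1.0, 2.0, 3.0])
x_repeated = np.array([1.0, 2.0, 2.0])

# Square case (as many columns as datapoints): full rank only for distinct points
print(np.linalg.matrix_rank(np.vander(x_distinct, 3, increasing=True)))  # 3
print(np.linalg.matrix_rank(np.vander(x_repeated, 3, increasing=True)))  # 2

# Overdetermined case (fewer columns than datapoints): pseudoinverse = least squares
y_toy = np.array([1.0, 0.5, -0.2])
Phi = np.vander(x_distinct, 2, increasing=True)  # fit a straight line to 3 points
theta_pinv = np.linalg.pinv(Phi) @ y_toy
theta_lstsq, *_ = np.linalg.lstsq(Phi, y_toy, rcond=None)
print(np.allclose(theta_pinv, theta_lstsq))  # True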
Importantly, as we increase the number of polynomial features (or the number of points), the Vandermonde matrix becomes increasingly ill-conditioned, which degrades the accuracy of the least-squares solution.
xx = np.array([1, 2, 3]) # 3 data points
pvals = range(2, 20)
all_condition_numbers = []
for i in pvals:
all_condition_numbers.append(np.linalg.cond(np.vander(xx, i, increasing=True)))
plt.plot(pvals, all_condition_numbers)
plt.xlabel('Number of polynomial features (fixed number of datapoints)')
plt.ylabel('Condition number')
How can we interpret this effect?¶
Let's try defining a data matrix with $N_\text{data}=2$. What do the column vectors of a high-degree Vandermonde matrix look like?
xx = np.array([1.0, 2.0]) # 2 data points
phi = np.vander(xx, 10, increasing=True)
phi /= np.linalg.norm(phi, axis=0, keepdims=True)
def plot_vector(vec, **kwargs):
plt.plot([0, vec[0]], [0, vec[1]], **kwargs)
plt.figure(figsize=(6, 6))
plot_vector(phi[:, 0], color='k', label='First column')
plot_vector(phi[:, 1], color='r', label='Second column')
plt.title("First two columns of the Vandermonde matrix")
plt.figure(figsize=(6, 6))
plot_vector(phi[:, -2], color='k', label='Second-to-last column')
plot_vector(phi[:, -1], color='r', label='Last column')
plt.title('Last two columns')
# plot with color gradient
plt.figure(figsize=(6, 6))
for i in range(10):
plot_vector(phi[:, i], color=plt.cm.viridis(i / 10))
plt.title('All columns')
plt.imshow(phi)
plt.xlabel('Power index')
plt.ylabel('Datapoint index')
The largest element of our data vector $x$ begins to dominate at higher powers of $x$, causing the column vectors to become more and more similar to each other, and resulting in the least-squares problem becoming ill-conditioned.
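We can quantify this directly: since the columns of phi were normalized above, the dot product of two columns equals the cosine of the angle between them, and it creeps toward 1 for the high-power columns (a short check reusing the phi defined in the previous cells).
# Cosine similarity between consecutive (unit-norm) columns of the Vandermonde matrix
for j in range(phi.shape[1] - 1):
    cos_angle = phi[:, j] @ phi[:, j + 1]
    print(f"columns {j} and {j + 1}: cosine similarity = {cos_angle:.6f}")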
Nothing comes for free! If we don't have sufficient features or measurement channels, we can't always just make up new ones as functions of our existing features. Eventually, redundancy catches up with us. We can't convert an underdetermined problem to an overdetermined problem by adding redundant features.
Residual Scatter: Determining model complexity without looking at the test dataset¶
Although fitting a linear model does not work well, quadratic or cubic polynomials seem to improve the fit to both the training and test data. Increasing the model expressivity therefore seems to improve the model's ability to fit the training data, and generalize to unseen test data.
We can also assess whether a given model is underfitting by plotting the residuals, the differences between the true values and the predicted values. If the residuals show systematic structure rather than random scatter, the model is underfitting.
If the true values are $y$ and the predicted values are $\hat{y}$, then the residuals are $y - \hat{y}$. The reason we expect uniform scatter is that our linear regression model can be seen as defining a data-generating process of the form
$$ y = \theta^\top \phi(x) + \epsilon $$
where $\epsilon$ is a random variable with zero mean and constant variance. The residuals are therefore given by
$$ y - \theta^\top \phi(x) = \epsilon $$
## Plot residuals for each model
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
ax = plt.subplot(1, len(degrees), i + 1)
polynomial_features = PolynomialFeatures(degree=degrees[i])
linear_regression = LinearRegression()
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
ax.plot(X_train, y_train - pipeline.predict(X_train[:, None]), '.b', markersize=10, label="Samples")
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc="best")
ax.set_title("Residuals for Degree {}".format(degrees[i]))
plt.xlabel("X")
plt.ylabel("Residuals (y - y_pred)")
We can see one heuristic for determining whether we have chosen a sufficiently-expressive model: Uniform scatter in the residuals
Hypothetically, we want to view our data generating process as comprising a deterministic function $f$ and a random noise term $\epsilon$. We want to fit a model that is close to $f$ and that is able to capture everything except the variability in $\epsilon$.
$$ y_i = f(x_i) + \epsilon_i $$
where $f(x_i) = \theta_0 + \theta_1 x_i + \ldots$ and $\epsilon_i$ is a random variable with mean 0 and variance $\sigma^2$. When we compute our residuals, we are computing the difference between the true values $y_i$ and our fitted model's predictions $\hat{y}_i \approx f(x_i)$, $$ r_i = y_i - \hat{y}_i $$
Thus we want the residuals $r_i$ to look like samples from a zero-centered random distribution, consistent with our implicit model of the generating process for this dataset.
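One rough numerical check (a sketch reusing the pipeline setup from the cells above) is to compare the mean and standard deviation of the training residuals to the noise level of 0.1 used to generate the data; for an underfit model, the residual spread stays well above that noise floor.
## Summary statistics of the training residuals for each polynomial degree
for degree in [1, 2, 3]:
    pipeline = Pipeline([
        ("pf", PolynomialFeatures(degree=degree)),
        ("lr", LinearRegression()),
    ])
    pipeline.fit(X_train[:, None], y_train)
    residuals = y_train - pipeline.predict(X_train[:, None])
    print(f"degree {degree}: residual mean = {residuals.mean():+.3f}, std = {residuals.std():.3f}")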
What happens when our model complexity is too high?¶
- Increasing the degree of our polynomial improved our fit accuracy by producing a model that explained more of the variance in the data.
- Our residuals appeared more uniform as well, suggesting that our model stopped underfitting.
What happens if we further increase the degree of the polynomial?
Runge's phenomenon: an $N$-point dataset can be fit exactly by a polynomial of degree $N-1$, but the resulting interpolant oscillates wildly between the data points.
degrees = [30]
plt.figure(figsize=(10, 6))
for i in range(len(degrees)):
ax = plt.subplot(1, len(degrees), i + 1)
polynomial_features = PolynomialFeatures(degree=degrees[i])
linear_regression = LinearRegression()
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
y_pred_test = pipeline.predict(X_test[:, None])
ax.plot(X_test, y_test, color='k', label="True Test Values")
ax.plot(X_test, y_pred_test, label="Predicted Test Values")
ax.plot(X_train, y_train, '.b', markersize=10, label="Train Values")
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc="best")
ax.set_title("Polynomial of Degree {}".format(degrees[i]))
Let's quantify this effect¶
print(f"Train R2 Score (Degree {degrees[i]}): {pipeline.score(X_train[:, None], y_train)}")
print(f"Test R2 Score (Degree {degrees[i]}): {pipeline.score(X_test[:, None], y_test)}")
Train R2 Score (Degree 30): 0.9950194388972539
Test R2 Score (Degree 30): -86515.89762356045
Model complexity¶
Model complexity is a somewhat nebulous concept in machine learning. To a first approximation, it grows with the number of trainable parameters in a model.
However, we've seen before that regularizers and constraints can reduce the effective number of degrees of freedom, and that not all parameter combinations are "reachable" during training.
Nonetheless, we'll treat the number of parameters as an upper bound on the complexity of a model.
Other measures of complexity include the VC dimension, the Rademacher complexity, and the margin of a classifier.
Plotting train/test error vs. model complexity¶
Usually, model complexity is controlled by a hyperparameter, such as the degree of a polynomial, the number of layers in a neural network, the number of trees in a random forest, or the strength of a regularizer.
We can see how the training versus validation error changes as we vary the hyperparameter, to see how overfitting emerges.
By convention, we plot error rather than accuracy, so that lower values always indicate a better model. In this case, we will record $1 - R^2$ as our error metric.
## Plot train and test error versus model size
from sklearn.metrics import r2_score
degrees = range(1, 21)
train_errors = []
test_errors = []
for degree in degrees:
polynomial_features = PolynomialFeatures(degree=degree,)
linear_regression = LinearRegression()
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
y_pred_train = pipeline.predict(X_train[:, None])
y_pred_test = pipeline.predict(X_test[:, None])
train_errors.append(1 - r2_score(y_train, y_pred_train))
test_errors.append(1 - r2_score(y_test, y_pred_test))
plt.figure(figsize=(6, 6))
plt.semilogy(degrees, train_errors, label='Train error')
plt.semilogy(degrees, test_errors, label='Test error')
plt.legend(loc='best')
plt.xlabel('Model complexity (polynomial degree)')
plt.ylabel('Error (1 - $R^2$)')
Looking at this graph, we can see several patterns:
- The training error decreases as the model complexity increases, because a more complex model can fit the training data more closely.
- The test error initially decreases as the model complexity increases, because the model class becomes expressive enough to capture the true underlying function.
- The test error then increases as the model complexity increases further, because the model overfits the training data and fails to generalize to the test data.
We'd say that the best model is the one that minimizes the test error.¶
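Reading that choice off the curves from the previous cell takes one line (a sketch assuming degrees, train_errors, and test_errors are still in scope):
best_idx = int(np.argmin(test_errors))
print(f"Best polynomial degree by test error: {degrees[best_idx]}")
print(f"Train error there: {train_errors[best_idx]:.4f}, test error: {test_errors[best_idx]:.4f}")
In practice, as discussed below, we should make this choice using a validation set or cross-validation rather than the test set itself.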
The Bias-Variance Tradeoff¶
Overfitting¶
A very expressive model (e.g., a high degree polynomial) fits the training dataset perfectly.
But the model makes highly incorrect predictions outside this dataset, and doesn't generalize.
We would say that the overfit model exhibits high variance: its fit (and hence its score) would change dramatically if it were retrained on a different hypothetical training set.
We could also say that the model has low bias because it will fit any training data very well.
Underfitting¶
A small model (e.g. a straight line), will not fit the training data well.
Therefore, it will also not be accurate on new data.
The model will, however, exhibit low variance: retraining it on a different training set would produce nearly the same fit, and hence nearly the same error.
We say that it has high bias, because the restricted model class can only represent a limited family of functions, no matter what training data it sees.
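We can make the variance part of this picture concrete with a small simulation (a sketch reusing true_fn, n_samples, and the 0.1 noise level from above): refit an underparameterized and an overparameterized polynomial on many freshly sampled training sets and watch how much the prediction at a fixed input fluctuates.
## Refit each model on many resampled training sets; track the prediction at x = 0.5
rng = np.random.default_rng(0)
x_query = np.array([[0.5]])
preds = {1: [], 20: []}

for _ in range(200):
    X_rs = np.sort(rng.random(n_samples))
    y_rs = true_fn(X_rs) + rng.normal(scale=0.1, size=n_samples)
    for degree in preds:
        pipeline = Pipeline([
            ("pf", PolynomialFeatures(degree=degree)),
            ("lr", LinearRegression()),
        ])
        pipeline.fit(X_rs[:, None], y_rs)
        preds[degree].append(pipeline.predict(x_query)[0])

for degree, p in preds.items():
    p = np.array(p)
    print(f"degree {degree}: mean prediction = {p.mean():+.3f}, "
          f"spread (std) = {p.std():.3f}, true value = {true_fn(0.5):+.3f}")
Typically the degree-1 predictions barely move between runs but sit far from the true value (high bias, low variance), while the degree-20 predictions scatter much more widely around it (low bias, high variance).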
Determining Overfitting vs. Underfitting¶
We can diagnose overfitting and underfitting by measuring performance on a held-out test dataset (not used for training).
If training performance is high but holdout performance is low, we are overfitting.
If both training and holdout performance are low, we are underfitting.
As we've seen previously, the gap between train and test scores is one measure of overfitting. The larger the gap, the more overfitting.
How do we choose the correct model complexity? For our polynomial example, $p$ is a hyperparameter. We need to tune it on a validation set, or run cross-validation on our training partition
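A minimal sketch of that tuning loop, using scikit-learn's cross_val_score on the training data only (the 5-fold split and the range of candidate degrees are arbitrary choices here):
from sklearn.model_selection import cross_val_score

candidate_degrees = range(1, 16)
cv_scores = []
for degree in candidate_degrees:
    pipeline = Pipeline([
        ("pf", PolynomialFeatures(degree=degree)),
        ("lr", LinearRegression()),
    ])
    # Mean R^2 across 5 cross-validation folds of the training partition
    scores = cross_val_score(pipeline, X_train[:, None], y_train, cv=5, scoring="r2")
    cv_scores.append(scores.mean())

best_degree = candidate_degrees[int(np.argmax(cv_scores))]
print(f"Degree selected by cross-validation: {best_degree}")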
degrees = [1, 20, 5]
titles = ['Underfitting', 'Overfitting', 'A Good Fit']
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
ax = plt.subplot(1, len(degrees), i + 1)
polynomial_features = PolynomialFeatures(degree=degrees[i])
linear_regression = LinearRegression()
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
y_pred_test = pipeline.predict(X_test[:, None])
ax.plot(X_test, y_test, color='k', label="True Test Values")
ax.plot(X_test, y_pred_test, label="Predicted Test Values")
ax.plot(X_train, y_train, '.b', markersize=10, label="Train Values")
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc="best")
ax.set_title("{} (Degree {})".format(titles[i], degrees[i]))
How to Fix Underfitting¶
Feature engineering: find better features that will make the dataset easier to fit.
Pick a more expressive model class (random forests or neural networks instead of linear models).
Pick an optimization algorithm that can find better parameters (SGD instead of gradient descent).
How to Fix Overfitting¶
Use a simpler model class (a linear model instead of a neural net)
Use fewer trainable parameters (a smaller neural net)
Keep the same model, but collect more training data
Regularization: modify the training process to penalize overly complex models.
Regularization¶
We will try applying L2 or Ridge regularization to our polynomial regression model.
We will also calculate the accuracy on the training set and the test set. We will again use the coefficient of determination $R^2$ as our metric.
Why are we using L2 regularization instead of L1 (Lasso)?
L2 Regularization for Polynomial Regression¶
Let's consider an application to the polynomial model we have seen so far. Given polynomial features $\phi(x)$, we optimize the following objective:
$$ J(\theta) = \frac{1}{2n} \sum_{i=1}^n \left( y^{(i)} - \theta^\top \phi(x^{(i)}) \right)^2 + \frac{\lambda}{2} \cdot ||\theta||_2^2. $$
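Setting the gradient of $J(\theta)$ to zero gives a closed-form minimizer. Note that scikit-learn's Ridge minimizes $\|\mathbf{y} - \Phi\theta\|_2^2 + \alpha\|\theta\|_2^2$ (ignoring the unpenalized intercept it fits by default), so its alpha plays the role of $n\lambda$ under the objective above.
$$ \nabla_\theta J(\theta) = -\frac{1}{n}\Phi^\top\left(\mathbf{y} - \Phi\theta\right) + \lambda\,\theta = 0 \quad\Longrightarrow\quad \theta^* = \left(\Phi^\top\Phi + n\lambda I\right)^{-1}\Phi^\top \mathbf{y}. $$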
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
lambda_values = [1e-14, 1e-8, 1e-6, 1e-3, 1e-1, 1e0, 1e1]
all_train_errors = []
all_test_errors = []
plt.figure(figsize=(10, 25))
for i, lambda_value in enumerate(lambda_values):
ax = plt.subplot(len(lambda_values), 1, i + 1)
polynomial_features = PolynomialFeatures(degree=20)
linear_regression = Ridge(alpha=lambda_value)
pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
pipeline.fit(X_train[:, None], y_train)
y_pred_train = pipeline.predict(X_train[:, None])
y_pred_test = pipeline.predict(X_test[:, None])
score_train = r2_score(y_train, y_pred_train)
all_train_errors.append(1 - score_train)
score_test = r2_score(y_test, y_pred_test)
all_test_errors.append(1 - score_test)
ax.plot(X_test, y_test, color='k', label="True function")
ax.plot(X_test, y_pred_test, label="Model")
ax.plot(X_train, y_train, '.b', markersize=10, label="Sampled")
ax.set_xlim((0, 1))
ax.set_ylim((-2, 2))
ax.legend(loc="best")
ax.set_title(f"Order {20}, Regularization {lambda_value}, R^2: {(score_train):.4f}")