Suppose you trained an image classifier with 80% accuracy. What's next?
We will next learn how to prioritize these decisions when applying ML.
The key to building great ML systems is to be data-driven:
This process can compensate for an initial lack of domain expertise.
A crucial part of understanding model performance is to systematically examine its errors.
Error analysis is a formal process by which this can be done.
The machine learning development workflow has three steps:
When developing machine learning models, it is customary to work with three datasets:
How do we iterate on a model given its results on the development set? We can perform error analysis, bias/variance analysis, comparisons against baselines, and many more types of analyses!
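Before turning to these analyses, here is a minimal sketch of the customary three-way split described above. The use of the digits dataset and the 60/20/20 proportions are purely illustrative choices, not something prescribed by these notes.
# A minimal sketch of a train/dev/test split (proportions are illustrative).
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
X_all, y_all = load_digits(return_X_y=True)
# First hold out a test set, then split the remainder into training and dev sets.
X_rest, X_test_demo, y_rest, y_test_demo = train_test_split(X_all, y_all, test_size=0.2, random_state=0)
X_train_demo, X_dev_demo, y_train_demo, y_dev_demo = train_test_split(X_rest, y_rest, test_size=0.25, random_state=0)
print(len(X_train_demo), len(X_dev_demo), len(X_test_demo))  # roughly 60% / 20% / 20% of the data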
Suppose you trained an image classifier over animal photos.
The key to deciding what to improve next is to closely examine the model's actual performance.
Suppose you need to decide if it's worth fixing a certain type of error.
If 5% of misclassified examples have that problem, it's probably not important. If 50% do, then it's important.
Error analysis systematically identifies the most common errors made by the model.
You should prioritize the most common error categories.
Error analysis involves classifying errors into categories.
Suppose you just trained a new image classifier.
We then go through the random subset of errors and assign them to categories.
|         | Blurry | Flipped | Mislabeled |
|---------|--------|---------|------------|
| Image 1 | X      | X       |            |
| Image 2 |        | X       |            |
| Image 3 |        |         | X          |
| ...     |        |         |            |
| Total   | 20%    | 50%     | 30%        |
From the totals, we see that the most important fix is to correct for flipped images.
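As a minimal sketch of how such a tally can be kept in code, suppose we record the category tags assigned to each inspected error. The tag names below mirror the table above, but the tag assignments themselves are hypothetical, and only the first three rows of the table are spelled out.
# Tally how often each (manually assigned) error category occurs.
from collections import Counter
error_tags = [
    ["blurry", "flipped"],  # image 1
    ["flipped"],            # image 2
    ["mislabeled"],         # image 3
    # ... one entry per inspected misclassified example
]
counts = Counter(tag for tags in error_tags for tag in tags)
for tag, count in counts.most_common():
    print('%s: %.0f%% of inspected errors' % (tag, 100 * count / len(error_tags)))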
Real-world data is often messy, and labels are not always correct.
It's important to fix labeling issues if they prevent us from measuring model error.
How big should the dev set be? Error analysis suggests a lower bound.
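As a rough back-of-the-envelope sketch of that lower bound: the dev set needs to yield enough errors to categorize. The target number of errors and the assumed error rate below are made-up values for illustration.
# The dev set must contain at least (target errors / expected error rate) examples.
target_errors = 100          # hypothetical: errors we want to inspect and categorize
expected_error_rate = 0.07   # hypothetical: the model's approximate dev error
min_dev_size = int(target_errors / expected_error_rate)
print(min_dev_size)          # about 1400 examples as a lower bound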
Also, remember to periodically update the dev set to avoid overfitting to it.
Should we perform error analysis on the dev set or the training set? Errors on the dev set tell us how the model generalizes, but a model can also make errors on the training set, for example when it underfits or when training labels are wrong. Hence, analyzing and fixing training set errors is also important.
Let's look at another example of error analysis on a small toy dataset.
We will use the sklearn digits dataset, a downscaled version of MNIST.
from sklearn.datasets import load_digits
digits = load_digits()
We can visualize these digits as follows:
from matplotlib import pyplot as plt
_, axes = plt.subplots(2, 5)
images_and_labels = list(zip(digits.images, digits.target))
for ax, (image, label) in zip(axes.flatten(), images_and_labels[:10]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Digit %i' % label)
Let's separate this data into two equal-sized training and dev sets.
# https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html
from sklearn.model_selection import train_test_split
# Split data into equal-sized train and dev subsets
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
X_train, X_dev, y_train, y_dev = train_test_split(
data, digits.target, test_size=0.5, shuffle=False)
We can train a simple softmax (multinomial logistic regression) model on this data.
# https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=int(1e7))
# Train the classifier on the first half of the digits
classifier.fit(X_train, y_train)
# Now predict the value of the digit on the second half:
predicted = classifier.predict(X_dev)
It achieves the following accuracy.
(predicted == y_dev).mean()
0.9332591768631813
We hypothesize that certain digits are misclassified more than others.
# these dev set digits are classified incorrectly
X_error = X_dev[predicted != y_dev]
y_error = y_dev[predicted != y_dev]
p_error = predicted[predicted != y_dev]
# these dev set digits are classified correctly
X_corr = X_dev[predicted == y_dev]
y_corr = y_dev[predicted == y_dev]
p_corr = predicted[predicted == y_dev]
# show the histogram
plt.xticks(range(10))
plt.hist(y_error)
(array([ 3., 11., 2., 14., 7., 4., 1., 4., 9., 5.]), array([0. , 0.9, 1.8, 2.7, 3.6, 4.5, 5.4, 6.3, 7.2, 8.1, 9. ]), <BarContainer object of 10 artists>)
The most common misclassified digit is a 3.
We can investigate the issue by looking at a subset of misclassified 3's (top row) and compare them to correctly classified 3's (bottom row).
_, axes = plt.subplots(2, 8)
# these images are classified incorrectly
images_and_labels = list(zip(X_error[y_error==3], p_error[y_error==3]))
for ax, (image, label) in zip(axes[0,:], images_and_labels[:8]):
    ax.set_axis_off()
    ax.imshow(image.reshape((8,8)), cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('f(x)=%i' % label)
# these images are classified correctly
images_and_labels = list(zip(X_corr[y_corr==3], p_corr[y_corr==3]))
for ax, (image, label) in zip(axes[1,:], images_and_labels[:8]):
    ax.set_axis_off()
    ax.imshow(image.reshape((8,8)), cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('f(x)=%i' % label)
We discover that the model is misclassifying a particular style of 3, and we can focus our efforts on this type of error.
The main limitation of error analysis is that it identifies the model's most common errors, but does not by itself explain why the model makes them.
Hence, we perform other analyses to explain and diagnose errors.
Another way to understand the performance of the model is to examine the extent to which it's overfitting or underfitting the data.
We refer to this as bias/variance analysis.
Recall that error analysis systematically identifies the most common errors made by the model, and that we should prioritize fixing the most common error categories.
Overfitting is one of the most common failure modes of machine learning.
Recall this example, where we randomly sample around a true function.
import numpy as np
np.random.seed(1)
n_samples = 40
true_fn = lambda X: np.cos(1.5 * np.pi * X)
X = np.sort(np.random.rand(n_samples))
y = true_fn(X) + np.random.randn(n_samples) * 0.1
X_line = np.linspace(0, 1, 100)
plt.plot(X_line, true_fn(X_line), label="True function")
plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
Below, we fit a high-degree polynomial on random samples of 30 points from this dataset.
Each small subset of the data that we train on results in a very different model.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
n_plots, X_line = 3, np.linspace(0,1,20)
plt.figure(figsize=(14, 5))
for i in range(n_plots):
    ax = plt.subplot(1, n_plots, i + 1)
    random_idx = np.random.randint(0, 30, size=(30,))
    X_random, y_random = X[random_idx], y[random_idx]
    # fit a high-degree polynomial to this random subset of the data
    polynomial_features = PolynomialFeatures(degree=20, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
    pipeline.fit(X_random[:, np.newaxis], y_random)
    ax.plot(X_line, true_fn(X_line), label="True function")
    ax.plot(X_line, pipeline.predict(X_line[:, np.newaxis]), label="Model")
    ax.scatter(X_random, y_random, edgecolor='b', s=20, label="Samples", alpha=0.2)
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.legend(loc="best")
    ax.set_title('Random sample %d' % i)
An algorithm that has a tendency to overfit is also called high variance, because it outputs a predictive model that varies a lot if we slightly perturb the dataset.
Underfitting is another common problem in machine learning.
Because the model cannot fit the data, we say it's high bias.
We may compare overfitting vs underfitting on our polynomial dataset.
degrees = [1, 20, 5]
titles = ['Underfitting', 'Overfitting', 'A Good Fit']
plt.figure(figsize=(14, 5))
for i in range(len(degrees)):
    ax = plt.subplot(1, len(degrees), i + 1)
    polynomial_features = PolynomialFeatures(degree=degrees[i], include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("pf", polynomial_features), ("lr", linear_regression)])
    pipeline.fit(X[:, np.newaxis], y)
    ax.plot(X_line, true_fn(X_line), label="True function")
    ax.plot(X_line, pipeline.predict(X_line[:, np.newaxis]), label="Model")
    ax.scatter(X, y, edgecolor='b', s=20, label="Samples", alpha=0.2)
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.legend(loc="best")
    ax.set_title("{} (Degree {})".format(titles[i], degrees[i]))
To a first approximation, every error in machine learning is either underfitting (bias) or overfitting (variance).
By definition, if we have no bias and no variance, we have a perfect model. Hence, bias/variance is important to understand.
We approximately quantify the bias and the variance of a model as follows.
$$\text{dev error} = (\underbrace{\text{dev error} - \text{train error}}_\text{variance}) + \underbrace{\text{train error}}_\text{bias}$$

It's important to consider both types of errors.
We can make different changes to the algorithm to address both of these issues.
We may use this observation to diagnose bias/variance in practice.
Consider the following example: the training error is high, and the dev error is about as high. This is a typical example of high bias (underfitting).
Next, consider another example: the training error is low, but the dev error is much higher. This is an example of high variance (overfitting).
Finally, suppose you see that both the training error and the dev error are low. This is a model that seems to work quite well!
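To make these three scenarios concrete, here is a minimal sketch that applies the decomposition above to pairs of hypothetical train/dev error rates; the helper function, the numbers, and the tolerance threshold are all arbitrary illustrations, not part of the notes above.
# Illustrative helper: diagnose bias/variance from train and dev errors.
def diagnose(train_error, dev_error, tolerance=0.02):
    # variance: gap between dev and train error; bias: the train error itself
    variance = dev_error - train_error
    bias = train_error
    if bias > tolerance:
        print('bias problem: train error = %.2f' % bias)
    if variance > tolerance:
        print('variance problem: dev - train = %.2f' % variance)
    if bias <= tolerance and variance <= tolerance:
        print('the model seems to work well')

diagnose(train_error=0.15, dev_error=0.16)  # high bias (underfitting)
diagnose(train_error=0.01, dev_error=0.12)  # high variance (overfitting)
diagnose(train_error=0.01, dev_error=0.02)  # a good fit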
The best way to reduce variance is to give the model more data.
However, this may not be feasible because of high costs for compute or data acquisition.
Alternative options for reducing variance include:
The best way to reduce bias is to increase the expressivity of the model.
However, this may not be feasible because of high costs for compute.
Alternative options for reducing bias include:
For both bias and variance reduction, we can use error analysis to guide our changes, e.g.:
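As a small illustration of the bias-reduction route (increasing model expressivity), here is a sketch on the polynomial data from earlier; the two degrees compared are arbitrary choices that mirror the underfitting and good-fit panels above.
# Increasing the polynomial degree makes the model more expressive
# and lowers the training error (i.e., reduces bias).
from sklearn.metrics import mean_squared_error
for degree in [1, 5]:
    pipeline = Pipeline([
        ("pf", PolynomialFeatures(degree=degree, include_bias=False)),
        ("lr", LinearRegression()),
    ])
    pipeline.fit(X[:, np.newaxis], y)
    train_mse = mean_squared_error(y, pipeline.predict(X[:, np.newaxis]))
    print('degree %d: training MSE = %.3f' % (degree, train_mse))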
Let's use our earlier example with the sklearn digits dataset to illustrate this approach.
Recall our digits dataset from earlier:
from matplotlib import pyplot as plt
_, axes = plt.subplots(2, 5)
images_and_labels = list(zip(digits.images, digits.target))
for ax, (image, label) in zip(axes.flatten(), images_and_labels[:10]):
    ax.set_axis_off()
    ax.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    ax.set_title('Digit %i' % label)
We can train a small fully-connected neural network on this data.
# https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier()
# Train the network on the first half of the digits
classifier.fit(X_train, y_train)
# Now predict the value of the digit:
predicted = classifier.predict(X_dev)
predicted_train = classifier.predict(X_train)
It achieves the following accuracy.
print('Training set accuracy: %.3f ' % (predicted_train == y_train).mean())
print('Development set accuracy: %.3f ' % (predicted == y_dev).mean())
Training set accuracy: 1.000
Development set accuracy: 0.937
The model has clearly memorized the training set and is overfitting. Let's increase regularization.
classifier = MLPClassifier(max_iter=1000, alpha=1)
# Train the network on the first half of the digits
classifier.fit(X_train, y_train)
# Now predict the value of the digit:
predicted = classifier.predict(X_dev)
predicted_train = classifier.predict(X_train)
By increasing L2 regularization (alpha), we improve performance by 1%.
(Although we still somewhat overfit)
print('Training set accuracy: %.3f ' % (predicted_train == y_train).mean())
print('Development set accuracy: %.3f ' % (predicted == y_dev).mean())
Training set accuracy: 1.000
Development set accuracy: 0.947
These two analyses reveal different types of problems:
These two analyses also complement each other.
Bias/variance analysis also helps guide hyperparameter search.
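For example, here is a minimal sketch of using the train/dev gap to choose the MLP's regularization strength on the digits data; the candidate alpha values below are arbitrary, and a full grid search would work similarly.
# Sweep the regularization strength and inspect the train/dev gap:
# a large gap suggests high variance, a high training error suggests high bias.
for alpha in [1e-4, 1e-2, 1, 10]:
    clf = MLPClassifier(max_iter=1000, alpha=alpha, random_state=0)
    clf.fit(X_train, y_train)
    print('alpha=%g: train accuracy=%.3f, dev accuracy=%.3f'
          % (alpha, clf.score(X_train, y_train), clf.score(X_dev, y_dev)))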
In summary, ML model development can be seen as alternating between the following two steps:
Error analysis guides specific changes in this process.
In order to understand model performance, we need to put it in context.
Baselines represent a benchmark against which we compare performance.
Suppose you train a regression model with a mean L1 error of 20. Is that good or bad? On its own, this number is hard to interpret.
Thus, we need to put our results in context by comparing to other models.
A baseline is another model against which we compare ourselves.
Examples of baselines include:
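As one concrete example (my choice of illustration, not prescribed by the notes), we can compute a majority-class baseline on the digits data from earlier using scikit-learn's DummyClassifier.
# A majority-class baseline gives context for the accuracies reported above.
from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_train, y_train)
print('Baseline dev accuracy: %.3f' % baseline.score(X_dev, y_dev))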
In practice, we also want to set a target upper bound on our performance.
There are different ways to compute an upper bound:
Our target optimal error helps us better quantify bias and variance:
$$\text{dev error} = (\underbrace{\text{dev error} - \text{train error}}_\text{variance}) + (\underbrace{\text{train error} - \text{opt error}}_\text{avoidable bias}) + \underbrace{\text{opt error}}_\text{unavoidable bias}$$

Consider the following example: the training error is close to the optimal error, but the dev error is much higher.
The bias is almost ideal. We have a variance problem.
Next, consider a scenario in which the training error falls below the optimal error, while the dev error remains much higher. Training error less than the optimal error means that we have overfit the training set. We have a variance problem.
Finally, consider another example, in which both the training error and the dev error are close to the optimal error. We are close to being optimal!
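To wrap up, here is a minimal sketch that applies this refined decomposition to the three scenarios above; the helper function, the error rates, and the optimal-error estimate are hypothetical values chosen only to match the qualitative descriptions.
# Illustrative helper: split dev error into variance, avoidable bias, and unavoidable bias.
def decompose(train_error, dev_error, optimal_error):
    variance = dev_error - train_error
    avoidable_bias = train_error - optimal_error
    print('variance=%.3f, avoidable bias=%.3f, unavoidable bias=%.3f'
          % (variance, avoidable_bias, optimal_error))

decompose(train_error=0.02, dev_error=0.15, optimal_error=0.01)   # mostly a variance problem
decompose(train_error=0.005, dev_error=0.15, optimal_error=0.01)  # train error below optimal: overfitting
decompose(train_error=0.02, dev_error=0.03, optimal_error=0.01)   # close to optimal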