The hunt for the most effective machine learning model is hard enough with a modest dataset, and much more so as our data grow! As we search for the optimal combination of features, algorithm, and hyperparameters, we often use tools like histograms, heatmaps, embeddings, and other plots to make our processes more informed and effective. However, large, high-dimensional datasets can prove particularly challenging. In this talk, we’ll explore a suite of visual diagnostics, investigate their strengths and weaknesses in the face of increasingly big data, and consider how we can steer the machine learning process, not only purposefully but at scale!
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from yellowbrick.features import ParallelCoordinates

# Load the iris demo dataset (features in .data, class labels in .target).
data = load_iris()

# NOTE(review): the original slide drew onto ``axes[idx]`` from an enclosing
# subplot grid that is not part of this snippet; a standalone Axes is
# created here so the example runs on its own.
_, ax = plt.subplots()

# fast=True groups the points by class and draws each class as a single
# line segment instead of drawing every point individually, which scales
# much better on large datasets.
oz = ParallelCoordinates(ax=ax, fast=True)
oz.fit_transform(data.data, data.target)
oz.finalize()
Each point drawn individually as a connected line segment
With standardization
Points grouped by class, with each class drawn as a single segment
class Estimator(object):
    """
    Base object in the scikit-learn API: an object that learns state
    from data via ``fit`` and produces responses via ``predict``.
    """

    def fit(self, X, y=None):
        """
        Fits estimator to data.

        Parameters
        ----------
        X : array-like of training instances
        y : array-like of targets, optional (None for unsupervised use)

        Returns
        -------
        self : the fitted estimator, allowing method chaining
        """
        # set state of self
        return self

    def predict(self, X):
        """
        Predict response of X
        """
        # compute predictions pred (slide placeholder: ``pred`` stands in
        # for the concrete subclass computation)
        return pred
class Transformer(Estimator):
    """
    An Estimator that can also map input data into a new space.
    """

    def transform(self, X):
        """
        Transforms the input data.
        """
        # transform X to X_prime (slide placeholder: ``X_prime`` stands in
        # for the concrete subclass computation)
        return X_prime
class Pipeline(Transformer):
    """
    A chain of transformers that terminates in a final estimator.

    Fixed: the base class was misspelled ``Transfomer`` on the original
    slide, which would raise NameError at class-definition time.
    """

    @property
    def named_steps(self):
        """
        Returns a sequence of estimators
        """
        return self.steps

    @property
    def _final_estimator(self):
        """
        Terminating estimator
        """
        # The last step in the pipeline is the estimator that produces
        # the final predictions.
        return self.steps[-1]
The scikit-learn API (estimators keep their learned state on `self`, e.g. `self.X`)
class Visualizer(Estimator):
    """
    An Estimator that renders visual diagnostics on a matplotlib Axes.
    """

    def draw(self):
        """
        Draw called from scikit-learn methods.
        """
        return self.ax

    def finalize(self):
        # Decorate the finished figure; set_title/legend are presumably
        # supplied by the concrete visualizer or a fuller base class than
        # the slide shows -- TODO confirm.
        self.set_title()
        self.legend()

    def poof(self):
        # Finalize the figure, then render it to the screen.
        self.finalize()
        plt.show()
import matplotlib.pyplot as plt
from yellowbrick.base import Visualizer
class MyVisualizer(Visualizer):
    """
    Minimal example of a custom Yellowbrick visualizer: fitting the
    data triggers drawing, and finalize decorates the figure.
    """

    def __init__(self, ax=None, **kwargs):
        super(MyVisualizer, self).__init__(ax, **kwargs)

    def fit(self, X, y=None):
        # Per the Yellowbrick convention, fit draws immediately and
        # returns self for chaining.
        self.draw(X)
        return self

    def draw(self, X):
        if self.ax is None:
            # NOTE(review): original slide reads ``self.gca()``; the base
            # Visualizer presumably resolves the current Axes -- confirm
            # against the yellowbrick.base.Visualizer implementation.
            self.ax = self.gca()
        # Fixed: matplotlib Axes has no ``plt`` method; the slide's
        # ``self.ax.plt(X)`` would raise AttributeError at draw time.
        self.ax.plot(X)

    def finalize(self):
        self.set_title("My Visualizer")
The Yellowbrick API
20. Oneliners
# Option 1: scikit-learn style -- build the visualizer around a model,
# then fit, score, and show it like any estimator.
from sklearn.linear_model import Lasso
from yellowbrick.regressor import ResidualsPlot

resplot = ResidualsPlot(Lasso())
resplot.fit(X_train, y_train)
resplot.score(X_test, y_test)
resplot.poof()
# Option 2: Quick Method -- a one-line functional form that fits,
# scores, and draws in a single call.
from sklearn.linear_model import Lasso
from yellowbrick.regressor import residuals_plot

viz = residuals_plot(Lasso(), X_train, y_train, X_test, y_test)
��