● Repeatable way to go from raw data to a trained model
● A Pipeline object takes a sequential list of steps
  ● Output of one step is the input to the next step
● Each step is a tuple with two elements
  ● Name: a string
  ● Transform: an object implementing .fit() and .transform()
● Flexible: a step can itself be another pipeline! (see the sketch below)
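● A minimal sketch of nesting (illustrative step names and estimators, not the budget example):
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# inner pipeline: two transforms chained, each a ('name', transform) tuple
preprocess = Pipeline([
    ('scale', StandardScaler()),
    ('reduce', PCA(n_components=2))
])
# outer pipeline: the inner pipeline sits as a single step
model = Pipeline([
    ('prep', preprocess),
    ('clf', LogisticRegression())
])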
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
df.head(1)
df.Use.value_counts()
df.info()
df_numeric = df[['Total','FTE']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_numeric, pd.get_dummies(df['Use']), random_state=777)
y_train.head(3)
X_train.head(3)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.impute import SimpleImputer
# SimpleImputer default strategy is "mean" (the old sklearn.preprocessing.Imputer was removed in scikit-learn 0.22)
from time import time
pl = Pipeline([
    ('imp', SimpleImputer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
start = time()
pl.fit(X_train, y_train)
print('used: {:.2f}s'.format(time() - start))
accuracy = pl.score(X_test, y_test)
print('accuracy:', accuracy)
from sklearn.feature_extraction.text import CountVectorizer
text = df['Text_1'].fillna('Non Type')  # fill missing text without mutating a slice of df in place
X_train, X_test, y_train, y_test = train_test_split(text, pd.get_dummies(df['Use']), random_state=777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
pl = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
start = time()
pl.fit(X_train, y_train)
print('used: {:.2f}s'.format(time() - start))
accuracy = pl.score(X_test, y_test)
print('accuracy:', accuracy)
co = CountVectorizer()
text.shape
vec = co.fit_transform(text)
vec
list(co.vocabulary_.items())[:30]  # dict views are not sliceable in Python 3
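● To see what the vectorizer actually produces, a toy corpus (illustrative, not the budget data; get_feature_names_out assumes scikit-learn >= 1.0):
toy = CountVectorizer()
bow = toy.fit_transform(['red red blue', 'blue green'])
print(toy.get_feature_names_out())  # ['blue' 'green' 'red']
print(bow.toarray())                # [[1 0 2]
                                    #  [1 1 0]]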
● Want to use all available features in one pipeline
● Problem
  ● Pipeline steps for numeric and text preprocessing can't follow each other
  ● e.g., the output of CountVectorizer can't be the input to SimpleImputer
● Solution: FunctionTransformer() & FeatureUnion()
  ● FunctionTransformer turns a Python function into an object that a scikit-learn pipeline can understand (minimal sketch below)
  ● Need to write two functions for pipeline preprocessing
    ● One takes the entire DataFrame and returns the numeric columns
    ● One takes the entire DataFrame and returns the text columns
● Can then preprocess numeric and text data in separate pipelines and combine the results with FeatureUnion
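● A minimal sketch of the FunctionTransformer idea, with np.log1p as a stand-in function:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
log_step = FunctionTransformer(np.log1p)          # plain function -> object with .fit()/.transform()
log_step.fit_transform(np.array([[0.0], [9.0]]))  # array([[0.], [2.30258509]])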
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
from sklearn.preprocessing import FunctionTransformer
get_text = FunctionTransformer(lambda x: x['Text_1'], validate=False)
get_numeric = FunctionTransformer(lambda x: x[['Total','FTE']], validate=False)
text_fillna = FunctionTransformer(lambda x: x.fillna('No class type'), validate=False)
from sklearn.pipeline import FeatureUnion
numeric_pipeline = Pipeline([
    ('selector', get_numeric),
    ('imputer', SimpleImputer())
])
text_pipeline = Pipeline([
    ('selector', get_text),
    ('fillna', text_fillna),
    ('vectorizer', CountVectorizer())
])
pl = Pipeline([
    ('Union', FeatureUnion([
        ('numeric', numeric_pipeline),
        ('text', text_pipeline)
    ])),
    ('clf', OneVsRestClassifier(LogisticRegression()))
])
pl
X_train, X_test, y_train, y_test = train_test_split(df, pd.get_dummies(df['Use']), random_state = 777)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
start = time()
pl.fit(X_train, y_train)
print('used: {:.2f}s'.format(time() - start))
accuracy = pl.score(X_test, y_test)
print('accuracy:', accuracy)
LABELS = ['Function', 'Use', 'Sharing', 'Reporting', 'Student_Type','Position_Type', 'Object_Type', 'Pre_K', 'Operating_Status']
NUMERIC_COLUMNS = ['Total','FTE']
NON_LABELS = [c for c in df.columns if c not in LABELS]
NON_LABELS
len(NON_LABELS) - len(NUMERIC_COLUMNS)
import numpy as np
dummy_labels = pd.get_dummies(df[LABELS])
dummy_labels.shape
from multi_split import multilabel_train_test_split
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS], dummy_labels, 0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
● Is the current model the best?
● Can quickly try different models with pipelines
  ● Pipeline preprocessing steps stay unchanged
  ● Edit only the model step in your pipeline (see the sketch below)
  ● e.g., Random Forest, Naïve Bayes, k-NN
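● One way to do the swap, reusing the numeric/text pipelines from above (a sketch; Random Forest shown, but any classifier slots in):
from sklearn.ensemble import RandomForestClassifier
pl_rf = Pipeline([
    ('Union', FeatureUnion([
        ('numeric', numeric_pipeline),
        ('text', text_pipeline)
    ])),
    ('clf', OneVsRestClassifier(RandomForestClassifier()))
])
pl_rf.fit(X_train, y_train)
pl_rf.score(X_test, y_test)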
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """Takes the dataset as read in, drops the non-feature, non-text columns and
    then combines all of the text columns into a single vector that has all of
    the text for a row.

    :param data_frame: The data as read in with read_csv (no preprocessing necessary)
    :param to_drop (optional): Removes the numeric and label columns by default.
    """
    # drop non-text columns that are in the df
    to_drop = set(to_drop) & set(data_frame.columns.tolist())
    text_data = data_frame.drop(to_drop, axis=1)
    # replace NaNs with blanks
    text_data.fillna("", inplace=True)
    # join all of the text items in a row (axis=1) with a space in between
    return text_data.apply(lambda x: " ".join(x), axis=1)
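● A quick sanity check of the helper (a sketch; the result is one combined text string per row):
text_vector = combine_text_columns(df)
text_vector.head()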
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
# Get the dummy encoding of the labels
dummy_labels = pd.get_dummies(df[LABELS])
# Get the columns that are features in the original df
NON_LABELS = [c for c in df.columns if c not in LABELS]
# Split into training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(df[NON_LABELS],
dummy_labels,
0.2,
seed=123)
# Preprocess the text data: get_text_data
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
# Preprocess the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False)
text_fillna = FunctionTransformer(lambda x: x.fillna('No class type'), validate=False)
# Complete the pipeline: pl
from sklearn.linear_model import SGDClassifier
pl = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data),
                ('imputer', SimpleImputer())
            ])),
            ('text_features', Pipeline([
                ('selector', get_text_data),
                ('fillna', text_fillna),
                ('vectorizer', CountVectorizer())
            ]))
        ]
    )),
    ('clf', OneVsRestClassifier(SGDClassifier()))
])
pl
start = time()
# Fit to the training data
pl.fit(X_train, y_train)
print('used: {:.2f}s'.format(time() - start))
# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on budget dataset: ", accuracy)