Splitting the multi-class dataset
● Recall: the usual train-test split
● A plain random split will not work here
● We may end up with labels in the test set that never appear in the training set
● The usual fix, StratifiedShuffleSplit, only works with a single target variable
● We have many target variables
● Solution: multilabel_train_test_split() (see the sketch after the link below)
https://github.com/drivendataorg/box-plots-sklearn/blob/master/src/data/multilabel.py
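To see why a naive split fails, here is a minimal sketch on made-up data: a label that occurs only once can vanish from the training set, and stratification, which would prevent that, is only defined for one target vector.
import numpy as np
from sklearn.model_selection import train_test_split

# Toy target: class 'c' occurs only once in 10 rows
X = np.arange(10).reshape(-1, 1)
y = np.array(list('aaaabbbbbc'))

# A plain random split can put the lone 'c' entirely into the test set,
# so the classifier never sees that label during training
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
print(set(y_tr), set(y_te))

# Stratifying guards against this, but sklearn stratifies on a single
# target vector only; with 9 label columns there is no direct equivalent
try:
    train_test_split(X, y, test_size=0.2, stratify=y)
except ValueError as e:
    print(e)  # the least populated class has too few members to stratify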
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
df = pd.read_csv('TrainingData.csv', index_col=0)
df.shape
num_col = []
for i, j in zip(df.dtypes, df):
    if i != 'object':
        num_col.append(j)
num_col
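An equivalent, more idiomatic one-liner (same result, just pandas doing the filtering):
num_col = df.select_dtypes(exclude='object').columns.tolist()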
df[num_col].isnull().sum()
# Fill numeric NaNs with a sentinel value far outside the observed range
numeric_data_only = df[num_col].fillna(-1000)
numeric_data_only.isnull().sum()
type(numeric_data_only)
LABELS = ['Function',
          'Use',
          'Sharing',
          'Reporting',
          'Student_Type',
          'Position_Type',
          'Object_Type',
          'Pre_K',
          'Operating_Status']
df[LABELS].shape
# Get labels and convert to dummy variables: label_dummies
label_dummies = pd.get_dummies(df[LABELS])
label_dummies.shape
label_dummies.head(3)
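What get_dummies is doing, shown on a two-row toy frame with made-up values: each categorical column becomes one 0/1 indicator column per category.
toy = pd.DataFrame({'Use': ['Instruction', 'Operations'],
                    'Pre_K': ['NO LABEL', 'PreK']})
pd.get_dummies(toy)
#    Use_Instruction  Use_Operations  Pre_K_NO LABEL  Pre_K_PreK
# 0                1               0               1           0
# 1                0               1               0           1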
# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only,
                                                               label_dummies,
                                                               size=0.2,
                                                               seed=123)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
X_train.info()
y_train.info()
# Import classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())
clf
start = time()
clf.fit(X_train, y_train)
print('Used {:.2f}s'.format(time() - start))
● If .predict() were used instead:
● The output would be hard 0 or 1 labels
● Log loss penalizes predictions that are confident and wrong
● So hard labels score worse than the probabilities from .predict_proba() (see the sketch below)
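A small illustration with made-up probabilities: for a true label of 1, log loss is -log(p), so a confident wrong answer costs far more than a hedged one.
from sklearn.metrics import log_loss

y_true = [1, 0]
print(log_loss(y_true, [0.9, 0.1]))  # confident and right: ~0.11
print(log_loss(y_true, [0.5, 0.5]))  # hedged: ~0.69
print(log_loss(y_true, [0.1, 0.9]))  # confident and wrong: ~2.30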
test_pred = clf.predict_proba(X_test)
test_pred.shape
# Format predictions in DataFrame: prediction_df
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS], prefix_sep='---').columns,
                             index=X_test.index,
                             data=test_pred)
prediction_df.head()
● Bag-of-words
● A simple way to represent text for machine learning
● Discards information about grammar and word order
● Keeps only the frequency of occurrence of each token (see the demo below)
● CountVectorizer()
● Tokenizes all the strings
● Builds a 'vocabulary' of the tokens it saw
● Counts the occurrences of each vocabulary token in every string
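A minimal, self-contained demo of the idea on two made-up strings:
from sklearn.feature_extraction.text import CountVectorizer

docs = ['red pen red pen', 'blue pen']
vec = CountVectorizer()
counts = vec.fit_transform(docs)  # sparse matrix of token counts
print(vec.get_feature_names())    # ['blue', 'pen', 'red']
print(counts.toarray())           # [[0 2 2]
                                  #  [1 1 0]]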
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
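One caveat worth knowing about this pattern: the lookahead (?=\s+) requires whitespace after a token, so the final token of a string is only matched if the string ends in whitespace. A quick check with re:
import re
print(re.findall(TOKENS_ALPHANUMERIC, 'KINDERGARTEN TEACHER'))   # ['KINDERGARTEN']
print(re.findall(TOKENS_ALPHANUMERIC, 'KINDERGARTEN TEACHER '))  # ['KINDERGARTEN', 'TEACHER']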
df.Position_Extra.head()
df.Position_Extra.nunique()
df.Position_Extra.isnull().sum()
# Fill missing values in df.Position_Extra
df['Position_Extra'] = df['Position_Extra'].fillna('')
# Instantiate the CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
vec_alphanumeric
# Fit to the data
vec_alphanumeric.fit(df.Position_Extra)
# Print the number of tokens and first 15 tokens
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(vec_alphanumeric.get_feature_names())))
print(vec_alphanumeric.get_feature_names()[:15])
df[['Use','Sharing','Reporting']].head(3)
text_data = df[['Use', 'Sharing', 'Reporting']].copy()
# Replace NaNs with blanks (the .copy() above avoids a SettingWithCopyWarning)
text_data.fillna("", inplace=True)
text_data.head(5)
# Join all text items in a row, separated by a single space
text_data.apply(lambda x: " ".join(x), axis=1).head()
text_combined = text_data.apply(lambda x: " ".join(x), axis=1)
# Quick sanity checks, unrelated to the pipeline:
set((1, 2, 3)) & set((2, 3, 4))  # set intersection
' '.join(['4', '7'])             # how str.join concatenates
ttt = np.arange(18).reshape(3, 6)
ttt
ttt.sum(0)  # axis=0: sums down each column
ttt.sum(1)  # axis=1: sums across each row
text_combined.head()
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Create the basic token pattern
TOKENS_BASIC = '\\S+(?=\\s+)'
# Create the alphanumeric token pattern
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
# Instantiate basic CountVectorizer: vec_basic
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)
# Instantiate alphanumeric CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)
# Create the text vector
text_vector = text_combined
# Fit and transform vec_basic
vec_basic.fit_transform(text_vector)
# Print number of tokens of vec_basic
print("There are {} tokens in the dataset".format(len(vec_basic.get_feature_names())))
# Fit and transform vec_alphanumeric
vec_alphanumeric.fit_transform(text_vector)
# Print number of tokens of vec_alphanumeric
print("There are {} alpha-numeric tokens in the dataset".format(len(vec_alphanumeric.get_feature_names())))
print(vec_basic.get_feature_names())
print(vec_alphanumeric.get_feature_names())
import numpy as np
import pandas as pd
from warnings import warn  # needed for the size warning below

def multilabel_sample(y, size=1000, min_count=5, seed=None):
    """ Takes a matrix of binary labels `y` and returns
        the indices for a sample of size `size` if
        `size` > 1 or `size` * len(y) if `size` <= 1.

        The sample is guaranteed to have > `min_count` of
        each label.
    """
    try:
        if (np.unique(y).astype(int) != np.array([0, 1])).any():
            raise ValueError()
    except (TypeError, ValueError):
        raise ValueError('multilabel_sample only works with binary indicator matrices')

    if (y.sum(axis=0) < min_count).any():
        raise ValueError('Some classes do not have enough examples. Change min_count if necessary.')

    if size <= 1:
        size = np.floor(y.shape[0] * size)

    if y.shape[1] * min_count > size:
        msg = "Size less than number of columns * min_count, returning {} items instead of {}."
        warn(msg.format(y.shape[1] * min_count, size))
        size = y.shape[1] * min_count

    rng = np.random.RandomState(seed if seed is not None else np.random.randint(1))

    if isinstance(y, pd.DataFrame):
        choices = y.index
        y = y.values
    else:
        choices = np.arange(y.shape[0])

    sample_idxs = np.array([], dtype=choices.dtype)

    # first, guarantee > min_count of each label
    for j in range(y.shape[1]):
        label_choices = choices[y[:, j] == 1]
        label_idxs_sampled = rng.choice(label_choices, size=min_count, replace=False)
        sample_idxs = np.concatenate([label_idxs_sampled, sample_idxs])

    sample_idxs = np.unique(sample_idxs)

    # now that we have at least min_count of each, we can just random sample
    sample_count = int(size - sample_idxs.shape[0])

    # get sample_count indices from remaining choices
    remaining_choices = np.setdiff1d(choices, sample_idxs)
    remaining_sampled = rng.choice(remaining_choices,
                                   size=sample_count,
                                   replace=False)

    return np.concatenate([sample_idxs, remaining_sampled])
def multilabel_sample_dataframe(df, labels, size, min_count=5, seed=None):
    """ Takes a dataframe `df` and returns a sample of size `size` where all
        classes in the binary matrix `labels` are represented at
        least `min_count` times.
    """
    idxs = multilabel_sample(labels, size=size, min_count=min_count, seed=seed)
    return df.loc[idxs]
def multilabel_train_test_split(X, Y, size, min_count=5, seed=None):
    """ Takes a features matrix `X` and a label matrix `Y` and
        returns (X_train, X_test, Y_train, Y_test) where all
        classes in Y are represented at least `min_count` times.
    """
    index = Y.index if isinstance(Y, pd.DataFrame) else np.arange(Y.shape[0])

    test_set_idxs = multilabel_sample(Y, size=size, min_count=min_count, seed=seed)
    train_set_idxs = np.setdiff1d(index, test_set_idxs)

    test_set_mask = index.isin(test_set_idxs)
    train_set_mask = ~test_set_mask

    return (X[train_set_mask], X[test_set_mask], Y[train_set_mask], Y[test_set_mask])
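A quick smoke test of the split, run after the definitions above, on a tiny hand-made indicator matrix (min_count=1 because the toy labels are so sparse); every label column keeps at least one positive on the test side:
toy_Y = pd.DataFrame({'a': [1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0],
                      'b': [0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
                      'c': [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1]})
toy_X = pd.DataFrame({'feat': np.arange(12)})

X_tr, X_te, Y_tr, Y_te = multilabel_train_test_split(toy_X, toy_Y,
                                                     size=0.25, min_count=1,
                                                     seed=0)
print(Y_te.sum(axis=0))  # each of 'a', 'b', 'c' appears in the test set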