import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
%matplotlib inline
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv('TrainingData.csv', index_col=0)

# Structural overview of the frame. The bare expressions below only display
# a value in an interactive session / notebook cell; in a script they are
# evaluated and discarded.
df.shape
df.info()
df.columns
df.head()
# df.columns = ['ID']+list(df.columns[1:])

# Most frequent values / summary statistics of a few individual columns.
df['Job_Title_Description'].value_counts().head(10)
df['Object_Type'].value_counts().head(10)
df['FTE'].value_counts().head()
df['Total'].describe()

# IPython help lookup, kept as a comment (like the other `?` lookups in this
# file) because `name?` is not valid Python syntax outside IPython.
# plt.xlim?
# Histogram of the non-missing FTE values, drawn as 500 translucent red bins.
fte_values = df['FTE'].dropna()
plt.hist(fte_values, bins=500, alpha=0.5, color='r')
# Restrict the visible x-range to slightly beyond the [0, 1] interval.
plt.xlim(-0.2, 1.2)

# Title and axis labels.
plt.title('Distribution of %full-time \n employee works')
plt.xlabel('% of full-time')
plt.ylabel('num employees')

# Render the figure.
plt.show()
# sample_df.label = sample_df.label.astype('category')

# First row and the dtype of every column.
df.head(1)
df.dtypes
df['Use'].value_counts()

# Single brackets select a Series, double brackets a one-column DataFrame.
(type(df['Use']), type(df[['Use']]))

# One-hot encode 'Use', first from the DataFrame form, then from the Series
# form; the second assignment replaces the first.
dummies = pd.get_dummies(df[['Use']], prefix_sep='_')
dummies.head()
dummies = pd.get_dummies(df['Use'], prefix_sep='_')
dummies.head()


def categorize_label(x):
    """Return ``x`` converted to the pandas 'category' dtype."""
    return x.astype('category')


type(categorize_label)

# Convert only the 'Use' column and inspect the result.
df_use = df[['Use']].apply(categorize_label, axis=0)
df_use.info()

# Same conversion on a single column, then on every column of the frame.
df['Function'].astype('category').dtypes
df.apply(categorize_label, axis=0).dtypes

# Tally of dtypes before and after converting every column to 'category'.
df.dtypes.value_counts()
df.apply(categorize_label).dtypes.value_counts()
# pd.Series.nunique?
# Return number of unique elements in the object.
df['Function'].nunique()

# Print the number of distinct values of every column. A single pre-formatted
# string is passed to print() so the line is valid, and produces the same
# text, under both Python 2 and Python 3.
for column in df:
    print('{} --- unique values:  {}'.format(column, df[column].nunique()))

# Calculate number of unique values for each label: num_unique_labels
num_unique_labels = df.apply(pd.Series.nunique)
num_unique_labels
# Bar chart of num_unique_labels on a 10x5 inch figure, translucent bars.
plt.figure(figsize=(10, 5))
num_unique_labels.plot(kind='bar', alpha=0.6)
# Cap the y-axis at 2000.
plt.ylim(0, 2000)

# Axis labels.
plt.xlabel('Labels')
plt.ylabel('Number of unique values')

# Render the figure.
plt.show()
# np.clip?
# Signature: np.clip(a, a_min, a_max, out=None)
# Limits the values of an array to the interval [a_min, a_max]: anything
# below a_min becomes a_min and anything above a_max becomes a_max. With the
# interval [0, 1], for instance, negative values become 0 and values larger
# than 1 become 1.
#
# Demo: the integers 1..9 clipped to [3, 7] give 3, 3, 3, 4, 5, 6, 7, 7, 7.
np.clip(range(1, 10), a_min=3, a_max=7)
# - Log loss for binary classification
# - Actual value: y = {1=yes, 0=no}
# - Prediction (probability that the value is 1): p
# - logloss = -(1/N) * sum_i( y_i * log(p_i) + (1 - y_i) * log(1 - p_i) )
def compute_log_loss(predicted, actual, eps=1e-14):
    """Compute the mean logarithmic loss between predicted and actual.

    Intended for scalars or 1D array-likes (lists, tuples, NumPy arrays).

    :param predicted: The predicted probabilities as floats between 0-1
    :param actual: The actual binary labels. Either 0 or 1.
    :param eps (optional): log(0) is -inf, so predicted values are clipped
        to the interval [eps, 1 - eps] before the logarithms are taken.
    :return: The mean of -(y * log(p) + (1 - y) * log(1 - p)).
    """
    # Coerce both inputs to float arrays so plain Python sequences work too:
    # ``1 - actual`` raises TypeError when ``actual`` is a list or tuple.
    predicted = np.clip(np.asarray(predicted, dtype=float), eps, 1 - eps)
    actual = np.asarray(actual, dtype=float)
    loss = -1 * np.mean(actual * np.log(predicted)
                        + (1 - actual) * np.log(1 - predicted))
    return loss
# Confident prediction of class 1 when the label is 0: the loss is
# -log(1 - 0.9), roughly 2.30.
compute_log_loss(0.9, 0)
# 50/50 prediction when the label is 1: the loss is -log(0.5), roughly 0.69.
compute_log_loss(0.5, 1)