# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Set default Seaborn style
sns.set()

from sklearn.datasets import load_iris

# Load the iris dataset shipped with scikit-learn and inspect the type of
# the container object that load_iris() returns.
data = load_iris()
type(data)

sklearn.utils.Bunch

data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='|S10')

data.data.shape

(150, 4)

data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Keep only the rows whose target code is 1; index 1 of data.target_names
# is 'versicolor'. The boolean mask selects rows of the feature matrix.
versicolor = data.data[data.target==1]
versicolor.shape

(50, 4)

data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

# Column 2 of the feature matrix is 'petal length (cm)' (see data.feature_names).
versicolor_petal_lengths = versicolor[:,2]
# Use the function-call form of print: the bare `print x` statement is a
# SyntaxError on Python 3, while print(x) with a single argument produces the
# same output on both Python 2 and Python 3.
print(versicolor_petal_lengths.shape)
versicolor_petal_lengths

(50,)

array([4.7, 4.5, 4.9, 4. , 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4. ,
       4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4. , 4.9, 4.7, 4.3, 4.4,
       4.8, 5. , 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1,
       4. , 4.4, 4.6, 4. , 3.3, 4.2, 4.2, 4.2, 4.3, 3. , 4.1])

Plotting a histogram of iris data

Plot a histogram of the petal lengths of the 50 samples of Iris versicolor using matplotlib/seaborn's default settings. Recall that to specify the default seaborn style, you can use sns.set(), where sns is the alias that seaborn is imported as.

The subset of the data set containing the Iris versicolor petal lengths in units of centimeters (cm) is stored in the NumPy array versicolor_petal_lengths.

Also, note that the plotting statements (except for plt.show()) are assigned to the dummy variable _. This is to prevent unnecessary output from being displayed.

Adjusting the number of bins in a histogram

# Import numpy
import numpy as np

# Compute number of data points: n_data
n_data = len(versicolor_petal_lengths)
# print(...) with one argument behaves the same on Python 2 and Python 3;
# the bare `print x` statement form only parses on Python 2.
print(n_data)

# Number of bins is the square root of number of data points: n_bins
# (np.sqrt returns a float here, e.g. 7.07... for 50 data points)
n_bins = np.sqrt(n_data)
print(n_bins)

# Convert number of bins to integer: n_bins
# (int() drops the fractional part, so 7.07... becomes 7)
n_bins = int(n_bins)
print(n_bins)

50
7.0710678118654755
7

# Plot histogram of versicolor petal lengths, passing the integer bin count
# n_bins explicitly. The return value is bound to the throwaway name _ so the
# notebook does not echo it.
_ = plt.hist(versicolor_petal_lengths, bins= n_bins, ec='black')

# Label axes
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('count')

# Show histogram
plt.show()

Convert sklearn dataset to DataFrame

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the sklearn iris dataset into `iris`.
# To check the dataset type use: type(load_iris())
# To view the list of attributes use: dir(load_iris())
iris = load_iris()

# np.c_ places its arguments side by side as columns, so the 1-D
# iris['target'] array becomes one extra column appended to the
# (150, 4) iris['data'] matrix, giving a (150, 5) table.
# For the column labels, the iris['feature_names'] list is extended with a
# one-element list naming the appended column; 'target' is an arbitrary
# choice (the original dataset would probably call it 'Species').
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
df.shape

(150, 5)

np.c_[np.array([1,2,3]), np.array([4,5,6])]

array([[1, 4],
       [2, 5],
       [3, 6]])

np.stack([[1,2,3],[4,5,6]], 1)

array([[1, 4],
       [2, 5],
       [3, 6]])

df.head()

iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='|S10')

# Add a human-readable 'species' column by translating the numeric target
# codes; the 0/1/2 -> name mapping follows the order of iris.target_names.
df['species']=df['target'].map({0:'setosa',1:'versicolor',2:'virginica'})

df.head()

Bee swarm plot

Make a bee swarm plot of the iris petal lengths. Your x-axis should contain each of the three species, and the y-axis the petal lengths.

A DataFrame containing the data is in your namespace as df.

# Create bee swarm plot with Seaborn's default settings:
# one swarm per value of the 'species' column, with points placed by the
# 'petal length (cm)' column of df
_ = sns.swarmplot(x='species', y='petal length (cm)', data=df)

# Label the axes
_ = plt.xlabel('species')
_ = plt.ylabel('petal length (cm)')

# Show the plot
plt.show()

Computing the ECDF

The function below takes a 1D array of data as input and returns the x and y values of the ECDF.

http://www.statsmodels.org/stable/generated/statsmodels.distributions.empirical_distribution.ECDF.html?highlight=ecdf

https://en.wikipedia.org/wiki/Empirical_distribution_function

def ecdf(data):
    """Return the empirical CDF of a one-dimensional sample.

    Parameters
    ----------
    data : sequence or array of numbers
        The measurements; must support ``len()``.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        The cumulative fractions ``1/n, 2/n, ..., 1`` matching ``x``,
        where ``n`` is ``len(data)``.
    """
    sample_size = len(data)

    # 1-based rank of each sorted value expressed as a fraction of the sample;
    # float() keeps the division from truncating under Python 2.
    cumulative_fractions = np.arange(1, sample_size + 1) / float(sample_size)

    return np.sort(data), cumulative_fractions

ecdf([1,2,3,4])

(array([1, 2, 3, 4]), array([0.25, 0.5 , 0.75, 1.  ]))

ecdf(versicolor_petal_lengths)

(array([3. , 3.3, 3.3, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 3.9, 4. , 4. ,
        4. , 4. , 4. , 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4,
        4.4, 4.4, 4.4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.6, 4.6, 4.6,
        4.7, 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5. , 5.1]),
 array([0.02, 0.04, 0.06, 0.08, 0.1 , 0.12, 0.14, 0.16, 0.18, 0.2 , 0.22,
        0.24, 0.26, 0.28, 0.3 , 0.32, 0.34, 0.36, 0.38, 0.4 , 0.42, 0.44,
        0.46, 0.48, 0.5 , 0.52, 0.54, 0.56, 0.58, 0.6 , 0.62, 0.64, 0.66,
        0.68, 0.7 , 0.72, 0.74, 0.76, 0.78, 0.8 , 0.82, 0.84, 0.86, 0.88,
        0.9 , 0.92, 0.94, 0.96, 0.98, 1.  ]))

# Compute ECDF for versicolor data: x_vers, y_vers
x_vers, y_vers = ecdf(versicolor_petal_lengths)

# Generate plot: one '.' marker per measurement, with no line drawn
# between the points
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')

# Make the margins nice (keeps data off the plot edges)
plt.margins(0.02)

# Label the axes
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()

df.head()

# Use a wide figure so the three overlaid ECDFs are easy to compare
plt.figure(figsize=(15, 5))

# Compute ECDFs of the petal length for each species (target codes 0, 1, 2
# are setosa, versicolor, virginica). Every species must read the SAME
# column: selecting 'petal length (cm)' by name, rather than a different
# positional column per species, keeps the curves comparable and consistent
# with the x-axis label below.
x_set, y_set = ecdf(df.loc[df['target'] == 0, 'petal length (cm)'])
x_vers, y_vers = ecdf(df.loc[df['target'] == 1, 'petal length (cm)'])
x_virg, y_virg = ecdf(df.loc[df['target'] == 2, 'petal length (cm)'])

# Plot all ECDFs on the same plot
_ = plt.plot(x_set, y_set, marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none')

# Make nice margins
plt.margins(0.02)

# Annotate the plot; legend entries are listed in the same order as the
# plot calls above
_ = plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()

# Load the vote-count CSV. Note that this rebinds df, so the iris DataFrame
# built earlier is no longer reachable under that name from here on.
df = pd.read_csv('./2008_all_states.csv')
df.shape

(3153, 8)

df.tail()

df['state'].value_counts().head()

TX    254
GA    159
VA    134
KY    120
MO    115
Name: state, dtype: int64

df.pivot_table(index='east_west',values='total_votes', aggfunc='mean')

df.pivot_table(index='state',values='total_votes', aggfunc='mean').tail()

df.head()

# Keep only the rows for the three states of interest (PA, OH, IL).
# Series.isin performs the membership test in a single vectorized call
# instead of invoking a Python lambda once per row; the selected rows are
# the same.
swing = df[df['state'].isin(['PA', 'OH', 'IL'])]
swing.shape

(257, 8)

# df_swing is a second name for the same DataFrame object as swing (plain
# assignment, no copy is made)
df_swing = swing

# Histogram of the dem_share column over the selected rows (default binning)
_ = plt.hist(df_swing['dem_share'])
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('number of counties')
plt.show()

# Bee swarm plot of dem_share, one swarm per value of the 'state' column
_ = sns.swarmplot(x='state', y='dem_share', data=df_swing)
_ = plt.xlabel('state')
_ = plt.ylabel('percent of vote for Obama')
plt.show()

import numpy as np
plt.figure(figsize=[16,5])
# Reuse the ecdf() helper defined earlier in this file instead of repeating
# its sort / arange arithmetic inline; x and y hold the same values as the
# hand-written version (sorted dem_share and cumulative fractions 1/n..1).
x, y = ecdf(df_swing['dem_share'])
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()

swing['state'].unique()

array(['IL', 'PA', 'OH'], dtype=object)

def ecdf(data):
    """Compute the ECDF coordinates for one-dimensional measurements.

    Returns a tuple ``(x, y)``: ``x`` is ``data`` sorted in ascending order
    and ``y`` holds the matching cumulative fractions ``1/n, 2/n, ..., 1``,
    where ``n`` is ``len(data)``.
    """
    n_points = len(data)
    ordered = np.sort(data)

    # Ranks 1..n as floats, so the division below is a true division on
    # both Python 2 and Python 3
    ranks = np.arange(1, n_points + 1, dtype=float)

    return ordered, ranks / n_points

# Show the documentation for plt.plot. The `name?` form is IPython-only
# syntax and is a SyntaxError in a plain Python file; help() is the
# built-in equivalent.
help(plt.plot)

plt.figure(figsize=[16,5])

# ECDF of dem_share computed separately for each of the three states
x1, y1 = ecdf(swing[swing['state']=='IL']['dem_share'])
x2, y2 = ecdf(swing[swing['state']=='PA']['dem_share'])
x3, y3 = ecdf(swing[swing['state']=='OH']['dem_share'])

# Overlay the three ECDFs as unconnected '.' markers
_ = plt.plot(x1, y1, marker='.', linestyle='none')
_ = plt.plot(x2, y2, marker='.', linestyle='none')
_ = plt.plot(x3, y3, marker='.', linestyle='none')

# Identify the series: without a legend the three curves cannot be told
# apart. Entries are listed in the same order as the plot calls above.
_ = plt.legend(('IL', 'PA', 'OH'), loc='lower right')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()

	state	county	total_votes	dem_votes	rep_votes	other_votes	dem_share	east_west
3148	OH	Hamilton County	425086	225213	195530	4343	53.53	east
3149	OH	Highland County	19186	6856	11907	423	36.54	east
3150	OH	Hocking County	12961	6259	6364	338	49.58	east
3151	OH	Licking County	82356	33932	46918	1506	41.97	east
3152	OH	Madison County	17454	6532	10606	316	38.11	east

	total_votes
state
VT	23217.571429
WA	77868.666667
WI	41436.347222
WV	12970.218182
WY	11072.086957

	state	county	total_votes	dem_votes	rep_votes	other_votes	dem_share	east_west
0	AK	State House District 8, Denali-University	10320	4995	4983	342	50.06	west
1	AK	State House District 37, Bristol Bay-Aleuti	4665	1868	2661	136	41.24	west
2	AK	State House District 12, Richardson-Glenn H	7589	1914	5467	208	25.93	west
3	AK	State House District 13, Greater Palmer	11526	2800	8432	294	24.93	west
4	AK	State House District 14, Greater Wasilla	10456	2132	8108	216	20.82	west

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2

	sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2