In [9]:
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Set default Seaborn style
sns.set()

from sklearn.datasets import load_iris

data = load_iris()
type(data)
Out[9]:
sklearn.utils.Bunch
In [10]:
data.target_names
Out[10]:
array(['setosa', 'versicolor', 'virginica'], dtype='|S10')
In [11]:
data.data.shape
Out[11]:
(150, 4)
In [12]:
data.target
Out[12]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [13]:
# Keep only the rows whose class code is 1, i.e. 'versicolor'
# (index 1 in data.target_names); all four feature columns are retained.
versicolor = data.data[data.target==1]
versicolor.shape
Out[13]:
(50, 4)
In [14]:
data.feature_names
Out[14]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [15]:
# Column 2 of the feature matrix is 'petal length (cm)' (see data.feature_names)
versicolor_petal_lengths = versicolor[:, 2]

# print() called with a single argument behaves identically on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(versicolor_petal_lengths.shape)
versicolor_petal_lengths
(50,)
Out[15]:
array([4.7, 4.5, 4.9, 4. , 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4. ,
       4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4. , 4.9, 4.7, 4.3, 4.4,
       4.8, 5. , 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1,
       4. , 4.4, 4.6, 4. , 3.3, 4.2, 4.2, 4.2, 4.3, 3. , 4.1])

Plotting a histogram of iris data

Plot a histogram of the petal lengths of the 50 samples of Iris versicolor using matplotlib/seaborn's default settings. Recall that to specify the default seaborn style, you can use sns.set(), where sns is the alias that seaborn is imported as.

The subset of the data set containing the Iris versicolor petal lengths in units of centimeters (cm) is stored in the NumPy array versicolor_petal_lengths.

Also, Justin assigned his plotting statements (except for plt.show()) to the dummy variable _. This is to prevent unnecessary output from being displayed.

Adjusting the number of bins in a histogram

In [24]:
# Import numpy
import numpy as np

# Compute number of data points: n_data
n_data = len(versicolor_petal_lengths)
print(n_data)

# Square-root rule: number of bins is the square root of the number of
# data points: n_bins
n_bins = np.sqrt(n_data)
print(n_bins)

# Convert number of bins to integer: n_bins
# int() truncates the fractional part (7.07... becomes 7).
n_bins = int(n_bins)
print(n_bins)
50
7.0710678118654755
7
In [26]:
# Histogram of the versicolor petal lengths, bars outlined in black
_ = plt.hist(versicolor_petal_lengths, bins=n_bins, edgecolor='black')

# Axis labels
_ = plt.ylabel('count')
_ = plt.xlabel('petal length (cm)')

# Render the figure
plt.show()

Convert sklearn dataset to DataFrame

In [32]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Store the dataset object returned by sklearn's load_iris() in `iris`.
# To check the dataset type use: type(load_iris())
# To view the list of attributes use: dir(load_iris())
iris = load_iris()

# np.c_ stacks its arguments side by side as columns, so the 1-D
# iris['target'] array becomes one extra column to the right of the
# iris['data'] feature columns.
# Because features and class codes end up in one array, the class codes
# are shown as floats (e.g. 0.0) in the resulting DataFrame.
# Column labels: the iris['feature_names'] list plus one extra label for the
# class column. That label is arbitrary ('target' here; the original dataset
# would probably call it 'Species').
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])
df.shape
Out[32]:
(150, 5)
In [40]:
np.c_[np.array([1,2,3]), np.array([4,5,6])]
Out[40]:
array([[1, 4],
       [2, 5],
       [3, 6]])
In [51]:
np.stack([[1,2,3],[4,5,6]], 1)
Out[51]:
array([[1, 4],
       [2, 5],
       [3, 6]])
In [35]:
df.head()
Out[35]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0.0
1 4.9 3.0 1.4 0.2 0.0
2 4.7 3.2 1.3 0.2 0.0
3 4.6 3.1 1.5 0.2 0.0
4 5.0 3.6 1.4 0.2 0.0
In [55]:
iris.target_names
Out[55]:
array(['setosa', 'versicolor', 'virginica'], dtype='|S10')
In [56]:
# Add a readable 'species' column by translating each numeric class code
df['species'] = df['target'].map({
    0: 'setosa',
    1: 'versicolor',
    2: 'virginica',
})
In [57]:
df.head()
Out[57]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target species
0 5.1 3.5 1.4 0.2 0.0 setosa
1 4.9 3.0 1.4 0.2 0.0 setosa
2 4.7 3.2 1.3 0.2 0.0 setosa
3 4.6 3.1 1.5 0.2 0.0 setosa
4 5.0 3.6 1.4 0.2 0.0 setosa

Bee swarm plot

Make a bee swarm plot of the iris petal lengths. Your x-axis should contain each of the three species, and the y-axis the petal lengths.

  • A data frame containing the data is in your namespace as df
In [58]:
# Bee swarm plot (Seaborn defaults): species on the x-axis, petal length on y
_ = sns.swarmplot(data=df, x='species', y='petal length (cm)')

# Axis labels
_ = plt.ylabel('petal length (cm)')
_ = plt.xlabel('species')

# Render the figure
plt.show()
In [61]:
def ecdf(data):
    """Return the empirical CDF of one-dimensional measurements.

    The result is a pair ``(x, y)``: ``x`` holds the measurements in
    ascending order and ``y`` holds the matching cumulative fractions
    ``1/n, 2/n, ..., 1`` where ``n`` is the number of measurements.
    """
    count = len(data)

    # Measurements in ascending order form the horizontal coordinates.
    sorted_values = np.sort(data)

    # float() forces true division even under Python 2 integer semantics.
    cumulative_fraction = np.arange(1, count + 1) / float(count)

    return sorted_values, cumulative_fraction
In [62]:
ecdf([1,2,3,4])
Out[62]:
(array([1, 2, 3, 4]), array([0.25, 0.5 , 0.75, 1.  ]))
In [66]:
ecdf(versicolor_petal_lengths)
Out[66]:
(array([3. , 3.3, 3.3, 3.5, 3.5, 3.6, 3.7, 3.8, 3.9, 3.9, 3.9, 4. , 4. ,
        4. , 4. , 4. , 4.1, 4.1, 4.1, 4.2, 4.2, 4.2, 4.2, 4.3, 4.3, 4.4,
        4.4, 4.4, 4.4, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.6, 4.6, 4.6,
        4.7, 4.7, 4.7, 4.7, 4.7, 4.8, 4.8, 4.9, 4.9, 5. , 5.1]),
 array([0.02, 0.04, 0.06, 0.08, 0.1 , 0.12, 0.14, 0.16, 0.18, 0.2 , 0.22,
        0.24, 0.26, 0.28, 0.3 , 0.32, 0.34, 0.36, 0.38, 0.4 , 0.42, 0.44,
        0.46, 0.48, 0.5 , 0.52, 0.54, 0.56, 0.58, 0.6 , 0.62, 0.64, 0.66,
        0.68, 0.7 , 0.72, 0.74, 0.76, 0.78, 0.8 , 0.82, 0.84, 0.86, 0.88,
        0.9 , 0.92, 0.94, 0.96, 0.98, 1.  ]))
In [64]:
# ECDF coordinates for the versicolor petal lengths: x_vers, y_vers
x_vers, y_vers = ecdf(versicolor_petal_lengths)

# Draw the ECDF as unconnected points
_ = plt.plot(x_vers, y_vers, linestyle='none', marker='.')

# Axis labels
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Small padding so the points do not sit on the plot edges
plt.margins(0.02)

# Render the figure
plt.show()
In [68]:
df.head()
Out[68]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target species
0 5.1 3.5 1.4 0.2 0.0 setosa
1 4.9 3.0 1.4 0.2 0.0 setosa
2 4.7 3.2 1.3 0.2 0.0 setosa
3 4.6 3.1 1.5 0.2 0.0 setosa
4 5.0 3.6 1.4 0.2 0.0 setosa
In [76]:
# Compare the petal-length ECDFs of the three iris species on one figure
plt.figure(figsize=(15, 5))

# Every species must be read from the SAME measurement column; otherwise the
# curves are not comparable and contradict the x-axis label below.
# (Positional columns 0, 1, 2 are sepal length, sepal width and petal length.)
petal_col = 'petal length (cm)'

# Compute ECDFs. Class codes in 'target': 0 = setosa, 1 = versicolor,
# 2 = virginica.
x_set, y_set = ecdf(df[petal_col][df['target'] == 0])
x_vers, y_vers = ecdf(df[petal_col][df['target'] == 1])
x_virg, y_virg = ecdf(df[petal_col][df['target'] == 2])

# Plot all ECDFs on the same plot
_ = plt.plot(x_set, y_set, marker='.', linestyle='none')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none')

# Make nice margins
plt.margins(0.02)

# Annotate the plot; legend entries follow the order of the plot calls above
_ = plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()

In [78]:
# Load the vote-count table from a CSV file in the working directory.
# NOTE: this rebinds `df`, which until now held the iris DataFrame; cells
# below this point operate on the election data instead.
df = pd.read_csv('./2008_all_states.csv')
df.shape
Out[78]:
(3153, 8)
In [80]:
df.tail()
Out[80]:
state county total_votes dem_votes rep_votes other_votes dem_share east_west
3148 OH Hamilton County 425086 225213 195530 4343 53.53 east
3149 OH Highland County 19186 6856 11907 423 36.54 east
3150 OH Hocking County 12961 6259 6364 338 49.58 east
3151 OH Licking County 82356 33932 46918 1506 41.97 east
3152 OH Madison County 17454 6532 10606 316 38.11 east
In [83]:
df['state'].value_counts().head()
Out[83]:
TX    254
GA    159
VA    134
KY    120
MO    115
Name: state, dtype: int64
In [91]:
df.pivot_table(index='east_west',values='total_votes', aggfunc='mean')
Out[91]:
total_votes
east_west
east 50271.391413
west 32811.060802
In [93]:
df.pivot_table(index='state',values='total_votes', aggfunc='mean').tail()
Out[93]:
total_votes
state
VT 23217.571429
WA 77868.666667
WI 41436.347222
WV 12970.218182
WY 11072.086957
In [94]:
df.head()
Out[94]:
state county total_votes dem_votes rep_votes other_votes dem_share east_west
0 AK State House District 8, Denali-University 10320 4995 4983 342 50.06 west
1 AK State House District 37, Bristol Bay-Aleuti 4665 1868 2661 136 41.24 west
2 AK State House District 12, Richardson-Glenn H 7589 1914 5467 208 25.93 west
3 AK State House District 13, Greater Palmer 11526 2800 8432 294 24.93 west
4 AK State House District 14, Greater Wasilla 10456 2132 8108 216 20.82 west
In [96]:
# Keep only the rows for Pennsylvania, Ohio and Illinois.
# Series.isin does the membership test in one vectorized call instead of
# invoking a Python lambda once per row.
swing = df[df['state'].isin(['PA', 'OH', 'IL'])]
swing.shape
Out[96]:
(257, 8)
In [100]:
df_swing = swing
In [101]:
# Histogram of the Democratic vote share across the selected counties
_ = plt.hist(df_swing['dem_share'])

# Axis labels
_ = plt.ylabel('number of counties')
_ = plt.xlabel('percent of vote for Obama')

plt.show()
In [102]:
# Bee swarm plot of the Democratic vote share, one column of points per state
_ = sns.swarmplot(data=df_swing, x='state', y='dem_share')

# Axis labels
_ = plt.ylabel('percent of vote for Obama')
_ = plt.xlabel('state')

plt.show()
In [110]:
import numpy as np
plt.figure(figsize=[16,5])

# ECDF of the Democratic vote share over all selected counties, computed with
# the notebook's ecdf() helper instead of repeating its sort/arange
# arithmetic inline.
x, y = ecdf(df_swing['dem_share'])

_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()
In [113]:
swing['state'].unique()
Out[113]:
array(['IL', 'PA', 'OH'], dtype=object)
In [120]:
def ecdf(data):
    """Compute the empirical CDF for a one-dimensional set of measurements.

    Returns a tuple ``(x, y)`` where ``x`` is the data sorted ascending and
    ``y[i]`` is ``(i + 1) / n``, with ``n`` the number of data points.
    """
    n_points = len(data)

    # Horizontal axis: the observations from smallest to largest.
    ordered = np.sort(data)

    # Vertical axis: ranks 1..n scaled into (0, 1]; float() keeps the
    # division fractional on Python 2 as well.
    ranks = np.arange(1, n_points + 1)
    fractions = ranks / float(n_points)

    return ordered, fractions
In [117]:
plt.plot?
In [121]:
# Overlay the Democratic vote-share ECDFs of the three states on one figure
plt.figure(figsize=[16,5])

# One ECDF per state, computed from that state's rows only
x1, y1 = ecdf(swing[swing['state']=='IL']['dem_share'])
x2, y2 = ecdf(swing[swing['state']=='PA']['dem_share'])
x3, y3 = ecdf(swing[swing['state']=='OH']['dem_share'])

_ = plt.plot(x1, y1, marker='.', linestyle='none')
_ = plt.plot(x2, y2, marker='.', linestyle='none')
_ = plt.plot(x3, y3, marker='.', linestyle='none')

# Without a legend the three point clouds cannot be told apart; the entries
# follow the order of the plot calls above.
_ = plt.legend(('IL', 'PA', 'OH'), loc='lower right')

_ = plt.xlabel('percent of vote for Obama')
_ = plt.ylabel('ECDF')
plt.margins(0.02) # Keeps data off plot edges
plt.show()