import pandas as pd
# Toy sales data: two weekdays x two cities with bread/butter unit counts.
sales = pd.DataFrame({'weekday': ['Sun', 'Sun', 'Mon', 'Mon'],
                      'city': ['Austin', 'Dallas', 'Austin', 'Dallas'],
                      'bread': [139, 237, 326, 456],
                      'butter': [20, 45, 70, 98]})
sales
# Boolean filtering then counting vs. letting groupby do the split
sales.loc[sales['weekday'] == 'Sun'].count()
sales.groupby('weekday').count()
# A GroupBy object is lazy: nothing is computed until an aggregation is asked for
sales.groupby('weekday')
# print is a function in Python 3 -- the bare-statement form is a syntax error
print(dir(sales.groupby('weekday')))
sales.groupby('weekday').sum()
# std is only defined for numeric columns; pandas >= 2.0 raises a TypeError on
# the object column 'city' unless numeric_only is requested explicitly
sales.groupby('weekday').std(numeric_only=True)
sales.groupby('weekday').last()
# Selecting one column gives a SeriesGroupBy
sales.groupby('weekday')['bread'].sum()
# Multiple columns must be selected with a list -- the tuple form
# sales.groupby('weekday')['bread','butter'] was removed in pandas 1.0
sales.groupby('weekday')[['bread', 'butter']].sum()
# A one-element list keeps the DataFrame (not Series) shape
sales.groupby('weekday')[['bread']].sum()
# Multi-level grouping: one row per (city, weekday) pair
sales.groupby(['city', 'weekday']).mean()
# Grouping by an external Series aligns it with sales by position/index
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])
customers
sales
sales.groupby(customers)['bread'].sum()
sales['weekday'].unique()
# Categorical dtype saves memory and speeds up subsequent groupbys
sales['weekday'] = sales['weekday'].astype('category')
sales
sales['weekday']
# Multiple aggregations at once produce a column MultiIndex
sales.groupby('city')[['bread', 'butter']].agg(['max', 'sum'])
def data_range(series):
    """Return the spread of *series*: its maximum minus its minimum."""
    lo, hi = series.min(), series.max()
    return hi - lo
# Apply the custom range aggregation to both numeric columns, per weekday
sales.groupby('weekday')[['bread', 'butter']].agg(data_range)
# Per-column aggregations: a dict maps column -> function (builtin name or callable)
sales.groupby(customers)[['bread', 'butter']].agg({'bread':'sum', 'butter':data_range})
# Titanic passenger data (Kaggle 'train.csv'); note the capitalized column names
titanic = pd.read_csv('train.csv')
titanic.info()
titanic.head()
# Group titanic by 'pclass'
# Aggregate 'survived' column of by_class by count
p_group = titanic.groupby('Pclass')['Survived'].count()
p_group
# Group titanic by 'embarked' and 'pclass'
# Aggregate 'survived' column of by_mult by count
multi_group = titanic.groupby(['Embarked','Pclass']).count()
multi_group
# Same two-level grouping, but counting only the 'Survived' column
multi_survive = titanic.groupby(['Embarked','Pclass'])['Survived'].count()
multi_survive
# Group titanic by 'pclass': by_class
by_class = titanic.groupby('Pclass')
# Select 'age' and 'fare'
by_class_sub = by_class[['Age','Fare']]
# Aggregate by_class_sub by 'max' and 'median': aggregated
# (result columns are a MultiIndex: (column, aggregation))
aggregated = by_class_sub.agg(['max','median'])
aggregated
# Print the maximum age in each class
print(aggregated.loc[:, ('Age','max')])
# Print the median fare in each class
print(aggregated.loc[:, ('Fare','median')])
# Read life_fname into a DataFrame: life
life = pd.read_csv('https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/life_expectancy.csv',
                   index_col='Country')
# Read regions_fname into a DataFrame: regions
regions = pd.read_csv('https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/regions.csv',
                      index_col='Country')
# print is a function in Python 3 -- the bare-statement form is a syntax error
print(life.shape)
print(regions.shape)
life.head(3)
regions.head()
# Group life by regions['region']: life_by_region
# (the external Series aligns with life on the shared 'Country' index)
life_by_region = life.groupby(regions['region'])
# Print the mean over the '2010' column of life_by_region
life_by_region['2010'].mean()
from pandas import Timestamp
# February 2015 transactions, keyed by Timestamp so df gets a DatetimeIndex.
# NOTE(review): 'Acme Coporation' is misspelled in the source data -- kept
# as-is because it is runtime data, not commentary.
df = pd.DataFrame({'Company': {Timestamp('2015-02-02 08:30:00'): 'Hooli',
Timestamp('2015-02-02 21:00:00'): 'Mediacore',
Timestamp('2015-02-03 14:00:00'): 'Initech',
Timestamp('2015-02-04 15:30:00'): 'Streeplex',
Timestamp('2015-02-04 22:00:00'): 'Acme Coporation',
Timestamp('2015-02-05 02:00:00'): 'Acme Coporation',
Timestamp('2015-02-05 22:00:00'): 'Hooli',
Timestamp('2015-02-07 23:00:00'): 'Acme Coporation',
Timestamp('2015-02-09 09:00:00'): 'Streeplex',
Timestamp('2015-02-09 13:00:00'): 'Mediacore',
Timestamp('2015-02-11 20:00:00'): 'Initech',
Timestamp('2015-02-11 23:00:00'): 'Hooli',
Timestamp('2015-02-16 12:00:00'): 'Hooli',
Timestamp('2015-02-19 11:00:00'): 'Mediacore',
Timestamp('2015-02-19 16:00:00'): 'Mediacore',
Timestamp('2015-02-21 05:00:00'): 'Mediacore',
Timestamp('2015-02-21 20:30:00'): 'Hooli',
Timestamp('2015-02-25 00:30:00'): 'Initech',
Timestamp('2015-02-26 09:00:00'): 'Streeplex'},
'Product': {Timestamp('2015-02-02 08:30:00'): 'Software',
Timestamp('2015-02-02 21:00:00'): 'Hardware',
Timestamp('2015-02-03 14:00:00'): 'Software',
Timestamp('2015-02-04 15:30:00'): 'Software',
Timestamp('2015-02-04 22:00:00'): 'Hardware',
Timestamp('2015-02-05 02:00:00'): 'Software',
Timestamp('2015-02-05 22:00:00'): 'Service',
Timestamp('2015-02-07 23:00:00'): 'Hardware',
Timestamp('2015-02-09 09:00:00'): 'Service',
Timestamp('2015-02-09 13:00:00'): 'Software',
Timestamp('2015-02-11 20:00:00'): 'Software',
Timestamp('2015-02-11 23:00:00'): 'Software',
Timestamp('2015-02-16 12:00:00'): 'Software',
Timestamp('2015-02-19 11:00:00'): 'Hardware',
Timestamp('2015-02-19 16:00:00'): 'Service',
Timestamp('2015-02-21 05:00:00'): 'Software',
Timestamp('2015-02-21 20:30:00'): 'Hardware',
Timestamp('2015-02-25 00:30:00'): 'Service',
Timestamp('2015-02-26 09:00:00'): 'Service'},
'Units': {Timestamp('2015-02-02 08:30:00'): 3,
Timestamp('2015-02-02 21:00:00'): 9,
Timestamp('2015-02-03 14:00:00'): 13,
Timestamp('2015-02-04 15:30:00'): 13,
Timestamp('2015-02-04 22:00:00'): 14,
Timestamp('2015-02-05 02:00:00'): 19,
Timestamp('2015-02-05 22:00:00'): 10,
Timestamp('2015-02-07 23:00:00'): 1,
Timestamp('2015-02-09 09:00:00'): 19,
Timestamp('2015-02-09 13:00:00'): 7,
Timestamp('2015-02-11 20:00:00'): 7,
Timestamp('2015-02-11 23:00:00'): 4,
Timestamp('2015-02-16 12:00:00'): 10,
Timestamp('2015-02-19 11:00:00'): 16,
Timestamp('2015-02-19 16:00:00'): 10,
Timestamp('2015-02-21 05:00:00'): 3,
Timestamp('2015-02-21 20:30:00'): 3,
Timestamp('2015-02-25 00:30:00'): 10,
Timestamp('2015-02-26 09:00:00'): 4}})
df
# Create a groupby object: by_day
# strftime('%a') renders each timestamp as an abbreviated weekday name
# (locale-dependent -- 'Mon', 'Tue', ... under the C/English locale)
by_day = df.groupby(df.index.strftime('%a'))
by_day
df.index.strftime('%a')
# Create sum: units_sum
units_sum = by_day['Units'].sum()
units_sum
def Zscore(series):
    """Standardize *series*: subtract its mean, divide by its (sample) std."""
    centered = series - series.mean()
    return centered / series.std()
# Compare the hand-rolled Zscore against scipy's implementation
from scipy.stats import zscore
# NOTE: '%%timeit' is an IPython cell magic and a syntax error in a plain
# .py script, so the benchmark markers are kept as comments only:
# %%timeit
Zscore(df['Units'])
# %%timeit
zscore(df['Units'])
df.head()
# DatetimeIndex.weekday: Monday=0 ... Sunday=6
df.index.weekday
df.groupby(df.index.weekday).sum()
# transform must be limited to the numeric column -- running scipy's zscore
# over the string columns 'Company'/'Product' raises a TypeError
df.groupby(df.index.weekday)[['Units']].transform(zscore)
df.groupby(df.index.weekday)['Units'].transform(zscore)
# apply with the builtin sum must target the numeric 'Units' Series:
# sum() over a whole DataFrame group iterates column *labels* and fails
df.groupby(df.index.weekday)['Units'].apply(sum)
df.groupby(df.index.weekday)['Units'].apply(Zscore)
df.groupby(df.index.weekday)['Units'].apply(zscore)
# Re-inspect titanic's columns/null counts before imputing missing ages
titanic.info()
# Write a function that imputes median
def impute_median(series):
    """Fill missing entries of *series* with the series' median."""
    med = series.median()
    return series.fillna(med)
# Create a groupby object: by_sex_class
by_sex_class = titanic.groupby(['Sex','Pclass'])
# Non-null counts per (sex, class) group
by_sex_class.count()
# Impute age and assign to titanic['age']
# transform applies impute_median per group, so each group's own median is used
titanic['Age'] = by_sex_class['Age'].transform(impute_median)
titanic.head(4)
# Auto-mpg data; used to demonstrate iterating over a GroupBy object
auto = pd.read_csv('mpg.csv')
auto.info()
auto.head()
# Mean mpg per model year
auto.groupby('model_year')['mpg'].mean()
splitting = auto.groupby('model_year')
# print is a function in Python 3 -- the bare-statement form is a syntax error
print(type(splitting))
print(type(splitting.groups))
print(splitting.groups)
print(splitting.groups.keys())
# Iterating a GroupBy yields (group_name, sub-DataFrame) pairs
for group_name, group in splitting:
    avg = group['mpg'].mean()
    print(group_name, avg)
# Per-year mean mpg restricted to Chevrolet models
for group_name, group in splitting:
    avg = group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    print(group_name, avg)
# Same computation as a dict comprehension over the GroupBy
chevy_means = {year:group.loc[group['name'].str.contains('chevrolet'),'mpg'].mean()
for year,group in splitting}
chevy_means
# Boolean Series marking Chevrolet rows; usable directly as a grouping key
chevy = auto['name'].str.contains('chevrolet')
print(type(chevy))
print(chevy.shape)
chevy.head()
auto.shape
# Average the numeric columns only -- object columns such as 'name'
# cannot be averaged (pandas >= 2.0 raises without numeric_only=True)
auto.groupby(['model_year', chevy]).mean(numeric_only=True)
titanic.head()
def c_deck_survival(gr):
    """Mean 'Survived' among rows of *gr* whose 'Cabin' starts with 'C'.

    Missing cabins produce NaN from startswith, so they are treated as
    not-on-C-deck via fillna(False).
    """
    on_c_deck = gr['Cabin'].str.startswith('C').fillna(False)
    return gr.loc[on_c_deck, 'Survived'].mean()
# Create a groupby object using titanic over the 'sex' column: by_sex
by_sex = titanic.groupby('Sex')
# Call by_sex.apply with the function c_deck_survival and print the result
c_surv_by_sex = by_sex.apply(c_deck_survival)
c_surv_by_sex
# One-liner equivalent: filter to C-deck rows first, then group by sex
titanic.loc[titanic['Cabin'].str.startswith('C').fillna(False)].groupby('Sex')['Survived'].mean()
# startswith yields NaN where 'Cabin' is missing ...
titanic['Cabin'].str.startswith('C').head(10)
# ... hence the fillna(False) before boolean indexing
titanic['Cabin'].str.startswith('C').fillna(False).head(10)
titanic.loc[titanic['Cabin'].str.startswith('C').fillna(False)].groupby('Sex')['Survived'].mean()
df
# Group sales by 'Company': by_company
by_company = df.groupby('Company')
# Compute the sum of the 'Units' of by_company: by_com_sum
by_com_sum = by_company['Units'].sum()
by_com_sum
# Filter 'Units' where the sum is > 35: by_com_filt
# filter keeps all original rows of every group passing the predicate
by_com_filt = by_company.filter(lambda g:g['Units'].sum() > 35)
by_com_filt
# Create the Boolean Series: under10
# map turns the True/False values into readable group labels
under10 = (titanic['Age'] < 10).map({True:'under 10', False:'over 10'})
# print is a function in Python 3 -- the bare-statement form is a syntax error
print(type(under10))
under10.head(10)
# Group by under10 and compute the survival rate
survived_mean_1 = titanic.groupby(under10)['Survived'].mean()
print(survived_mean_1)
# Group by under10 and pclass and compute the survival rate
survived_mean_2 = titanic.groupby([under10, 'Pclass'])['Survived'].mean()
survived_mean_2