Series.groupby
and
DataFrame.groupby
GroupBy
With
series.loc[series.index == borough]
we get five shorter
Series
es.
With
series.loc[borough]
we get three shorter
Series
es
(Bronx, Manhattan, Queens)
and two scalars
(Brooklyn, Staten Island).
import sys import pandas as pd shootings = [ ["Brooklyn", 20], ["Bronx", 10], ["Bronx", 5], ["Manhattan", 40], ["Queens", 10], ["Staten Island", 12], ["Manhattan", 20], ["Queens", 30], ["Bronx", 7] ] index = pd.Index([shooting[0] for shooting in shootings], name = "Boroughs") data = [shooting[1] for shooting in shootings] series = pd.Series(data = data, index = index, name = "Shootings") print(series) print() for borough in series.index.unique().sort_values(): print(f'{borough}{(80 - len(borough)) * "-"}') print(series.loc[series.index == borough]) #what goes wrong with series.loc[borough] print() sys.exit(0)
Boroughs Brooklyn 20 Bronx 10 Bronx 5 Manhattan 40 Queens 10 Staten Island 12 Manhattan 20 Queens 30 Bronx 7 Name: Shootings, dtype: int64 Bronx--------------------------------------------------------------------------- Boroughs Bronx 10 Bronx 5 Bronx 7 Name: Shootings, dtype: int64 Brooklyn------------------------------------------------------------------------ Boroughs Brooklyn 20 Name: Shootings, dtype: int64 Manhattan----------------------------------------------------------------------- Boroughs Manhattan 40 Manhattan 20 Name: Shootings, dtype: int64 Queens-------------------------------------------------------------------------- Boroughs Queens 10 Queens 30 Name: Shootings, dtype: int64 Staten Island------------------------------------------------------------------- Boroughs Staten Island 12 Name: Shootings, dtype: int64
Manhattan had two shootings,
totalling 60 victims.
Think of
series.index
as the following list of nine strings, containing five distinct values:
[ "Brooklyn", "Bronx", "Bronx", "Manhattan", "Queens", "Staten Island", "Manhattan", "Queens", "Bronx" ]That’s why
groupby
divides the
Series
into five shorter
Series
es.
import sys
import pandas as pd
shootings = [
["Brooklyn", 20],
["Bronx", 10],
["Bronx", 5],
["Manhattan", 40],
["Queens", 10],
["Staten Island", 12],
["Manhattan", 20],
["Queens", 30],
["Bronx", 7]
]
index = pd.Index([shooting[0] for shooting in shootings], name = "Boroughs")
data = [shooting[1] for shooting in shootings]
series = pd.Series(data = data, index = index, name = "Shootings")
print(series)
print()
groups = series.groupby(series.index)
for groupName, group in groups: #groupName is string, group is a Series of ints
print(f'{groupName}{(80 - len(groupName)) * "-"}')
#group.name = f"{groupName} Shootings"
print(group)
print()
sys.exit(0)
Sorry that each of the five smaller
Series
es
is named
"Shootings"
,
but we could rename them by uncommenting the above comment.
Boroughs Brooklyn 20 Bronx 10 Bronx 5 Manhattan 40 Queens 10 Staten Island 12 Manhattan 20 Queens 30 Bronx 7 Name: Shootings, dtype: int64 Bronx--------------------------------------------------------------------------- Boroughs Bronx 10 Bronx 5 Bronx 7 Name: Shootings, dtype: int64 Brooklyn------------------------------------------------------------------------ Boroughs Brooklyn 20 Name: Shootings, dtype: int64 Manhattan----------------------------------------------------------------------- Boroughs Manhattan 40 Manhattan 20 Name: Shootings, dtype: int64 Queens-------------------------------------------------------------------------- Boroughs Queens 10 Queens 30 Name: Shootings, dtype: int64 Staten Island------------------------------------------------------------------- Boroughs Staten Island 12 Name: Shootings, dtype: int64
Think of
series.values >= 15
as the following list of nine bools,
containing two distinct values:
[ True, False, False, True, False, False, True, True, False ]That’s why
groupby
divides the
Series
into two shorter Serieses.
import sys
import pandas as pd
shootings = [
["Brooklyn", 20],
["Bronx", 10],
["Bronx", 5],
["Manhattan", 40],
["Queens", 10],
["Staten Island", 12],
["Manhattan", 20],
["Queens", 30],
["Bronx", 7]
]
index = pd.Index([shooting[0] for shooting in shootings], name = "Boroughs")
data = [shooting[1] for shooting in shootings]
series = pd.Series(data = data, index = index, name = "Shootings")
groups = series.groupby(series.values >= 15)
for groupName, group in groups: #groupName is bool, group is Series of ints
groupName = "Big shootings" if groupName else "Small Shootings"
print(f'{groupName}{(80 - len(groupName)) * "-"}')
print(group)
print()
sys.exit(0)
Small Shootings----------------------------------------------------------------- Boroughs Bronx 10 Bronx 5 Queens 10 Staten Island 12 Bronx 7 Name: Shootings, dtype: int64 Big shootings------------------------------------------------------------------- Boroughs Brooklyn 20 Manhattan 40 Manhattan 20 Queens 30 Name: Shootings, dtype: int64
Try the following groupings:
groups = series.groupby(series.values % 2 == 0)
#Instead of "Big shootings" and "Small shootings", the names should be
#"Odd numbers of victims" and "Even number of victims".
groups = series.groupby(series.index.str.startswith("B"))
#Instead of "Big shootings" and "Small shootings", the names should be "B's" and "Non-B's".
groups = series.groupby(nparange(9) >= 4) #import numpy as np
#Instead of "Big shootings" and "Small shootings", the names should be "First four" and "Last five".
arbitraryListOfBools = [True, False, True, True, False, False, True, True, True]
groups = series.groupby(arbitraryListOfBools)
#Instead of "Big shootings" and "Small shootings", make up two names.
arbitraryListOfColors = ["red", "blue", "red", "gold", "blue, "red", "blue, "blue", "red"]
groups = series.groupby(arbitraryListOfColors)
#Instead of "Big shootings" and "Small shootings", make up three names.
Think of
df.index
as the following list of ten
int
s:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Think of
df.index // 2
as the following list of ten
int
s, containing five distinct values:
[0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
That’s why
groupby
divides the
DataFrame
into five shorter
DataFrame
s.
import sys
import numpy as np
import pandas as pd
nrows = 10
ncols = 10
data = np.arange(nrows * ncols).reshape(nrows, ncols)
df = pd.DataFrame(data = data)
print(df)
print()
groups = df.groupby(df.index // 2) #or groups = df.groupby(df.index // 2, axis = 0)
for groupName, group in groups: #groupName is int, group is DataFrame
print(f'{groupName}{(80 - len(str(groupName))) * "-"}')
print(group)
print()
sys.exit(0)
0 1 2 3 4 5 6 7 8 9 0 0 1 2 3 4 5 6 7 8 9 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 54 55 56 57 58 59 6 60 61 62 63 64 65 66 67 68 69 7 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87 88 89 9 90 91 92 93 94 95 96 97 98 99 0------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 0 0 1 2 3 4 5 6 7 8 9 1 10 11 12 13 14 15 16 17 18 19 1------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 2 20 21 22 23 24 25 26 27 28 29 3 30 31 32 33 34 35 36 37 38 39 2------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 54 55 56 57 58 59 3------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 6 60 61 62 63 64 65 66 67 68 69 7 70 71 72 73 74 75 76 77 78 79 4------------------------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 8 80 81 82 83 84 85 86 87 88 89 9 90 91 92 93 94 95 96 97 98 99
import sys
import numpy as np
import pandas as pd
nrows = 10
ncols = 10
data = np.arange(nrows * ncols).reshape(nrows, ncols)
df = pd.DataFrame(data = data)
print(df)
print()
groups = df.groupby(df.index // 2, axis = 1)
for groupName, group in groups: #groupName is int, group is DataFrame
print(f'{groupName}{(80 - len(str(groupName))) * "-"}')
print(group)
print()
sys.exit(0)
0------------------------------------------------------------------------------- 0 1 0 0 1 1 10 11 2 20 21 3 30 31 4 40 41 5 50 51 6 60 61 7 70 71 8 80 81 9 90 91 1------------------------------------------------------------------------------- 2 3 0 2 3 1 12 13 2 22 23 3 32 33 4 42 43 5 52 53 6 62 63 7 72 73 8 82 83 9 92 93 2------------------------------------------------------------------------------- 4 5 0 4 5 1 14 15 2 24 25 3 34 35 4 44 45 5 54 55 6 64 65 7 74 75 8 84 85 9 94 95 3------------------------------------------------------------------------------- 6 7 0 6 7 1 16 17 2 26 27 3 36 37 4 46 47 5 56 57 6 66 67 7 76 77 8 86 87 9 96 97 4------------------------------------------------------------------------------- 8 9 0 8 9 1 18 19 2 28 29 3 38 39 4 48 49 5 58 59 6 68 69 7 78 79 8 88 89 9 98 99
Let’s go back to our original Series.
import sys
import pandas as pd
shootings = [
["Brooklyn", 20],
["Bronx", 10],
["Bronx", 5],
["Manhattan", 40],
["Queens", 10],
["Staten Island", 12],
["Manhattan", 20],
["Queens", 30],
["Bronx", 7]
]
index = pd.Index([shooting[0] for shooting in shootings], name = "Boroughs")
data = [shooting[1] for shooting in shootings]
series = pd.Series(data = data, index = index, name = "Shootings")
print(series)
print()
aggregatedSeries = series.groupby(series.index).size()
print(aggregatedSeries)
sys.exit(0)
This time we are creating one
Series
containing five rows,
instead of creating five
Series
es.
The
size
function will measure the size of the five
Series
es
we created in the original program.
For example,
it will first measure the size of the
Bronx
Series
,
whose values are 10, 5, 7.
Boroughs Brooklyn 20 Bronx 10 Bronx 5 Manhattan 40 Queens 10 Staten Island 12 Manhattan 20 Queens 30 Bronx 7 Name: Shootings, dtype: int64 Boroughs Bronx 3 Brooklyn 1 Manhattan 2 Queens 2 Staten Island 1 Name: Shootings, dtype: int64
Try the following aggregations:
aggregatedSeries = series.groupby(series.index).size()
aggregatedSeries = series.groupby(series.index).count()
aggregatedSeries = series.groupby(series.index).sum()
aggregatedSeries = series.groupby(series.index).mean() #or median
aggregatedSeries = series.groupby(series.index).max() #or min
See
GroupBy
for the complete list.
def spread(series): "How far apart are the largest and smallest numbers in the Series?" return series.max() - series.min() aggregatedSeries = series.groupby(series.index).agg(spread)
aggregatedSeries = series.groupby(series.index).agg(lambda series: series.max() - series.min())
Each of the above functions
(size
,
count
,
sum
,
mean
,
spread
)
returned one number,
and we ended up with an
aggregatedSeries
whose index had one level.
The following function
getstats
returns a
Series
of numbers
(so we must call
apply
instead of
agg
),
and we end up with an
aggregatedSeries
whose index has two levels.
I like the following
unstack
ing
with
level = -1
,
rather than
level = 0
,
because it lets me convert the counts, mins, and maxs to integers.
import sys import pandas as pd shootings = [ ["Brooklyn", 20], ["Bronx", 10], ["Bronx", 5], ["Manhattan", 40], ["Queens", 10], ["Staten Island", 12], ["Manhattan", 20], ["Queens", 30], ["Bronx", 7] ] index = pd.Index([shooting[0] for shooting in shootings], name = "Boroughs") data = [shooting[1] for shooting in shootings] series = pd.Series(data = data, index = index, name = "Shootings") print(series) print() def getstats(series): "Return a new Series of four numbers about the Series received as an argument." index = pd.Index(data = ["count", "min", "max", "mean"], name = "Functions") data = [series.count(), series.min(), series.max(), series.mean()] return pd.Series(data = data, index = index) aggregatedSeries = series.groupby(series.index).apply(getstats) print(aggregatedSeries) print() #Since the index of the aggregatedSeries has 2 levels, there are 2 ways to unstack it. df = aggregatedSeries.unstack(level = -1) #Level -1 is the rightmost level, Functions. df = df.astype({"count": int, "min": int, "max": int}) print(df) print() df = aggregatedSeries.unstack(level = 0) #Level 0 is the leftmost level, Boroughs. print(df) sys.exit(0)
Boroughs Brooklyn 20 Bronx 10 Bronx 5 Manhattan 40 Queens 10 Staten Island 12 Manhattan 20 Queens 30 Bronx 7 Name: Shootings, dtype: int64 Boroughs Functions Bronx count 3.000000 min 5.000000 max 10.000000 mean 7.333333 Brooklyn count 1.000000 min 20.000000 max 20.000000 mean 20.000000 Manhattan count 2.000000 min 20.000000 max 40.000000 mean 30.000000 Queens count 2.000000 min 10.000000 max 30.000000 mean 20.000000 Staten Island count 1.000000 min 12.000000 max 12.000000 mean 12.000000 Name: Shootings, dtype: float64 Functions count min max mean Boroughs Bronx 3 5 10 7.333333 Brooklyn 1 20 20 20.000000 Manhattan 2 20 40 30.000000 Queens 2 10 30 20.000000 Staten Island 1 12 12 12.000000 Boroughs Bronx Brooklyn Manhattan Queens Staten Island Functions count 3.000000 1.0 2.0 2.0 1.0 min 5.000000 20.0 20.0 10.0 12.0 max 10.000000 20.0 40.0 30.0 12.0 mean 7.333333 20.0 30.0 20.0 12.0
The above transformation
getstats
changed a
Series
into a new
Series
of four numbers.
Try the following transformation,
which will change a
Series
into a new
Series
of eight numbers.
Three different
describe
s:
df = series.groupby(series.index).describe()
Divide the
DataFrame
into five shorter
DataFrame
s.
For example,
the first shorter
DataFrame
is rows 0 and 1 of the original
DataFrame
.
The first column of this shorter
DataFrame
is the two numbers 0 and 10.
Their mean is 5.
import sys import numpy as np import pandas as pd nrows = 10 ncols = 10 data = np.arange(nrows * ncols).reshape(nrows, ncols) df = pd.DataFrame(data = data) print(df) print() aggregatedDf = df.groupby(df.index // 2).mean() print(aggregatedDf) sys.exit(0)
0 1 2 3 4 5 6 7 8 9 0 0 1 2 3 4 5 6 7 8 9 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 54 55 56 57 58 59 6 60 61 62 63 64 65 66 67 68 69 7 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87 88 89 9 90 91 92 93 94 95 96 97 98 99 0 1 2 3 4 5 6 7 8 9 0 5 6 7 8 9 10 11 12 13 14 1 25 26 27 28 29 30 31 32 33 34 2 45 46 47 48 49 50 51 52 53 54 3 65 66 67 68 69 70 71 72 73 74 4 85 86 87 88 89 90 91 92 93 94
The following function
f
is called 5 × 10 = 50 times,
because there are 5 short
DataFrame
s,
and each one has 10 columns.
import sys import numpy as np import pandas as pd nrows = 10 ncols = 10 data = np.arange(nrows * ncols).reshape(nrows, ncols) df = pd.DataFrame(data = data) print(df) print() def f(series): "Return the average of the largest and smallest numbers in the Series." return (series.max() + series.min()) / 2 aggregatedDf = df.groupby(df.index // 2).agg(f) print(aggregatedDf) sys.exit(0)
0 1 2 3 4 5 6 7 8 9 0 0 1 2 3 4 5 6 7 8 9 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 54 55 56 57 58 59 6 60 61 62 63 64 65 66 67 68 69 7 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87 88 89 9 90 91 92 93 94 95 96 97 98 99 0 1 2 3 4 5 6 7 8 9 0 5 6 7 8 9 10 11 12 13 14 1 25 26 27 28 29 30 31 32 33 34 2 45 46 47 48 49 50 51 52 53 54 3 65 66 67 68 69 70 71 72 73 74 4 85 86 87 88 89 90 91 92 93 94
The
agg
statement calls
f
5 × 10 = 50 times,
because there are 5 short
DataFrame
s,
and each one has 10 columns.
The
apply
statement calls
f
5
because there are 5 short
DataFrame
s.
import sys import numpy as np import pandas as pd nrows = 10 ncols = 10 data = np.arange(nrows * ncols).reshape(nrows, ncols) df = pd.DataFrame(data = data) print(df) print() #When f is called by agg, x is a Series of 2 ints. #When f is called by apply, x is a 2 by 10 DataFrame holding 20 ints. def f(x): "Return the number of numbers in x." return x.size #or try x.ndim or x.shape aggregatedDf = df.groupby(df.index // 2).agg(f) print(aggregatedDf) print() aggregatedSeries = df.groupby(df.index // 2).apply(f) print(aggregatedSeries) print() sys.exit(0)
0 1 2 3 4 5 6 7 8 9 0 0 1 2 3 4 5 6 7 8 9 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 54 55 56 57 58 59 6 60 61 62 63 64 65 66 67 68 69 7 70 71 72 73 74 75 76 77 78 79 8 80 81 82 83 84 85 86 87 88 89 9 90 91 92 93 94 95 96 97 98 99 0 1 2 3 4 5 6 7 8 9 0 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 0 20 1 20 2 20 3 20 4 20 dtype: int64
Series
vs.
DataFrame
.
Applied function returns scaler,
Series
,
DataFrame
.
describe
?
unstack
the result, quartile