In [1]:
import numpy as np
import pandas as pd

PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc("figure", figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)
In [2]:
import numpy as np
import pandas as pd
In [3]:
df = pd.DataFrame({"key1" : ["a", "a", None, "b", "b", "a", None],
                   "key2" : pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
                   "data1" : np.random.standard_normal(7),
                   "data2" : np.random.standard_normal(7)})
df
In [4]:
grouped = df["data1"].groupby(df["key1"])
grouped
In [6]:
means = df["data1"].groupby([df["key1"], df["key2"]]).mean()
means
In [8]:
states = np.array(["OH", "CA", "CA", "OH", "OH", "CA", "OH"])
years = [2005, 2005, 2006, 2005, 2006, 2005, 2006]
df["data1"].groupby([states, years]).mean()
In [9]:
df.groupby("key1").mean() df.groupby("key2").mean(numeric_only=True) df.groupby(["key1", "key2"]).mean()
In [10]:
df.groupby(["key1", "key2"]).size()
In [11]:
df.groupby("key1", dropna=False).size() df.groupby(["key1", "key2"], dropna=False).size()
In [12]:
df.groupby("key1").count()
In [13]:
for name, group in df.groupby("key1"):
    print(name)
    print(group)
In [14]:
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2))
    print(group)
In [15]:
pieces = {name: group for name, group in df.groupby("key1")}
pieces["b"]
In [16]:
grouped = df.groupby({"key1": "key", "key2": "key",
                      "data1": "data", "data2": "data"}, axis="columns")
In [17]:
for group_key, group_values in grouped:
    print(group_key)
    print(group_values)
In [18]:
df.groupby(["key1", "key2"])[["data2"]].mean()
In [19]:
s_grouped = df.groupby(["key1", "key2"])["data2"]
s_grouped
s_grouped.mean()
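This indexing is syntactic sugar for grouping the selected column's Series directly; an equivalent form using the same df:

# Equivalent to the indexed GroupBy above: group the Series itself
df["data2"].groupby([df["key1"], df["key2"]]).mean()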
In [20]:
people = pd.DataFrame(np.random.standard_normal((5, 5)),
                      columns=["a", "b", "c", "d", "e"],
                      index=["Joe", "Steve", "Wanda", "Jill", "Trey"])
people.iloc[2:3, [1, 2]] = np.nan  # Add a few NA values
people
In [21]:
mapping = {"a": "red", "b": "red", "c": "blue",
           "d": "blue", "e": "red", "f": "orange"}
In [22]:
by_column = people.groupby(mapping, axis="columns")
by_column.sum()
In [23]:
map_series = pd.Series(mapping)
map_series
people.groupby(map_series, axis="columns").count()
In [24]:
people.groupby(len).sum()
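Functions passed as group keys are called once per index label, so grouping by len is equivalent to building the key array from the index explicitly; a sketch:

# Same grouping, with the key computed explicitly from the index labels
people.groupby(people.index.str.len()).sum()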
In [25]:
key_list = ["one", "one", "one", "two", "two"]
people.groupby([len, key_list]).min()
In [26]:
columns = pd.MultiIndex.from_arrays([["US", "US", "US", "JP", "JP"],
                                     [1, 3, 5, 1, 3]],
                                    names=["cty", "tenor"])
hier_df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
hier_df
In [27]:
hier_df.groupby(level="cty", axis="columns").count()
In [28]:
df
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)
In [29]:
def peak_to_peak(arr):
    return arr.max() - arr.min()

grouped.agg(peak_to_peak)
In [31]:
tips = pd.read_csv("examples/tips.csv")
tips.head()
In [32]:
tips["tip_pct"] = tips["tip"] / tips["total_bill"] tips.head()
In [33]:
grouped = tips.groupby(["day", "smoker"])
In [34]:
grouped_pct = grouped["tip_pct"]
grouped_pct.agg("mean")
In [35]:
grouped_pct.agg(["mean", "std", peak_to_peak])
In [36]:
grouped_pct.agg([("average", "mean"), ("stdev", np.std)])
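A sketch of the equivalent named-aggregation form, supported in pandas 0.25 and later, where keyword names become the output column names:

# Keyword (named) aggregation; output columns take the keyword names
grouped_pct.agg(average="mean", stdev="std")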
In [37]:
functions = ["count", "mean", "max"]
result = grouped[["tip_pct", "total_bill"]].agg(functions)
result
In [39]:
ftuples = [("Average", "mean"), ("Variance", np.var)]
grouped[["tip_pct", "total_bill"]].agg(ftuples)
In [40]:
grouped.agg({"tip" : np.max, "size" : "sum"}) grouped.agg({"tip_pct" : ["min", "max", "mean", "std"], "size" : "sum"})
In [41]:
grouped = tips.groupby(["day", "smoker"], as_index=False)
grouped.mean(numeric_only=True)
In [42]:
def top(df, n=5, column="tip_pct"):
    return df.sort_values(column, ascending=False)[:n]

top(tips, n=6)
In [43]:
tips.groupby("smoker").apply(top)
In [44]:
tips.groupby(["smoker", "day"]).apply(top, n=1, column="total_bill")
In [45]:
result = tips.groupby("smoker")["tip_pct"].describe()
result
result.unstack("smoker")
In [46]:
tips.groupby("smoker", group_keys=False).apply(top)
In [47]:
frame = pd.DataFrame({"data1": np.random.standard_normal(1000),
                      "data2": np.random.standard_normal(1000)})
frame.head()
quartiles = pd.cut(frame["data1"], 4)
quartiles.head(10)
In [48]:
def get_stats(group):
    return pd.DataFrame(
        {"min": group.min(), "max": group.max(),
         "count": group.count(), "mean": group.mean()}
    )

grouped = frame.groupby(quartiles)
grouped.apply(get_stats)
In [49]:
grouped.agg(["min", "max", "count", "mean"])
In [50]:
quartiles_samp = pd.qcut(frame["data1"], 4, labels=False)
quartiles_samp.head()
grouped = frame.groupby(quartiles_samp)
grouped.apply(get_stats)
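pd.qcut also accepts explicit labels in place of integer codes or interval objects; a small sketch with hypothetical quartile names q1 through q4:

# Quartile labels chosen for illustration
labeled = pd.qcut(frame["data1"], 4, labels=["q1", "q2", "q3", "q4"])
frame.groupby(labeled).size()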
In [51]:
s = pd.Series(np.random.standard_normal(6))
s[::2] = np.nan
s
s.fillna(s.mean())
In [52]:
states = ["Ohio", "New York", "Vermont", "Florida", "Oregon", "Nevada", "California", "Idaho"] group_key = ["East", "East", "East", "East", "West", "West", "West", "West"] data = pd.Series(np.random.standard_normal(8), index=states) data
In [53]:
data[["Vermont", "Nevada", "Idaho"]] = np.nan data data.groupby(group_key).size() data.groupby(group_key).count() data.groupby(group_key).mean()
In [54]:
def fill_mean(group):
    return group.fillna(group.mean())

data.groupby(group_key).apply(fill_mean)
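A transform-based alternative, which broadcasts each group's mean back to the original index and avoids apply; a minimal sketch:

# transform("mean") returns group means aligned to data's original index
data.fillna(data.groupby(group_key).transform("mean"))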
In [55]:
fill_values = {"East": 0.5, "West": -1}

def fill_func(group):
    return group.fillna(fill_values[group.name])

data.groupby(group_key).apply(fill_func)
In [56]:
suits = ["H", "S", "C", "D"]  # Hearts, Spades, Clubs, Diamonds
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ["A"] + list(range(2, 11)) + ["J", "K", "Q"]
cards = []
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)

deck = pd.Series(card_val, index=cards)
In [58]:
def draw(deck, n=5):
    return deck.sample(n)

draw(deck)
In [59]:
def get_suit(card):
    # last letter is suit
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)
In [60]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)
In [61]:
df = pd.DataFrame({"category": ["a", "a", "a", "a", "b", "b", "b", "b"], "data": np.random.standard_normal(8), "weights": np.random.uniform(size=8)}) df
In [62]:
grouped = df.groupby("category") def get_wavg(group): return np.average(group["data"], weights=group["weights"]) grouped.apply(get_wavg)
In [63]:
close_px = pd.read_csv("examples/stock_px.csv", parse_dates=True,
                       index_col=0)
close_px.info()
close_px.tail(4)
In [64]:
def spx_corr(group):
    return group.corrwith(group["SPX"])
In [65]:
rets = close_px.pct_change().dropna()
In [66]:
def get_year(x):
    return x.year

by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
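Because close_px was read with parse_dates and index_col=0, rets has a DatetimeIndex, so an equivalent grouping can use the index's year attribute directly; a sketch:

# Group on the DatetimeIndex's year rather than a helper function
rets.groupby(rets.index.year).apply(spx_corr)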
In [67]:
def corr_aapl_msft(group):
    return group["AAPL"].corr(group["MSFT"])

by_year.apply(corr_aapl_msft)
In [68]:
import statsmodels.api as sm

def regress(data, yvar=None, xvars=None):
    Y = data[yvar]
    X = data[xvars]
    X["intercept"] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
In [69]:
by_year.apply(regress, yvar="AAPL", xvars=["SPX"])
In [70]:
df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,
                   'value': np.arange(12.)})
df
In [71]:
g = df.groupby('key')['value']
g.mean()
In [72]:
def get_mean(group):
    return group.mean()

g.transform(get_mean)
In [74]:
def times_two(group):
    return group * 2

g.transform(times_two)
In [75]:
def get_ranks(group):
    return group.rank(ascending=False)

g.transform(get_ranks)
In [76]:
def normalize(x):
    return (x - x.mean()) / x.std()
In [77]:
g.transform(normalize)
g.apply(normalize)
In [78]:
g.transform('mean')
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized
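As a sanity check, the unwrapped arithmetic should agree with the groupwise normalize from In [77] up to floating-point error; a quick sketch:

# Verify the two normalizations agree numerically
np.allclose(g.transform(normalize), normalized)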
In [79]:
tips.head()
tips.pivot_table(index=["day", "smoker"],
                 values=["size", "tip", "tip_pct", "total_bill"])
In [80]:
tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"])
In [81]:
tips.pivot_table(index=["time", "day"], columns="smoker",
                 values=["tip_pct", "size"], margins=True)
In [82]:
tips.pivot_table(index=["time", "smoker"], columns="day", values="tip_pct", aggfunc=len, margins=True)
In [83]:
tips.pivot_table(index=["time", "size", "smoker"], columns="day", values="tip_pct", fill_value=0)
In [84]:
from io import StringIO
data = """Sample  Nationality  Handedness
1   USA  Right-handed
2   Japan    Left-handed
3   USA  Right-handed
4   Japan    Right-handed
5   Japan    Left-handed
6   Japan    Right-handed
7   USA  Right-handed
8   USA  Left-handed
9   Japan    Right-handed
10  USA  Right-handed"""
data = pd.read_table(StringIO(data), sep=r"\s+")
In [86]:
pd.crosstab(data["Nationality"], data["Handedness"], margins=True)
In [87]:
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True)
In [89]:
pd.options.display.max_rows = PREVIOUS_MAX_ROWS