I am testing the dev version of dplyr and have noticed performance regressions when using `summarize()` with a large number of groups. With many groups, calling `n()` makes the runtime roughly 400 times longer, whereas calling `max()` makes it roughly 10 times longer.
Performance on 0.8.5
library(dplyr, warn.conflicts = FALSE)
packageVersion("dplyr")
#> [1] '0.8.5'

set.seed(42)
many_grps <- data.frame(grp = sample(1:1e5, 1e6, replace = TRUE),
                        val = runif(1e6)) %>%
  group_by(grp)
n_groups(many_grps)
#> [1] 99997

set.seed(42)
few_grps <- data.frame(grp = sample(1:100, 1e6, replace = TRUE),
                       val = runif(1e6)) %>%
  group_by(grp)
n_groups(few_grps)
#> [1] 100

microbenchmark::microbenchmark(summarize(many_grps, n = n()),
                               summarize(many_grps, m = max(val)),
                               summarize(few_grps, n = n()),
                               summarize(few_grps, m = max(val)),
                               times = 5, unit = 'ms')
#> Unit: milliseconds
#>                                expr       min        lq      mean    median
#>       summarize(many_grps, n = n())  2.474665  2.531869  2.786606  2.743778
#>  summarize(many_grps, m = max(val)) 17.693114 19.297248 22.774355 20.640482
#>        summarize(few_grps, n = n())  0.144234  0.154476  0.182271  0.175776
#>   summarize(few_grps, m = max(val))  8.792012 10.393226 11.908963 10.482098
#>         uq       max neval cld
#>   3.037315  3.145401     5 a
#>  27.791042 28.449888     5   c
#>   0.190168  0.246701     5 a
#>  14.835186 15.042294     5  b
Created on 2020-03-21 by the reprex package (v0.3.0)
Performance on current dev version
library(dplyr, warn.conflicts = FALSE)
packageVersion("dplyr")
#> [1] '0.8.99.9002'

set.seed(42)
many_grps <- data.frame(grp = sample(1:1e5, 1e6, replace = TRUE),
                        val = runif(1e6)) %>%
  group_by(grp)
n_groups(many_grps)
#> [1] 99997

set.seed(42)
few_grps <- data.frame(grp = sample(1:100, 1e6, replace = TRUE),
                       val = runif(1e6)) %>%
  group_by(grp)
n_groups(few_grps)
#> [1] 100

microbenchmark::microbenchmark(summarize(many_grps, n = n()),
                               summarize(many_grps, m = max(val)),
                               summarize(few_grps, n = n()),
                               summarize(few_grps, m = max(val)),
                               times = 5, unit = 'ms')
#> Unit: milliseconds
#>                                expr         min          lq        mean
#>       summarize(many_grps, n = n()) 1129.489705 1170.905902 1177.632328
#>  summarize(many_grps, m = max(val))  164.942559  180.437278  212.037870
#>        summarize(few_grps, n = n())    2.286928    2.307006    2.408776
#>   summarize(few_grps, m = max(val))   13.396918   14.531448   15.026723
#>       median         uq        max neval cld
#>  1177.479608 1188.72916 1221.55727     5   c
#>   205.139373  218.82458  290.84556     5  b
#>     2.324175    2.40646    2.71931     5 a
#>    14.850954   15.62192   16.73238     5 a
Created on 2020-03-21 by the reprex package (v0.3.0)
tungttnguyen, foundinblank, chrislim5, MajoroMask and seabbs
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4