diff --git a/NEWS.md b/NEWS.md index 24a078836..c09d65ffb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,8 @@ - Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934) - Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815) +4. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display join statistics with `options(datatable.verbose=TRUE)`, showing row counts, matched rows, and join columns used, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. 
diff --git a/R/bmerge.R b/R/bmerge.R
index 3c903ae35..00113adc3 100644
--- a/R/bmerge.R
+++ b/R/bmerge.R
@@ -25,7 +25,7 @@ coerce_col = function(dt, col, from_type, to_type, from_name, to_name, from_deta
   set(dt, j=col, value=cast_with_attrs(dt[[col]], cast_fun))
 }
 
-bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
+bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose, notjoin=FALSE)
 {
   if (roll != 0.0 && length(icols)) {
     last_x_idx = tail(xcols, 1L)
@@ -224,6 +224,56 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
   if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
   # TO DO: xo could be moved inside Cbmerge
 
+  # join statistics: under verbose, print the sizes of both tables, how many rows of i found a
+  # match, the join condition, and the number of rows the join is expected to return
+  if (verbose) {
+    nrow_x = nrow(x)
+    nrow_i = nrow(i)
+    inner_join = is.null(nomatch) || identical(nomatch, 0L)
+    # a row of i counts as matched when its start is non-zero (inner join) or non-NA (otherwise)
+    idx = if (inner_join) ans$starts != 0L else !is.na(ans$starts)
+    matched_i = sum(idx)
+
+    if (notjoin) {
+      # Anti-join: count rows in x that were NOT matched.
+      # A matched row of i covers lens[k] rows of x beginning at starts[k], so counting distinct
+      # starts undercounts whenever x holds duplicated join values. Mark where each run opens and
+      # closes and take the running total: positions with a positive total are covered by at
+      # least one run, which also counts repeated or overlapping runs only once.
+      # NOTE(review): assumes starts/lens describe contiguous runs within 1..nrow(x) -- confirm against Cbmerge
+      if (matched_i > 0L) {
+        run_open = ans$starts[idx]
+        run_close = run_open + ans$lens[idx] # first position after each run
+        covered = cumsum(tabulate(run_open, nrow_x + 1L) - tabulate(run_close, nrow_x + 1L)) > 0L
+        result_rows = nrow_x - sum(covered)
+      } else {
+        result_rows = nrow_x
+      }
+    } else if (inner_join) {
+      # Inner join: sum lengths for matched rows only
+      result_rows = if (matched_i > 0L) sum(ans$lens[idx]) else 0L
+    } else {
+      # Left join: sum all lengths (includes NAs for unmatched)
+      result_rows = sum(ans$lens)
+    }
+
+    # rep(), not strrep(): one "==" per join column is needed, not a single concatenated string
+    op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else rep("==", length(icols))
+    join_str = toString(sprintf("%s %s %s", names(x)[xcols], op_symbols, names(i)[icols]))
+    num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
+    # nchar("rows in x: ") == 14L
+    separator = strrep("-", 14L + num_width)
+
+    catf("Join summary:\n")
+    catf(" rows in x: %*d\n", num_width, nrow_x)
+    catf(" rows in i: %*d\n", num_width, nrow_i)
+    catf(" matched rows: %*d\n", num_width, matched_i)
+    catf(" join columns: %s\n", join_str)
+    catf(" %s\n", separator)
+    catf(" result rows: %*d\n", num_width, result_rows)
+    flush.console()
+  }
+
   ans$xo = xo # for further use by [.data.table
   ans
 }
diff --git a/R/data.table.R b/R/data.table.R
index 85d623d39..ea2ae8526 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -892,7 +892,7 @@ replace_dot_alias = function(e) {
         setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
       }
       i = .shallow(i, retain.key = TRUE)
-      ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
+      ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose, notjoin=notjoin)
       if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
       xo = ans$xo ## to make it available for further use.
       # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 1c7ab6837..cb192606d 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -21509,3 +21509,24 @@ setdroplevels(x)
 setdroplevels(y)
 test(2364.2, levels(x$a), levels(y$a))
 rm(x, y)
+
+# join statistics #4677
+x = data.table(A = 1:5, B = 6:10)
+y = data.table(A = c(1L, 1L, 4L), C = LETTERS[c(1L, 2L, 4L)])
+test(2365.1, nrow(x[y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+test(2365.2, nrow(y[x, on = "A", verbose=TRUE]), 6L, output="result rows: \\s+6\n")
+test(2365.3, nrow(y[x, on = "A", nomatch=NULL, verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+test(2365.4, nrow(x[!y, on = "A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+# anti-join where x has duplicated join values: every duplicate must be counted as matched
+test(2365.45, nrow(data.table(A = c(1L, 1L, 2L))[!data.table(A = 1L), on = "A", verbose=TRUE]), 1L, output="result rows: \\s+1\n")
+test(2365.5, nrow(y[y, on = "A", allow.cartesian=TRUE, verbose=TRUE]), 5L, output="result rows: \\s+5\n")
+test(2365.6, options=c(datatable.verbose=TRUE), nrow(merge(x, y, by="A")), 3L, output="result rows: \\s+3\n")
+x = data.table(id = c("A", "A", "A", "B", "B"), date = as.IDate(c("2010-01-01", "2012-01-01", "2014-01-01", "2010-01-01", "2012-01-01")))
+y = data.table(id = c("A", "B"), date = as.IDate(c("2013-01-01", "2013-01-01")))
+test(2365.7, nrow(x[y, on = .(id, date <= date), verbose=TRUE]), 4L, output="join columns: id == id, date <= date.*result rows: \\s+4\n")
+x = data.table(A = integer(0))
+y = data.table(A = 1:3)
+test(2365.8, nrow(x[y, on="A", verbose=TRUE]), 3L, output="result rows: \\s+3\n")
+x = data.table(A = 1:3)
+y = data.table(A = 4:6)
+test(2365.9, nrow(x[y, on="A", nomatch=NULL, verbose=TRUE]), 0L, output="matched rows: \\s+0\n")