Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
- Type conversion support in GForce expressions (e.g., `sum(as.numeric(x))` will use GForce, saving the need to coerce `x` in a setup step) [#2934](https://github.com/Rdatatable/data.table/issues/2934)
- Arithmetic operation support in GForce (e.g., `max(x) - min(x)` will use GForce on both `max(x)` and `min(x)`, saving the need to do the subtraction in a follow-up step) [#3815](https://github.com/Rdatatable/data.table/issues/3815)

4. Joins (`y[x, on=]` or `merge(x, y, ...)`) now display a join summary when run verbosely (`verbose=TRUE` in `[`, or `options(datatable.verbose=TRUE)`), showing the row counts of `x` and `i`, the number of matched rows, the join columns used, and the number of result rows, [#4677](https://github.com/Rdatatable/data.table/issues/4677). Thanks @thorek1 and @grantmcdermott for the suggestion and @ben-schwen for the implementation.

### BUG FIXES

1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix.
Expand Down
37 changes: 36 additions & 1 deletion R/bmerge.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ coerce_col = function(dt, col, from_type, to_type, from_name, to_name, from_deta
set(dt, j=col, value=cast_with_attrs(dt[[col]], cast_fun))
}

bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose)
bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbose, notjoin=FALSE)
{
if (roll != 0.0 && length(icols)) {
last_x_idx = tail(xcols, 1L)
Expand Down Expand Up @@ -224,6 +224,41 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
if (verbose) {catf("bmerge done in %s\n",timetaken(last.started.at)); flush.console()}
# TO DO: xo could be moved inside Cbmerge

# join statistics
# Verbose-only summary of the merge: sizes of both tables, how many rows of i found a
# match, the join condition(s), and the expected number of rows in the result.
# Only reads x, i, ans$starts, ans$lens, ops, icols, xcols, nomatch and notjoin.
if (verbose) {
  nrow_x = nrow(x)
  nrow_i = nrow(i)
  # nomatch=NULL or nomatch=0L is treated as an inner join
  inner_join = is.null(nomatch) || identical(nomatch, 0L)
  # a row of i counts as matched when its start is non-zero (inner join) or non-NA (otherwise)
  idx = if (inner_join) ans$starts != 0L else !is.na(ans$starts)
  matched_i = sum(idx)

  if (notjoin) {
    # Anti-join: result is the rows of x that were NOT matched by any row of i.
    # Every matched row of i covers the range starts .. starts+lens-1, so count the
    # union of those ranges. Counting distinct starts alone under-counts the matched
    # rows of x whenever a single start spans several rows (lens > 1).
    # NOTE(review): assumes starts/lens describe contiguous ranges in one common index
    # space, as the original unique(starts) logic already did -- confirm for non-equi joins.
    hit = idx & ans$lens > 0L
    if (any(hit)) {
      lo = ans$starts[hit]
      hi = lo + ans$lens[hit] - 1L
      ord = order(lo)
      lo = lo[ord]
      hi = hi[ord]
      # furthest position already covered by the ranges sorted before each one
      covered_to = c(0L, cummax(hi)[-length(hi)])
      # each range only contributes the part lying beyond what is already covered
      matched_x = sum(pmax(hi - pmax(lo, covered_to + 1L) + 1L, 0L))
    } else {
      matched_x = 0L
    }
    result_rows = nrow_x - matched_x
  } else if (inner_join) {
    # Inner join: sum lengths for matched rows only
    result_rows = if (matched_i > 0L) sum(ans$lens[idx]) else 0L
  } else {
    # Left join: sum all lengths (includes NAs for unmatched)
    result_rows = sum(ans$lens)
  }

  # One operator per join column. rep() (not strrep()) is needed for the fallback:
  # strrep("==", n) would paste n copies together into the single string "====..."
  op_symbols = if (length(ops)) c("==", "<=", "<", ">=", ">")[ops] else rep("==", length(icols))
  join_str = toString(sprintf("%s %s %s", names(x)[xcols], op_symbols, names(i)[icols]))
  # numbers are right-aligned to the widest entry, the join description included
  num_width = max(vapply_1i(list(nrow_x, nrow_i, matched_i, result_rows, join_str), nchar))
  # NOTE(review): 14L is meant to equal the width of the label prefix used in the
  # format strings below -- confirm whenever a label changes
  separator = strrep("-", 14L + num_width)

  catf("Join summary:\n")
  catf(" rows in x: %*d\n", num_width, nrow_x)
  catf(" rows in i: %*d\n", num_width, nrow_i)
  catf(" matched rows: %*d\n", num_width, matched_i)
  catf(" join columns: %s\n", join_str)
  catf(" %s\n", separator)
  catf(" result rows: %*d\n", num_width, result_rows)
  flush.console()
}

ans$xo = xo # for further use by [.data.table
ans
}
2 changes: 1 addition & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,7 @@ replace_dot_alias = function(e) {
setattr(i, 'sorted', names(i)) # since 'x' has key set, this'll always be sorted
}
i = .shallow(i, retain.key = TRUE)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose, notjoin=notjoin)
if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
Expand Down
19 changes: 19 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -21509,3 +21509,22 @@ setdroplevels(x)
setdroplevels(y)
test(2364.2, levels(x$a), levels(y$a))
rm(x, y)

# join statistics #4677
# Every test checks the row count of the join result together with the corresponding
# line of the verbose "Join summary" output.
x = data.table(A = seq_len(5L), B = seq.int(6L, 10L))
y = data.table(A = c(1L, 1L, 4L), C = c("A", "B", "D"))
# default join, one result row per row of y
test(2365.1,
     nrow(x[y, on = "A", verbose=TRUE]),
     3L,
     output="result rows: \\s+3\n")
# default join the other way round: unmatched rows of x are kept
test(2365.2,
     nrow(y[x, on = "A", verbose=TRUE]),
     6L,
     output="result rows: \\s+6\n")
# nomatch=NULL drops the unmatched rows
test(2365.3,
     nrow(y[x, on = "A", nomatch=NULL, verbose=TRUE]),
     3L,
     output="result rows: \\s+3\n")
# not-join
test(2365.4,
     nrow(x[!y, on = "A", verbose=TRUE]),
     3L,
     output="result rows: \\s+3\n")
# self-join on a duplicated key value
test(2365.5,
     nrow(y[y, on = "A", allow.cartesian=TRUE, verbose=TRUE]),
     5L,
     output="result rows: \\s+5\n")
# merge() with verbosity switched on through the option
test(2365.6, options=c(datatable.verbose=TRUE),
     nrow(merge(x, y, by="A")),
     3L,
     output="result rows: \\s+3\n")
# non-equi join: the operators are shown in the join columns line
x = data.table(
  id = rep(c("A", "B"), c(3L, 2L)),
  date = as.IDate(c("2010-01-01", "2012-01-01", "2014-01-01", "2010-01-01", "2012-01-01"))
)
y = data.table(id = c("A", "B"), date = as.IDate(rep("2013-01-01", 2L)))
test(2365.7,
     nrow(x[y, on = .(id, date <= date), verbose=TRUE]),
     4L,
     output="join columns: id == id, date <= date.*result rows: \\s+4\n")
# zero-row x
x = data.table(A = integer())
y = data.table(A = seq_len(3L))
test(2365.8,
     nrow(x[y, on="A", verbose=TRUE]),
     3L,
     output="result rows: \\s+3\n")
# no overlapping key values at all
x = data.table(A = seq_len(3L))
y = data.table(A = seq.int(4L, 6L))
test(2365.9,
     nrow(x[y, on="A", nomatch=NULL, verbose=TRUE]),
     0L,
     output="matched rows: \\s+0\n")
Loading