From 3789d0f46821026142659f51bec9a889f07809d6 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 25 Jun 2026 10:51:56 -0700 Subject: [PATCH 1/9] gitattributes: document how an external diff driver interacts with diff features The "Defining an external diff driver" section explains how to configure diff..command but not how the driver relates to the rest of Git's diff machinery. In particular, the command only replaces the textual patch: word diff, function context, color, and the like cannot apply to its output, while the summary formats, blame, and git log -L do not run it at all and keep using the builtin diff. Spell this out so the scope of an external diff driver is clear. Signed-off-by: Michael Montalbo --- Documentation/gitattributes.adoc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index bd76167a45eb71..2c4fbfd7f1495c 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -784,6 +784,16 @@ with the above configuration, i.e. `j-c-diff`, with 7 parameters, just like `GIT_EXTERNAL_DIFF` program is called. See linkgit:git[1] for details. +An external diff driver replaces the patch Git would otherwise +produce for the path: Git runs the command and shows its output in +place of its own. Output features that post-process Git's diff do +not apply to it; word diff, function context (`-W`), `--color-moved`, +and coloring all act on Git's builtin diff, not the driver's output. +The driver is consulted only when Git generates a textual patch. The +summary formats (`--stat`, `--numstat`, `--shortstat`, and +`--dirstat`), `git blame`, and `git log -L` do not run it and +continue to use Git's builtin diff. + If the program is able to ignore certain changes (similar to `git diff --ignore-space-change`), then also set the option `trustExitCode` to true. It is then expected to return exit code 1 if From a208cc85cb71ca854490130057855d63de965648 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:37:54 -0700 Subject: [PATCH 2/9] xdiff: support external hunks via xpparam_t Add two new xpparam_t fields (external_hunks, external_hunks_nr) that let callers supply pre-computed hunks. When set, xdl_diff() populates the changed[] arrays from these hunks instead of running the diff algorithm, then continues through compaction and emission as usual. Validate supplied hunks before use: reject out-of-bounds line numbers, overlapping or out-of-order hunks, negative counts, and misaligned unchanged runs. The run of unchanged lines between two hunks (and before the first and after the last) must be the same length on both sides; xdl_build_script() walks the two files in lockstep over unchanged lines, so a balanced total is not enough. On validation failure, fall back to the builtin diff algorithm; this re-runs xdl_prepare_env() since the first call may have dirtied the changed[] arrays. Skip trim_common_tail() in xdi_diff() when external hunks are present, since external hunks reference line numbers in the original content. Signed-off-by: Michael Montalbo --- xdiff-interface.c | 7 ++- xdiff/xdiff.h | 14 +++++ xdiff/xdiffi.c | 135 +++++++++++++++++++++++++++++++++++++++++++++- xdiff/xprepare.c | 10 ++++ xdiff/xprepare.h | 1 + 5 files changed, 164 insertions(+), 3 deletions(-) diff --git a/xdiff-interface.c b/xdiff-interface.c index db6938689f0a9e..1fa16af6681cf2 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -124,7 +124,12 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE) return -1; - if (!xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) + /* + * External hunks reference line numbers in the original content; + * trimming the tail would change line counts and invalidate them. + */ + if (!xpp->external_hunks && + !xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) trim_common_tail(&a, &b); return xdl_diff(&a, &b, xpp, xecfg, xecb); diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index dc370712e92860..dd4915fe16ff19 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -78,6 +78,16 @@ typedef struct s_mmbuffer { long size; } mmbuffer_t; +/* + * Hunk descriptor for externally computed diffs. + * Line numbers are 1-based; a start of 0 is accepted when + * count is 0 (empty file side, matching git diff output). + */ +struct xdl_hunk { + long old_start, old_count; + long new_start, new_count; +}; + typedef struct s_xpparam { unsigned long flags; @@ -88,6 +98,10 @@ typedef struct s_xpparam { /* See Documentation/diff-options.adoc. */ char **anchors; size_t anchors_nr; + + /* Externally computed hunks: bypass the diff algorithm. Owned by caller. */ + struct xdl_hunk *external_hunks; + size_t external_hunks_nr; } xpparam_t; typedef struct s_xdemitcb { diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index c5a892f91e00c0..7fe22557f59136 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -1085,16 +1085,147 @@ static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe, } } +/* + * Populate the changed[] arrays from externally supplied hunks, + * bypassing the diff algorithm. Validates that hunks are in order, + * non-overlapping, and within bounds. + * + * Returns 0 on success, -1 on validation failure. + */ +static int xdl_populate_hunks_from_external(xdfenv_t *xe, + struct xdl_hunk *hunks, + size_t nr_hunks) +{ + size_t i; + long j, prev_old_end = 0, prev_new_end = 0; + + /* + * xdl_prepare_env() may dirty changed[] via xdl_cleanup_records(). + * Clear them so only the external hunks are marked. + */ + xdl_clear_changed(&xe->xdf1); + xdl_clear_changed(&xe->xdf2); + + for (i = 0; i < nr_hunks; i++) { + struct xdl_hunk *h = &hunks[i]; + + if (h->old_count < 0 || h->new_count < 0) { + warning("diff process hunk %"PRIuMAX": " + "negative count (old=%ld, new=%ld)", + (uintmax_t)(i + 1), + h->old_count, h->new_count); + return -1; + } + if (h->old_start < 1 || h->new_start < 1) { + warning("diff process hunk %"PRIuMAX": " + "start must be >= 1 (old=%ld, new=%ld)", + (uintmax_t)(i + 1), + h->old_start, h->new_start); + return -1; + } + + /* + * Range must fit: start + count - 1 <= nrec, + * rewritten to avoid overflow. Same for both sides. + * + * When count is 0 (pure insert/delete) the check + * reduces to 0 > nrec - start + 1, which rejects + * start > nrec + 1 and allows start == nrec + 1 + * (the position after the last line). + */ + if (h->old_count > (long)xe->xdf1.nrec - h->old_start + 1) { + warning("diff process hunk %"PRIuMAX": " + "old range %ld+%ld exceeds %lu lines", + (uintmax_t)(i + 1), + h->old_start, h->old_count, + (unsigned long)xe->xdf1.nrec); + return -1; + } + if (h->new_count > (long)xe->xdf2.nrec - h->new_start + 1) { + warning("diff process hunk %"PRIuMAX": " + "new range %ld+%ld exceeds %lu lines", + (uintmax_t)(i + 1), + h->new_start, h->new_count, + (unsigned long)xe->xdf2.nrec); + return -1; + } + + /* Ordering: no overlap with previous hunk (adjacent is OK) */ + if (h->old_start < prev_old_end || + h->new_start < prev_new_end) { + warning("diff process hunk %"PRIuMAX": " + "overlaps with previous hunk", + (uintmax_t)(i + 1)); + return -1; + } + + /* + * Lockstep alignment: the run of unchanged lines between + * the previous hunk and this one must be the same length + * on both sides. xdl_build_script() walks the two files + * together over unchanged lines, so a mismatched gap + * desynchronizes it and yields a corrupt script even + * though the *total* unchanged counts may still match. + */ + if (h->old_start - prev_old_end != + h->new_start - prev_new_end) { + warning("diff process hunk %"PRIuMAX": " + "unchanged run before hunk differs between " + "sides (old %ld, new %ld)", + (uintmax_t)(i + 1), + h->old_start - prev_old_end, + h->new_start - prev_new_end); + return -1; + } + + for (j = 0; j < h->old_count; j++) + xe->xdf1.changed[h->old_start - 1 + j] = true; + for (j = 0; j < h->new_count; j++) + xe->xdf2.changed[h->new_start - 1 + j] = true; + + prev_old_end = h->old_start + h->old_count; + prev_new_end = h->new_start + h->new_count; + } + + /* + * The trailing run of unchanged lines after the last hunk must + * also match. Together with the per-hunk gap check above, this + * guarantees the two files stay synchronized for + * xdl_build_script()'s lockstep walk, and it subsumes the + * weaker "total unchanged counts match" invariant. + */ + if ((long)xe->xdf1.nrec - prev_old_end != + (long)xe->xdf2.nrec - prev_new_end) { + warning("diff process: unchanged line count mismatch " + "(old: %ld trailing, new: %ld trailing)", + (long)xe->xdf1.nrec - prev_old_end, + (long)xe->xdf2.nrec - prev_new_end); + return -1; + } + + return 0; +} + int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t const *xecfg, xdemitcb_t *ecb) { xdchange_t *xscr; xdfenv_t xe; emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff; - if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) { + if (xpp->external_hunks) { + if (xdl_prepare_env(mf1, mf2, xpp, &xe) < 0) + return -1; + if (xdl_populate_hunks_from_external(&xe, + xpp->external_hunks, + xpp->external_hunks_nr) == 0) + goto diff_done; + xdl_free_env(&xe); + } + if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) return -1; - } + +diff_done: if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 || xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 || xdl_build_script(&xe, &xscr) < 0) { diff --git a/xdiff/xprepare.c b/xdiff/xprepare.c index 11bada2608a7a4..f4ab93533286da 100644 --- a/xdiff/xprepare.c +++ b/xdiff/xprepare.c @@ -471,3 +471,13 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, return 0; } + +/* + * Reset the changed[] array so that no lines are marked as changed. + * Also clears the sentinel slots at changed[-1] and changed[nrec] + * that xdl_change_compact() relies on during backward scans. + */ +void xdl_clear_changed(xdfile_t *xdf) +{ + memset(xdf->changed - 1, 0, (xdf->nrec + 2) * sizeof(bool)); +} diff --git a/xdiff/xprepare.h b/xdiff/xprepare.h index 947d9fc1bb8cf9..0413baf07bcc90 100644 --- a/xdiff/xprepare.h +++ b/xdiff/xprepare.h @@ -28,6 +28,7 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdfenv_t *xe); void xdl_free_env(xdfenv_t *xe); +void xdl_clear_changed(xdfile_t *xdf); From 8ad73871ccc5c9e0c47c9614a95704c1ed1ce692 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 21 May 2026 02:37:59 -0700 Subject: [PATCH 3/9] userdiff: add diff..process config Add the process field to struct userdiff_driver and teach the config parser to populate it from diff..process. Signed-off-by: Michael Montalbo --- userdiff.c | 7 +++++++ userdiff.h | 2 ++ 2 files changed, 9 insertions(+) diff --git a/userdiff.c b/userdiff.c index b5412e6bc3ecd3..7547874aa2569d 100644 --- a/userdiff.c +++ b/userdiff.c @@ -509,6 +509,13 @@ int userdiff_config(const char *k, const char *v) drv->algorithm = drv->algorithm_owned; return ret; } + if (!strcmp(type, "process")) { + int ret; + FREE_AND_NULL(drv->process_owned); + ret = git_config_string(&drv->process_owned, k, v); + drv->process = drv->process_owned; + return ret; + } return 0; } diff --git a/userdiff.h b/userdiff.h index 827361b0bc9569..51c26e0d4190e5 100644 --- a/userdiff.h +++ b/userdiff.h @@ -31,6 +31,8 @@ struct userdiff_driver { char *textconv_owned; struct notes_cache *textconv_cache; int textconv_want_cache; + const char *process; + char *process_owned; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, From 00f1e68656449a02d1e2988b402610529ff7957e Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 28 May 2026 16:21:32 -0700 Subject: [PATCH 4/9] sub-process: separate process lifecycle from hashmap management subprocess_start() and subprocess_stop() couple two concerns: managing a child process (setup, handshake, teardown) and managing a hashmap that indexes running processes by command string. The hashmap suits callers like convert.c where many files may share one filter process looked up by name, but callers that manage process lifetime through their own data structures do not need it. Extract subprocess_start_command() and subprocess_stop_command() so callers can reuse the child process setup and handshake machinery without maintaining a hashmap. subprocess_start() and subprocess_stop() become thin wrappers that add hashmap operations on top. Signed-off-by: Michael Montalbo --- sub-process.c | 28 +++++++++++++++++++++++----- sub-process.h | 9 ++++++++- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/sub-process.c b/sub-process.c index 2d5c965169727b..3cef42b0880e3f 100644 --- a/sub-process.c +++ b/sub-process.c @@ -49,7 +49,7 @@ int subprocess_read_status(int fd, struct strbuf *status) return (len < 0) ? len : 0; } -void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) +void subprocess_stop_command(struct subprocess_entry *entry) { if (!entry) return; @@ -57,7 +57,14 @@ void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) entry->process.clean_on_exit = 0; kill(entry->process.pid, SIGTERM); finish_command(&entry->process); +} +void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry) +{ + if (!entry) + return; + + subprocess_stop_command(entry); hashmap_remove(hashmap, &entry->ent, NULL); } @@ -72,7 +79,7 @@ static void subprocess_exit_handler(struct child_process *process) finish_command(process); } -int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, +int subprocess_start_command(struct subprocess_entry *entry, const char *cmd, subprocess_start_fn startfn) { int err; @@ -96,15 +103,26 @@ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, co return err; } - hashmap_entry_init(&entry->ent, strhash(cmd)); - err = startfn(entry); if (err) { error("initialization for subprocess '%s' failed", cmd); - subprocess_stop(hashmap, entry); + subprocess_stop_command(entry); return err; } + return 0; +} + +int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, + subprocess_start_fn startfn) +{ + int err; + + err = subprocess_start_command(entry, cmd, startfn); + if (err) + return err; + + hashmap_entry_init(&entry->ent, strhash(cmd)); hashmap_add(hashmap, &entry->ent); return 0; } diff --git a/sub-process.h b/sub-process.h index bfc3959a1b4894..45f1b8e5e3212f 100644 --- a/sub-process.h +++ b/sub-process.h @@ -52,10 +52,17 @@ int cmd2process_cmp(const void *unused_cmp_data, */ typedef int(*subprocess_start_fn)(struct subprocess_entry *entry); -/* Start a subprocess and add it to the subprocess hashmap. */ +/* Start a subprocess and run the startfn (typically handshake). */ +int subprocess_start_command(struct subprocess_entry *entry, const char *cmd, + subprocess_start_fn startfn); + +/* Start a subprocess, run startfn, and add it to the subprocess hashmap. */ int subprocess_start(struct hashmap *hashmap, struct subprocess_entry *entry, const char *cmd, subprocess_start_fn startfn); +/* Kill a subprocess. */ +void subprocess_stop_command(struct subprocess_entry *entry); + /* Kill a subprocess and remove it from the subprocess hashmap. */ void subprocess_stop(struct hashmap *hashmap, struct subprocess_entry *entry); From bf0f2d27da04be69d38918b09bf9d047cd7094f4 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 28 May 2026 15:08:38 -0700 Subject: [PATCH 5/9] diff: add long-running diff process via diff..process Add support for external diff processes that communicate via the long-running process protocol (pkt-line over stdin/stdout). A diff process is configured per userdiff driver: [diff "cdiff"] process = /path/to/diff-tool The tool provides custom line-matching: it receives file pairs and returns hunks that reference line numbers in the content. When textconv is also configured, the tool receives the textconv-transformed content. The tool controls which lines are marked as changed while the display shows the file content. Patch output features (word diff, function context, color) work normally. A new "Which features consult the diff process" documentation section lays out which features use the tool's hunks, which compute independently, and why; the summary formats such as --stat still use the builtin diff for now. The handshake negotiates version=1 and capability=hunks. Per-file requests send command=hunks, pathname, and both file contents as packetized data. The tool responds with hunk lines and a status packet (success, error, or abort). On error, Git warns and falls back to the builtin diff algorithm for that file. On abort, Git silently falls back for the current file and stops sending further requests to the tool for the remainder of the session. When the tool returns no hunks followed by status=success, Git treats the file as having no changes and produces no diff output. This also means --exit-code reports no changes for that file. The subprocess is stored on the userdiff_driver struct and launched on first use. If the process fails to start, the handshake fails, or a communication error occurs mid-stream, the failure is cached on the driver to avoid retrying and re-warning on every subsequent file. Git falls back to the builtin diff (rather than consulting the tool) when an option the tool cannot honor is in effect: the whitespace-ignoring flags, -I, and --anchored. A change that only adds or removes the trailing newline is likewise not expressible as hunks, so it too uses the builtin diff. The hunk parser ignores unknown trailing fields on a hunk line for response forward-compatibility. Hunk accumulation is bounded by the combined line count of the two files, so a misbehaving tool that floods hunk lines cannot grow memory without bound before validation runs. diff_process_fill_hunks() is the sole public entry point. It handles driver lookup, flag checks, subprocess management, and error reporting, returning an enum that lets callers distinguish "hunks populated" from "files equivalent" from "not applicable" from "tool failure." Helped-by: Johannes Schindelin Signed-off-by: Michael Montalbo --- Documentation/config/diff.adoc | 5 + Documentation/gitattributes.adoc | 228 ++++++++++++ Makefile | 2 + diff-process.c | 350 ++++++++++++++++++ diff-process.h | 39 ++ diff.c | 13 + diff.h | 3 + meson.build | 1 + t/helper/meson.build | 1 + t/helper/test-diff-process-backend.c | 341 +++++++++++++++++ t/helper/test-tool.c | 1 + t/helper/test-tool.h | 1 + t/meson.build | 1 + t/t4080-diff-process.sh | 529 +++++++++++++++++++++++++++ userdiff.h | 3 + 15 files changed, 1518 insertions(+) create mode 100644 diff-process.c create mode 100644 diff-process.h create mode 100644 t/helper/test-diff-process-backend.c create mode 100755 t/t4080-diff-process.sh diff --git a/Documentation/config/diff.adoc b/Documentation/config/diff.adoc index 1135a62a0ad3de..ac0635bb3bee1c 100644 --- a/Documentation/config/diff.adoc +++ b/Documentation/config/diff.adoc @@ -218,6 +218,11 @@ endif::git-diff[] Set this option to `true` to make the diff driver cache the text conversion outputs. See linkgit:gitattributes[5] for details. +`diff..process`:: + The command to run as a long-running diff process that + provides hunks to Git's diff pipeline. + See linkgit:gitattributes[5] for details. + `diff.indentHeuristic`:: Set this option to `false` to disable the default heuristics that shift diff hunk boundaries to make patches easier to read. diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index 2c4fbfd7f1495c..568b5583a4d280 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -831,6 +831,234 @@ NOTE: If `diff..command` is defined for path with the (see above), and adding `diff..algorithm` has no effect, as the algorithm is not passed to the external diff driver. +Using an external diff process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If `diff..process` is defined, Git sends the old and new file +content to an external tool and receives back a list of changed +regions (pairs of line ranges in the old and new file). Git uses +these instead of its builtin diff algorithm, but still controls +all output formatting, so features like word diff, function context, +color, and blame work normally. This is achieved by using the +long-running process protocol (described in +Documentation/technical/long-running-process-protocol.adoc). +Unlike `diff..command`, which replaces Git's output entirely, +the diff process feeds results back into the standard pipeline. If +both are configured for a path, `diff..command` takes precedence +for the output it produces: since it replaces the diff, the process is +not consulted there. + +First, in `.gitattributes`, assign the `diff` attribute for paths. + +------------------------ +*.c diff=cdiff +------------------------ + +Then, define a "diff..process" configuration to specify +the diff process command. + +---------------------------------------------------------------- +[diff "cdiff"] + process = /path/to/diff-process-tool +---------------------------------------------------------------- + +When Git encounters the first file that needs to be diffed, it starts +the process and performs the handshake. In the handshake, the welcome +message sent by Git is "git-diff-client", only version 1 is supported, +and the supported capability is "hunks" (the changed regions +described below). + +For each file, Git sends a list of "key=value" pairs terminated with +a flush packet, followed by the old and new file content as packetized +data, each terminated with a flush packet. The pathname is relative +to the repository root. When `diff..textconv` is also set, +the tool receives the textconv-transformed content rather than the +raw blob. Git does not send binary files to the diff process. + +----------------------- +packet: git> command=hunks +packet: git> pathname=path/file.c +packet: git> 0000 +packet: git> OLD_CONTENT +packet: git> 0000 +packet: git> NEW_CONTENT +packet: git> 0000 +----------------------- + +The tool is expected to respond with zero or more hunk lines, +a flush packet, and a status packet terminated with a flush packet. +Each hunk line has the form: + + `hunk ` + +where `` and `` identify a range of lines in +the old file, and `` and `` identify the +replacement range in the new file. Start values are 1-based and +counts are non-negative. Ranges must not extend beyond the end of +the file. For example, `hunk 3 2 3 4` means that 2 lines starting +at line 3 in the old file were replaced by 4 lines starting at +line 3 in the new file. An `` of 0 means no lines were +removed (pure insertion); a `` of 0 means no lines were +added (pure deletion). A start value of 0 is accepted when +the corresponding count is 0 (e.g., `hunk 0 0 1 5` for a newly +added file), matching what `git diff` itself emits for empty +file sides. Git ignores any extra whitespace-separated tokens after +``, so a future protocol version can append fields to a +hunk line (for example a "moved" marker) without older clients +rejecting it. + +Lines are delimited by newlines. A file `"foo\nbar\n"` and a +file `"foo\nbar"` both have 2 lines. + +Hunks must be listed in order and must not overlap. Any line not +covered by a hunk is treated as unchanged and is paired, in order, +with the unchanged lines on the other side. Each run of unchanged +lines between two hunks (and the run before the first hunk and +after the last) must therefore be the same length on both sides, +not merely equal in total. For the hunks `1 3 1 5` and `10 2 12 2` +below, lines 4-9 of the old file and lines 6-11 of the new file are +both the six unchanged lines between the two hunks. A response that +balances only the total unchanged count but misaligns one of these +runs is rejected, and Git falls back to the builtin diff. + +Git does not check that the lines a hunk leaves unchanged are +byte-for-byte identical between the two sides; it pairs them by +position and shows the new side as context. A tool may therefore +report lines that differ textually (a pure reformatting, say) as +unchanged, and the diff reflects that judgment. This is +the point of a semantic backend, but it means a misbehaving tool can +produce a diff whose context does not match the old blob; as with +`git diff -w`, such a patch may not apply against the old content. + +----------------------- +packet: git< hunk 1 3 1 5 +packet: git< hunk 10 2 12 2 +packet: git< 0000 +packet: git< status=success +packet: git< 0000 +----------------------- + +If the tool responds with hunks and "success", Git marks those lines +as changed and feeds them into the standard diff pipeline. Git may +still slide or regroup those changes against matching context for +display, exactly as it compacts its own diffs, so the tool controls +which lines are reported as changed, not the precise hunk boundaries. +Patch output features (word diff, function context, color) work +normally. Summary formats such as `--stat` still compute their counts +with the builtin diff for now; see "Which features consult the diff +process" below for the full picture and the reasoning behind it. + +If no hunk lines precede the flush, followed by "success", Git +treats the files as having no changes: `git diff` produces no output, +`git diff --exit-code` and `--quiet` report success even though the +stored blobs differ, and `git blame` skips the commit, attributing +lines to earlier commits. +The one exception is a change that only adds or removes the file's +trailing newline: it cannot be expressed as line hunks, so when the +line content otherwise matches Git keeps the builtin diff for that +file (preserving the `\ No newline at end of file` marker) instead of +treating the two sides as equal. + +----------------------- +packet: git< 0000 +packet: git< status=success +packet: git< 0000 +----------------------- + +If the tool returns invalid hunks (out of bounds, overlapping, or +mismatched unchanged line counts), Git warns and falls back to the +builtin diff algorithm. + +In case the tool cannot or does not want to process the content, +it is expected to respond with an "error" status. Git warns and +falls back to the builtin diff algorithm for this file. The tool +remains available for subsequent files. + +----------------------- +packet: git< 0000 +packet: git< status=error +packet: git< 0000 +----------------------- + +In case the tool cannot or does not want to process the content as +well as any future content for the lifetime of the Git process, it +is expected to respond with an "abort" status. Git silently falls +back to the builtin diff algorithm for this file and does not send +further requests to the tool. + +----------------------- +packet: git< 0000 +packet: git< status=abort +packet: git< 0000 +----------------------- + +If the tool dies during the communication or does not adhere to the +protocol then Git will stop the process and fall back to the builtin +diff algorithm. Git warns once and does not restart the process for +subsequent files. + +Tools should ignore unknown keys in the per-file request to remain +forward-compatible. Future versions of Git may send additional +`command=` values; tools that receive an unrecognized command should +respond with `status=error` rather than terminating. + +Which features consult the diff process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The diff process answers a single question: given two blobs, which +line ranges differ? Whether a particular feature consults it follows +from whether that is the question the feature is really asking. + +Features that ask "which lines changed" use the tool's hunks in place +of the builtin algorithm: + +- `git diff` patch output, together with everything layered on it: + word diff, function context (`-W`), `--color-moved`, the `@@` hunk + headers, and the `-L` line-range display. These operate on the + lines the patch step already emitted, so they reflect the tool's + hunks without any further negotiation. +- `git blame`: a commit whose change the tool reports as equivalent is + skipped, and its lines are attributed to an earlier commit. + +Features that ask a different question do not consult the process, by +design: + +- The pickaxe `-G` searches the textual diff for a pattern; it + asks "does this string appear in the diff," not "did these lines + change." (`-S` runs at an earlier stage and is likewise unaffected.) +- `git patch-id` must produce a stable hash for `git rebase` and + cherry-pick detection; deriving it from a configured tool would make + equal patches hash differently from machine to machine. +- The merge machinery (`git merge-tree`, `rerere`) computes merge + content and conflict signatures rather than display output, so the + tool's hunks must not alter its results. +- `git range-diff` diffs patch text, not source blobs, so source-file + hunks do not apply to it. +- `--check` reports whitespace errors in added lines using the builtin + diff's notion of which lines are added, not the tool's. It can + therefore flag (and exit non-zero on) a line the tool treats as + unchanged and that `git diff` shows as context. Whitespace breakage + is a property of the literal bytes, so `--check` keeps the builtin + partition deliberately; a future change could wire it to the tool if + matching `git diff` exactly became desirable. +- `--raw`, `--name-only`, and `--name-status` compare object ids at + the tree level and never run a line-level diff at all. + +Some features ask "which lines changed" but still use the builtin +algorithm for now, and may consult the process in a later change: the +summary formats (`--stat`, `--numstat`, `--shortstat`); `git log -L`'s +commit selection and parent range propagation (as distinct from its +display, which is covered above); and combined diffs (`--cc` and merge +diffs), whose protocol would have to be extended from a single old/new +pair to one comparison per merge parent. + +`--diff-algorithm` bypasses the process entirely, for every feature +listed above. The whitespace-ignoring options (`-w`, +`--ignore-space-change`, `--ignore-blank-lines`, and the like), +`-I`, and `--anchored` also bypass it for the affected files: +the tool is never told about these options, so it could not honor +them, and Git falls back to the builtin diff, which does. + Defining a custom hunk-header ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Makefile b/Makefile index 1cec251f4387cf..1314c104634310 100644 --- a/Makefile +++ b/Makefile @@ -811,6 +811,7 @@ TEST_BUILTINS_OBJS += test-csprng.o TEST_BUILTINS_OBJS += test-date.o TEST_BUILTINS_OBJS += test-delete-gpgsig.o TEST_BUILTINS_OBJS += test-delta.o +TEST_BUILTINS_OBJS += test-diff-process-backend.o TEST_BUILTINS_OBJS += test-dir-iterator.o TEST_BUILTINS_OBJS += test-drop-caches.o TEST_BUILTINS_OBJS += test-dump-cache-tree.o @@ -1140,6 +1141,7 @@ LIB_OBJS += diff-delta.o LIB_OBJS += diff-merges.o LIB_OBJS += diff-lib.o LIB_OBJS += diff-no-index.o +LIB_OBJS += diff-process.o LIB_OBJS += diff.o LIB_OBJS += diffcore-break.o LIB_OBJS += diffcore-delta.o diff --git a/diff-process.c b/diff-process.c new file mode 100644 index 00000000000000..9a161c6f10a97d --- /dev/null +++ b/diff-process.c @@ -0,0 +1,350 @@ +/* + * Diff process backend: communicates with a long-running external + * tool via the pkt-line protocol to obtain custom line-matching + * results. The tool controls which lines are marked as changed + * while the display shows the file content (after any textconv + * transformation, if configured). + * + * Protocol: pkt-line over stdin/stdout, following the pattern of + * the long-running filter process protocol (see convert.c). + * + * Handshake: + * git> git-diff-client / version=1 / flush + * tool< git-diff-server / version=1 / flush + * git> capability=hunks / flush + * tool< capability=hunks / flush + * + * Per-file: + * git> command=hunks / pathname= / flush + * git> / flush + * git> / flush + * tool< hunk + * tool< ... / flush + * tool< status=success / flush + * + * When the tool returns no hunks with status=success, it considers + * the files equivalent. Git will skip the diff for that file. + */ + +#include "git-compat-util.h" +#include "diff-process.h" +#include "diff.h" +#include "gettext.h" +#include "repository.h" +#include "sigchain.h" +#include "userdiff.h" +#include "sub-process.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "xdiff/xdiff.h" + +#define CAP_HUNKS (1u << 0) + +struct diff_subprocess { + struct subprocess_entry subprocess; + unsigned int supported_capabilities; +}; + +static int start_diff_process_fn(struct subprocess_entry *subprocess) +{ + static int versions[] = { 1, 0 }; + static struct subprocess_capability capabilities[] = { + { "hunks", CAP_HUNKS }, + { NULL, 0 } + }; + struct diff_subprocess *entry = + container_of(subprocess, struct diff_subprocess, subprocess); + + return subprocess_handshake(subprocess, "git-diff", + versions, NULL, + capabilities, + &entry->supported_capabilities); +} + +static struct diff_subprocess *get_or_launch_process( + struct userdiff_driver *drv) +{ + struct diff_subprocess *entry; + + if (drv->diff_subprocess) + return drv->diff_subprocess; + + entry = xcalloc(1, sizeof(*entry)); + if (subprocess_start_command(&entry->subprocess, drv->process, + start_diff_process_fn)) { + free(entry); + drv->diff_process_failed = 1; + return NULL; + } + + drv->diff_subprocess = entry; + return entry; +} + +static int send_file_content(int fd, const char *buf, long size) +{ + int ret = 0; + + if (size < 0) + return -1; + if (size > 0) + ret = write_packetized_from_buf_no_flush(buf, size, fd); + if (ret) + return ret; + return packet_flush_gently(fd); +} + +static int parse_hunk_line(const char *line, struct xdl_hunk *hunk) +{ + char *end; + + /* + * Format: "hunk " + * All numbers must be non-negative decimal with no leading + * whitespace or sign characters. + */ + if (!skip_prefix(line, "hunk ", &line)) + return -1; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->old_start = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->old_count = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->new_start = strtol(line, &end, 10); + if (errno || end == line || *end++ != ' ') + return -1; + line = end; + + if (!isdigit(*line)) + return -1; + errno = 0; + hunk->new_count = strtol(line, &end, 10); + /* + * Accept (and ignore) any trailing space-separated tokens after + * the four counts, so future protocol versions can append fields + * to a hunk line (e.g. a "moved" or "semantic" marker) without + * older clients rejecting it. Mirrors the request-side rule that + * tools ignore unknown keys. + */ + if (errno || end == line || (*end != '\0' && *end != ' ')) + return -1; + + /* + * git diff emits start=0 when count=0 (empty file side). + * Normalize to 1-based so downstream validation can assume start >= 1. + */ + if (!hunk->old_count && !hunk->old_start) + hunk->old_start = 1; + if (!hunk->new_count && !hunk->new_start) + hunk->new_start = 1; + + return 0; +} + +static enum diff_process_result get_hunks( + struct userdiff_driver *drv, + const char *path, + const char *old_buf, long old_size, + const char *new_buf, long new_size, + struct xdl_hunk **hunks_out, + size_t *nr_hunks_out) +{ + struct diff_subprocess *backend; + struct child_process *process; + int fd_in, fd_out; + struct strbuf status = STRBUF_INIT; + struct xdl_hunk *hunks = NULL; + struct xdl_hunk hunk; + size_t nr_hunks = 0, alloc_hunks = 0, max_hunks; + int len; + char *line; + + backend = get_or_launch_process(drv); + if (!backend) + return DIFF_PROCESS_ERROR; + + if (!(backend->supported_capabilities & CAP_HUNKS)) + return DIFF_PROCESS_SKIP; + + process = subprocess_get_child_process(&backend->subprocess); + fd_in = process->in; + fd_out = process->out; + + sigchain_push(SIGPIPE, SIG_IGN); + + /* Send request */ + if (packet_write_fmt_gently(fd_in, "command=hunks\n") || + packet_write_fmt_gently(fd_in, "pathname=%s\n", path) || + packet_flush_gently(fd_in)) + goto comm_error; + + /* Send old file content */ + if (send_file_content(fd_in, old_buf, old_size)) + goto comm_error; + + /* Send new file content */ + if (send_file_content(fd_in, new_buf, new_size)) + goto comm_error; + + /* + * Hunks are non-overlapping and each covers at least one line, so + * a valid response cannot contain more hunks than the two files + * have lines, which is bounded by their byte sizes. Cap the + * accumulation accordingly so a misbehaving tool that floods hunk + * lines cannot drive unbounded memory growth before validation. + */ + max_hunks = (size_t)old_size + (size_t)new_size + 1; + + /* Read hunks until flush packet */ + while ((len = packet_read_line_gently(fd_out, NULL, &line)) >= 0 && + line) { + if (parse_hunk_line(line, &hunk) < 0) + goto comm_error; + if (nr_hunks >= max_hunks) { + warning(_("diff process '%s' sent too many hunks" + " for '%s'"), drv->process, path); + goto comm_error; + } + ALLOC_GROW(hunks, nr_hunks + 1, alloc_hunks); + hunks[nr_hunks++] = hunk; + } + if (len < 0) + goto comm_error; + + /* Read status */ + if (subprocess_read_status(fd_out, &status)) + goto comm_error; + + if (!strcmp(status.buf, "success")) { + *hunks_out = hunks; + *nr_hunks_out = nr_hunks; + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_OK; + } + + if (!strcmp(status.buf, "abort")) { + /* + * The tool voluntarily withdrew: stop sending requests + * but do not warn (this is not a failure). + */ + backend->supported_capabilities &= ~CAP_HUNKS; + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_SKIP; + } + + /* status=error or unknown status */ + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_ERROR; + +comm_error: + /* + * Communication failure (broken pipe, malformed response). + * Tear down the process and mark as failed so we do not + * retry on every subsequent file. + */ + drv->diff_process_failed = 1; + drv->diff_subprocess = NULL; + subprocess_stop_command(&backend->subprocess); + free(backend); + free(hunks); + strbuf_release(&status); + sigchain_pop(SIGPIPE); + return DIFF_PROCESS_ERROR; +} + +/* + * Whether exactly one of the two blobs ends in a newline. A change + * that only adds or removes the trailing newline is not expressible as + * line hunks, so a tool comparing lines reports the files as equal. + */ +static int eof_newline_differs(const mmfile_t *a, const mmfile_t *b) +{ + int a_nl = a->size > 0 && a->ptr[a->size - 1] == '\n'; + int b_nl = b->size > 0 && b->ptr[b->size - 1] == '\n'; + return a_nl != b_nl; +} + +enum diff_process_result diff_process_fill_hunks( + struct diff_options *diffopt, + const char *path, + const mmfile_t *file_a, + const mmfile_t *file_b, + xpparam_t *xpp) +{ + struct userdiff_driver *drv; + struct xdl_hunk *ext_hunks = NULL; + size_t nr = 0; + enum diff_process_result res; + + if (!diffopt || !path) + return DIFF_PROCESS_SKIP; + if (diffopt->flags.no_diff_process || diffopt->ignore_driver_algorithm) + return DIFF_PROCESS_SKIP; + /* + * Whitespace-ignoring, regex-ignore (-I) and anchored options + * change which lines count as different, but the tool is never + * told about them, so its hunks could not honor them. Rather + * than silently override the user's request, fall back to the + * builtin diff, which does honor these flags. + */ + if ((diffopt->xdl_opts & (XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES)) || + diffopt->ignore_regex_nr || diffopt->anchors_nr) + return DIFF_PROCESS_SKIP; + + drv = userdiff_find_by_path(diffopt->repo->index, path); + if (!drv || !drv->process) + return DIFF_PROCESS_SKIP; + if (drv->diff_process_failed) + return DIFF_PROCESS_SKIP; + + res = get_hunks(drv, path, + file_a->ptr, file_a->size, + file_b->ptr, file_b->size, + &ext_hunks, &nr); + if (res == DIFF_PROCESS_OK) { + if (!nr) { + free(ext_hunks); + /* + * Zero hunks means the tool considers the line + * content identical, but it cannot express a + * trailing-newline-only change. When that is the + * actual difference, fall back to the builtin diff + * so the "\ No newline at end of file" marker is + * preserved instead of reporting the files equal. + */ + if (eof_newline_differs(file_a, file_b)) + return DIFF_PROCESS_SKIP; + return DIFF_PROCESS_EQUIVALENT; + } + xpp->external_hunks = ext_hunks; + xpp->external_hunks_nr = nr; + return DIFF_PROCESS_OK; + } + if (res == DIFF_PROCESS_ERROR) { + warning(_("diff process '%s' failed for '%s'," + " falling back to builtin diff"), + drv->process, path); + return DIFF_PROCESS_ERROR; + } + return DIFF_PROCESS_SKIP; +} diff --git a/diff-process.h b/diff-process.h new file mode 100644 index 00000000000000..d34b42f8116a04 --- /dev/null +++ b/diff-process.h @@ -0,0 +1,39 @@ +#ifndef DIFF_PROCESS_H +#define DIFF_PROCESS_H + +#include "xdiff/xdiff.h" + +struct diff_options; + +enum diff_process_result { + DIFF_PROCESS_ERROR = -1, /* tool failure: warned, fell back */ + DIFF_PROCESS_OK = 0, /* hunks populated in xpp */ + DIFF_PROCESS_SKIP, /* no process configured: use builtin */ + DIFF_PROCESS_EQUIVALENT, /* tool says files are equivalent */ +}; + +/* + * Consult the diff process configured for 'path' and populate + * xpp->external_hunks with the returned hunks. + * + * Handles driver lookup, flag checks (--no-ext-diff, + * --diff-algorithm), subprocess management, and error reporting. + * + * Returns DIFF_PROCESS_OK when hunks are populated in xpp. + * The caller owns xpp->external_hunks and must free() it. + * + * Returns DIFF_PROCESS_EQUIVALENT when the tool returns no hunks + * (files are considered identical); caller should skip diff/blame. + * Returns DIFF_PROCESS_SKIP when no process applies; caller + * should use the builtin diff algorithm. + * Returns DIFF_PROCESS_ERROR on tool failure (already warned); + * caller should fall back to the builtin diff algorithm. + */ +enum diff_process_result diff_process_fill_hunks( + struct diff_options *diffopt, + const char *path, + const mmfile_t *file_a, + const mmfile_t *file_b, + xpparam_t *xpp); + +#endif /* DIFF_PROCESS_H */ diff --git a/diff.c b/diff.c index 2a9d0d86871139..68bee81f0f56df 100644 --- a/diff.c +++ b/diff.c @@ -25,6 +25,7 @@ #include "utf8.h" #include "odb.h" #include "userdiff.h" +#include "diff-process.h" #include "submodule.h" #include "hashmap.h" #include "mem-pool.h" @@ -4055,6 +4056,17 @@ static void builtin_diff(const char *name_a, xpp.ignore_regex_nr = o->ignore_regex_nr; xpp.anchors = o->anchors; xpp.anchors_nr = o->anchors_nr; + + if (diff_process_fill_hunks(o, name_a, + &mf1, &mf2, &xpp) + == DIFF_PROCESS_EQUIVALENT) { + if (textconv_one) + free(mf1.ptr); + if (textconv_two) + free(mf2.ptr); + goto free_ab_and_return; + } + xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_FUNCNAMES; @@ -4135,6 +4147,7 @@ static void builtin_diff(const char *name_a, } else if (xdi_diff_outf(&mf1, &mf2, NULL, fn_out_consume, &ecbdata, &xpp, &xecfg)) die("unable to generate diff for %s", one->path); + free(xpp.external_hunks); if (o->word_diff) free_diff_words_data(&ecbdata); if (textconv_one) diff --git a/diff.h b/diff.h index bb5cddaf3499e9..7dc157968d69f9 100644 --- a/diff.h +++ b/diff.h @@ -173,6 +173,9 @@ struct diff_flags { */ unsigned allow_external; + /** Disables diff..process. */ + unsigned no_diff_process; + /** * For communication between the calling program and the options parser; * tell the calling program to signal the presence of difference using diff --git a/meson.build b/meson.build index 3247697f74aae1..aa532f5200a916 100644 --- a/meson.build +++ b/meson.build @@ -328,6 +328,7 @@ libgit_sources = [ 'diff-merges.c', 'diff-lib.c', 'diff-no-index.c', + 'diff-process.c', 'diff.c', 'diffcore-break.c', 'diffcore-delta.c', diff --git a/t/helper/meson.build b/t/helper/meson.build index 3235f10ab8aae1..6abcda4afb89c0 100644 --- a/t/helper/meson.build +++ b/t/helper/meson.build @@ -12,6 +12,7 @@ test_tool_sources = [ 'test-date.c', 'test-delete-gpgsig.c', 'test-delta.c', + 'test-diff-process-backend.c', 'test-dir-iterator.c', 'test-drop-caches.c', 'test-dump-cache-tree.c', diff --git a/t/helper/test-diff-process-backend.c b/t/helper/test-diff-process-backend.c new file mode 100644 index 00000000000000..a5a70dd40b5e3a --- /dev/null +++ b/t/helper/test-diff-process-backend.c @@ -0,0 +1,341 @@ +/* + * Test backend for the long-running diff process protocol + * (see diff-process.c and Documentation/gitattributes.adoc). + * + * Usage: test-tool diff-process-backend --mode= [--log=] + * + * Implements the server side of the pkt-line handshake and a per-file + * response loop. The --mode= switch selects the response shape + * (success, error, abort, crash, malformed hunks). + * + * Per-file request from Git: + * + * packet: git> command=hunks + * packet: git> pathname= + * packet: git> 0000 + * packet: git> OLD_CONTENT + * packet: git> 0000 + * packet: git> NEW_CONTENT + * packet: git> 0000 + * + * Response varies by --mode (default: whole-file): + * + * whole-file packet: git< hunk 1 1 + * fixed-hunk packet: git< hunk 5 2 5 2 + * no-hunks (no hunk packets) + * bad-hunk packet: git< hunk 999 1 999 1 + * bad-parse packet: git< garbage not a hunk + * bad-sync packet: git< hunk 1 2 1 1 + * bad-gap packet: git< hunk 1 1 3 1 + * multi-hunk packet: git< hunk 5 2 5 2 + * packet: git< hunk 9 2 9 2 + * flood packet: git< hunk 1 1 1 1 (x100000) + * overlap packet: git< hunk 1 5 1 5 + * packet: git< hunk 3 2 3 2 + * no-cap (omits capability=hunks during handshake) + * error (status=error instead of status=success) + * abort (status=abort instead of status=success) + * crash exit(1) before sending any response + * + * All non-error/abort modes end with: + * + * packet: git< 0000 + * packet: git< status=success + * packet: git< 0000 + * + * Each request is logged to --log as: + * + * command= pathname= old= new= + */ + +#include "test-tool.h" +#include "pkt-line.h" +#include "parse-options.h" +#include "strbuf.h" + +static FILE *logfile; + +enum mode { + MODE_WHOLE_FILE, + MODE_FIXED_HUNK, + MODE_NO_HUNKS, + MODE_BAD_HUNK, + MODE_BAD_PARSE, + MODE_BAD_SYNC, + MODE_BAD_GAP, + MODE_MULTI_HUNK, + MODE_FLOOD, + MODE_OVERLAP, + MODE_NO_CAP, + MODE_ERROR, + MODE_ABORT, + MODE_CRASH, +}; + +static enum mode parse_mode(const char *s) +{ + if (!strcmp(s, "whole-file")) + return MODE_WHOLE_FILE; + if (!strcmp(s, "fixed-hunk")) + return MODE_FIXED_HUNK; + if (!strcmp(s, "no-hunks")) + return MODE_NO_HUNKS; + if (!strcmp(s, "bad-hunk")) + return MODE_BAD_HUNK; + if (!strcmp(s, "bad-parse")) + return MODE_BAD_PARSE; + if (!strcmp(s, "bad-sync")) + return MODE_BAD_SYNC; + if (!strcmp(s, "bad-gap")) + return MODE_BAD_GAP; + if (!strcmp(s, "multi-hunk")) + return MODE_MULTI_HUNK; + if (!strcmp(s, "flood")) + return MODE_FLOOD; + if (!strcmp(s, "overlap")) + return MODE_OVERLAP; + if (!strcmp(s, "no-cap")) + return MODE_NO_CAP; + if (!strcmp(s, "error")) + return MODE_ERROR; + if (!strcmp(s, "abort")) + return MODE_ABORT; + if (!strcmp(s, "crash")) + return MODE_CRASH; + die("unknown --mode=%s", s); +} + +/* + * Read "key=value" packets up to a flush, capturing "command" and + * "pathname". Returns 1 if a request was read, 0 on EOF. + * + * The first packet uses the gentle variant so that a clean shutdown + * by Git (EOF) does not produce a spurious "the remote end hung up + * unexpectedly" on stderr. Subsequent packets use the non-gentle + * variant: once inside a request, truncation is a protocol violation + * and dying loudly is the correct response. + */ +static int read_request_header(char **command, char **pathname) +{ + int first = 1; + char *line; + + *command = *pathname = NULL; + for (;;) { + const char *value; + + if (first) { + if (packet_read_line_gently(0, NULL, &line) < 0) + return 0; + first = 0; + } else { + line = packet_read_line(0, NULL); + } + if (!line) + break; + if (skip_prefix(line, "command=", &value)) + *command = xstrdup(value); + else if (skip_prefix(line, "pathname=", &value)) + *pathname = xstrdup(value); + } + return 1; +} + +static size_t count_lines(const struct strbuf *buf) +{ + size_t lines = 0; + + for (size_t i = 0; i < buf->len; i++) + if (buf->buf[i] == '\n') + lines++; + + return lines + (buf->len > 0 && buf->buf[buf->len - 1] != '\n'); +} + +static void send_status(const char *status) +{ + packet_flush(1); + packet_write_fmt(1, "%s\n", status); + packet_flush(1); +} + +static void respond(enum mode mode, + const struct strbuf *old_buf, + const struct strbuf *new_buf) +{ + switch (mode) { + case MODE_ERROR: + send_status("status=error"); + return; + case MODE_ABORT: + send_status("status=abort"); + return; + case MODE_CRASH: + exit(1); + case MODE_FIXED_HUNK: + packet_write_fmt(1, "hunk 5 2 5 2\n"); + break; + case MODE_BAD_HUNK: + packet_write_fmt(1, "hunk 999 1 999 1\n"); + break; + case MODE_BAD_PARSE: + packet_write_fmt(1, "garbage not a hunk\n"); + break; + case MODE_BAD_SYNC: + packet_write_fmt(1, "hunk 1 2 1 1\n"); + break; + case MODE_BAD_GAP: + /* + * Globally balanced (1 changed line on each side, so the + * total unchanged counts match) but the gap before the + * change differs between sides: old line 1 vs new line 3. + * Exercises the per-gap lockstep-alignment check. + */ + packet_write_fmt(1, "hunk 1 1 3 1\n"); + break; + case MODE_MULTI_HUNK: + /* + * Two valid, non-overlapping, gap-aligned hunks. Exercises + * the accepting branch of the per-gap lockstep check with a + * non-zero previous-hunk end (the realistic two-region case). + */ + packet_write_fmt(1, "hunk 5 2 5 2\n"); + packet_write_fmt(1, "hunk 9 2 9 2\n"); + break; + case MODE_FLOOD: { + /* + * Emit far more hunks than any small file has lines, so Git + * trips its accumulation cap and falls back before reading + * them all. + */ + int i; + for (i = 0; i < 100000; i++) + packet_write_fmt(1, "hunk 1 1 1 1\n"); + break; + } + case MODE_OVERLAP: + packet_write_fmt(1, "hunk 1 5 1 5\n"); + packet_write_fmt(1, "hunk 3 2 3 2\n"); + break; + case MODE_NO_HUNKS: + break; + case MODE_NO_CAP: + case MODE_WHOLE_FILE: { + size_t old_lines = count_lines(old_buf); + size_t new_lines = count_lines(new_buf); + /* + * Match git diff output: start=0 when count=0 + * (empty file side), 1 otherwise. + */ + packet_write_fmt(1, "hunk %"PRIuMAX" %"PRIuMAX + " %"PRIuMAX" %"PRIuMAX"\n", + (uintmax_t)(old_lines ? 1 : 0), + (uintmax_t)old_lines, + (uintmax_t)(new_lines ? 1 : 0), + (uintmax_t)new_lines); + break; + } + } + send_status("status=success"); +} + +static void command_loop(enum mode mode) +{ + for (;;) { + char *command = NULL, *pathname = NULL; + struct strbuf obuf = STRBUF_INIT; + struct strbuf nbuf = STRBUF_INIT; + + if (!read_request_header(&command, &pathname)) + break; /* EOF: Git closed its end */ + + read_packetized_to_strbuf(0, &obuf, 0); + read_packetized_to_strbuf(0, &nbuf, 0); + + if (logfile) { + fprintf(logfile, + "command=%s pathname=%s old=%.*s new=%.*s\n", + command ? command : "(none)", + pathname ? pathname : "(none)", + (int)(strchrnul(obuf.buf, '\n') - obuf.buf), + obuf.buf, + (int)(strchrnul(nbuf.buf, '\n') - nbuf.buf), + nbuf.buf); + fflush(logfile); + } + + respond(mode, &obuf, &nbuf); + + free(command); + free(pathname); + strbuf_release(&obuf); + strbuf_release(&nbuf); + } +} + +static void handshake(enum mode mode) +{ + char *line; + + line = packet_read_line(0, NULL); + if (!line || strcmp(line, "git-diff-client")) + die("bad welcome: '%s'", line ? line : "(eof)"); + line = packet_read_line(0, NULL); + if (!line || strcmp(line, "version=1")) + die("bad version: '%s'", line ? line : "(eof)"); + if (packet_read_line(0, NULL)) + die("expected flush after version"); + + packet_write_fmt(1, "git-diff-server\n"); + packet_write_fmt(1, "version=1\n"); + packet_flush(1); + + /* Drain capabilities advertised by Git */ + while ((line = packet_read_line(0, NULL))) + ; /* drain */ + + /* Respond with our capabilities (or none for no-cap mode) */ + if (mode != MODE_NO_CAP) + packet_write_fmt(1, "capability=hunks\n"); + packet_flush(1); +} + +static const char *const usage_str[] = { + "test-tool diff-process-backend --mode= [--log=]", + NULL +}; + +int cmd__diff_process_backend(int argc, const char **argv) +{ + const char *mode_str = NULL, *log_path = NULL; + enum mode mode = MODE_WHOLE_FILE; + struct option options[] = { + OPT_STRING(0, "mode", &mode_str, "mode", + "response shape: whole-file (default), fixed-hunk," + " no-hunks, bad-hunk, bad-sync, overlap, error," + " abort, crash"), + OPT_STRING(0, "log", &log_path, "path", + "append per-request summary to this file"), + OPT_END() + }; + + argc = parse_options(argc, argv, NULL, options, usage_str, 0); + if (argc) + usage_with_options(usage_str, options); + + if (mode_str) + mode = parse_mode(mode_str); + + if (log_path) { + logfile = fopen(log_path, "a"); + if (!logfile) + die_errno("failed to open log '%s'", log_path); + } + + handshake(mode); + command_loop(mode); + + if (logfile && fclose(logfile)) + die_errno("error closing log"); + return 0; +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index b71a22b43bbc9e..3c3f95269c6279 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -22,6 +22,7 @@ static struct test_cmd cmds[] = { { "date", cmd__date }, { "delete-gpgsig", cmd__delete_gpgsig }, { "delta", cmd__delta }, + { "diff-process-backend", cmd__diff_process_backend }, { "dir-iterator", cmd__dir_iterator }, { "drop-caches", cmd__drop_caches }, { "dump-cache-tree", cmd__dump_cache_tree }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index f2885b33d58aa8..a5bb7555162c8e 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -15,6 +15,7 @@ int cmd__csprng(int argc, const char **argv); int cmd__date(int argc, const char **argv); int cmd__delta(int argc, const char **argv); int cmd__delete_gpgsig(int argc, const char **argv); +int cmd__diff_process_backend(int argc, const char **argv); int cmd__dir_iterator(int argc, const char **argv); int cmd__drop_caches(int argc, const char **argv); int cmd__dump_cache_tree(int argc, const char **argv); diff --git a/t/meson.build b/t/meson.build index 3219264fe7d497..6afbfa6a87155b 100644 --- a/t/meson.build +++ b/t/meson.build @@ -512,6 +512,7 @@ integration_tests = [ 't4072-diff-max-depth.sh', 't4073-diff-stat-name-width.sh', 't4074-diff-shifted-matched-group.sh', + 't4080-diff-process.sh', 't4100-apply-stat.sh', 't4101-apply-nonl.sh', 't4102-apply-rename.sh', diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh new file mode 100755 index 00000000000000..c101fc375e0c2a --- /dev/null +++ b/t/t4080-diff-process.sh @@ -0,0 +1,529 @@ +#!/bin/sh + +test_description='diff process via long-running process' + +. ./test-lib.sh + +# See t/helper/test-diff-process-backend.c for the backend implementation +# and available --mode= options. + +BACKEND="test-tool diff-process-backend" + +test_expect_success 'setup' ' + echo "*.c diff=cdiff" >.gitattributes && + git add .gitattributes && + + # boundary.c: 10 lines, changes at 5-6 and 9-10. + # Used by: hunk boundaries, error fallback, crash, bad hunks, overlap. + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + OLD5 + OLD6 + line7 + line8 + OLD9 + OLD10 + EOF + git add boundary.c && + + # worddiff.c: single-line function, value changes 1 -> 999. + # Used by: word-diff, --diff-algorithm, --no-ext-diff, --stat. + cat >worddiff.c <<-\EOF && + int value(void) { return 1; } + EOF + git add worddiff.c && + + # newfile.c: single-line function, value changes 42 -> 99. + # Used by: modified file, --exit-code, multiple drivers. + cat >newfile.c <<-\EOF && + int new_func(void) { return 42; } + EOF + git add newfile.c && + + # logtest.c: single-line function for log/format-patch tests. + # Needs two commits so log -1 has a diff. + cat >logtest.c <<-\EOF && + int logfunc(void) { return 1; } + EOF + git add logtest.c && + + # two.c/one.c: two-file pair for error/abort/startup-failure tests. + cat >one.c <<-\EOF && + int first(void) { return 1; } + EOF + cat >two.c <<-\EOF && + int second(void) { return 2; } + EOF + git add one.c two.c && + + git commit -m "initial" && + + # Second commit for logtest.c (so log -1 has something to show). + cat >logtest.c <<-\EOF && + int logfunc(void) { return 2; } + EOF + git add logtest.c && + git commit -m "change logtest.c" && + + # Working tree modifications (not committed). + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + NEW5 + NEW6 + line7 + line8 + NEW9 + NEW10 + EOF + + cat >worddiff.c <<-\EOF && + int value(void) { return 999; } + EOF + + cat >newfile.c <<-\EOF && + int new_func(void) { return 99; } + EOF + + cat >one.c <<-\EOF && + int first(void) { return 10; } + EOF + + cat >two.c <<-\EOF + int second(void) { return 20; } + EOF +' + +# +# Core behavior: the tool controls which lines are marked as changed. +# + +test_expect_success 'diff process hunk boundaries affect output' ' + # The file has changes at lines 5-6 and 9-10, but fixed-hunk + # only reports lines 5-6 as changed. Lines 9-10 should not + # appear as changed in the output. + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + diff boundary.c >actual && + test_grep "^-OLD5" actual && + test_grep "^-OLD6" actual && + test_grep "^+NEW5" actual && + test_grep "^+NEW6" actual && + test_grep ! "^-OLD9" actual && + test_grep ! "^-OLD10" actual && + test_grep ! "^+NEW9" actual && + test_grep ! "^+NEW10" actual +' + +test_expect_success 'diff process accepts valid multi-hunk output' ' + # multi-hunk reports both changed regions (5-6 and 9-10) as two + # gap-aligned hunks. This exercises the accepting branch of the + # per-gap lockstep check (non-zero previous-hunk end) and must + # produce a correct two-region diff with the lines between the + # hunks kept as context. + git -c diff.cdiff.process="$BACKEND --mode=multi-hunk" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + test_grep "^ line7" actual && + test_grep "^ line8" actual && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with modified file' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- newfile.c >actual 2>stderr && + test_grep "return 99" actual && + test_grep "pathname=newfile.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with added file (empty old side)' ' + cat >added.c <<-\EOF && + int added(void) { return 1; } + EOF + git add added.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --cached -- added.c >actual 2>stderr && + test_grep "added" actual && + test_grep "pathname=added.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with deleted file (empty new side)' ' + git add added.c && + git commit -m "commit added.c" && + git rm added.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --cached -- added.c >actual 2>stderr && + test_grep "deleted file" actual && + test_grep "pathname=added.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process skipped for binary files' ' + printf "\\0binary" >binary.c && + git add binary.c && + git commit -m "add binary" && + printf "\\0changed" >binary.c && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- binary.c >actual && + test_grep "Binary files" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process not consulted for unmatched driver' ' + echo "not tracked by cdiff" >unmatched.txt && + git add unmatched.txt && + git commit -m "add unmatched.txt" && + + echo "modified" >unmatched.txt && + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- unmatched.txt >actual && + test_grep "modified" actual && + test_path_is_missing backend.log +' + +test_expect_success 'multiple drivers use separate processes' ' + echo "*.h diff=hdiff" >>.gitattributes && + git add .gitattributes && + + cat >multi.h <<-\EOF && + int header(void) { return 1; } + EOF + git add multi.h && + git commit -m "add multi.h" && + + cat >multi.h <<-\EOF && + int header(void) { return 2; } + EOF + + test_when_finished "rm -f backend-c.log backend-h.log" && + git -c diff.cdiff.process="$BACKEND --log=backend-c.log" \ + -c diff.hdiff.process="$BACKEND --log=backend-h.log" \ + diff -- newfile.c multi.h >actual 2>stderr && + test_grep "pathname=newfile.c" backend-c.log && + test_grep "pathname=multi.h" backend-h.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works alongside textconv' ' + write_script uppercase-filter <<-\EOF && + tr "a-z" "A-Z" <"$1" + EOF + + cat >textconv.c <<-\EOF && + hello world + EOF + git add textconv.c && + git commit -m "add textconv.c" && + + cat >textconv.c <<-\EOF && + goodbye world + EOF + + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.textconv="./uppercase-filter" \ + -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- textconv.c >actual 2>stderr && + # The diff process receives textconv-transformed (uppercase) content. + test_grep "pathname=textconv.c" backend.log && + test_grep "old=HELLO WORLD" backend.log && + test_grep "new=GOODBYE WORLD" backend.log && + test_must_be_empty stderr +' + +# +# Downstream features: word diff, log, equivalent files, exit code. +# + +test_expect_success 'diff process with --word-diff' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --word-diff worddiff.c >actual 2>stderr && + test_grep "\[-1;-\]" actual && + test_grep "{+999;+}" actual && + test_grep "pathname=worddiff.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process works with git log -p' ' + # With no-hunks mode, the tool says the files are equivalent, + # so log -p should show the commit but no diff content. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + log -1 -p -- logtest.c >actual 2>stderr && + test_grep "change logtest.c" actual && + test_grep ! "return 2" actual && + test_grep "command=hunks pathname=logtest.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process no hunks suppresses diff output' ' + cat >nohunks.c <<-\EOF && + int zero(void) { return 0; } + EOF + git add nohunks.c && + git commit -m "add nohunks.c" && + + cat >nohunks.c <<-\EOF && + int zero(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff nohunks.c >actual && + test_must_be_empty actual +' + +test_expect_success 'diff process no hunks with --exit-code returns success' ' + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff --exit-code nohunks.c +' + +test_expect_success 'diff process falls back for trailing-newline-only change' ' + test_when_finished "rm -f backend.log" && + printf "a\nb\nc\n" >eofnl.c && + git add eofnl.c && + git commit -m "add eofnl.c" && + printf "a\nb\nc" >eofnl.c && + # Same lines, only the final newline removed. The tool reports + # no hunks (it sees identical lines), but that change is not + # expressible as hunks, so git falls back to the builtin diff + # rather than treating the files as equivalent. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff eofnl.c >actual 2>stderr && + test_grep "No newline at end of file" actual && + test_grep "pathname=eofnl.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process falls back for added file (empty old side)' ' + test_when_finished "rm -f backend.log" && + printf "x\ny\nz\n" >addnl.c && + git add addnl.c && + # The empty old side has no trailing newline while the new side + # does, so the newline fallback shows the addition rather than + # letting no-hunks suppress the whole new file. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --cached addnl.c >actual 2>stderr && + test_grep "^+x" actual && + test_grep "pathname=addnl.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process with --exit-code and hunks returns failure' ' + test_expect_code 1 git -c diff.cdiff.process="$BACKEND" \ + diff --exit-code newfile.c +' + +# +# Bypass mechanisms: flags and commands that skip the diff process. +# + +test_expect_success 'diff process bypassed by --diff-algorithm' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --diff-algorithm=patience worddiff.c >actual && + test_grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process bypassed under whitespace-ignoring flags' ' + test_when_finished "rm -f backend.log" && + printf "a\nb\nc\n" >wsbypass.c && + git add wsbypass.c && + git commit -m "add wsbypass.c" && + printf "a\n b \nc\n" >wsbypass.c && + # The tool is never told about these options and could not honor + # them, so git bypasses the process for each (covering the whole + # XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES mask, not just -w). + for opt in -w -b --ignore-space-at-eol --ignore-blank-lines + do + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff $opt wsbypass.c >actual 2>stderr && + test_path_is_missing backend.log && + test_must_be_empty stderr || + return 1 + done && + # -w additionally suppresses the whitespace-only change via the + # builtin diff that now runs. + git -c diff.cdiff.process="$BACKEND" diff -w wsbypass.c >actual && + test_must_be_empty actual +' + +# +# Error handling and fallback. +# + +test_expect_success 'diff process fallback on tool error status' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff boundary.c >actual 2>stderr && + # Fallback produces the full builtin diff (both change regions). + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + # Tool was contacted (it replied with error, not crash). + test_grep "command=hunks pathname=boundary.c" backend.log && + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process error keeps tool available for next file' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff -- one.c two.c >actual 2>stderr && + # Unlike abort, error keeps the tool available: both files + # are sent to the tool (and both fall back). + test_grep "pathname=one.c" backend.log && + test_grep "pathname=two.c" backend.log && + test_grep "return 10" actual && + test_grep "return 20" actual && + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process abort disables for session' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=abort --log=backend.log" \ + diff -- one.c two.c >actual 2>stderr && + # Both files should still produce diff output via fallback. + test_grep "return 10" actual && + test_grep "return 20" actual && + # The tool aborts on the first file and git clears its + # capability. The second file never contacts the tool. + test_grep "pathname=one.c" backend.log && + test_grep ! "pathname=two.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process fallback on tool crash' ' + git -c diff.cdiff.process="$BACKEND --mode=crash" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + # Crash is a communication failure, so a warning is emitted. + test_grep "diff process.*failed" stderr +' + +test_expect_success 'diff process startup failure only warns once' ' + git -c diff.cdiff.process="/nonexistent/tool" \ + diff -- one.c two.c >actual 2>stderr && + # Both files produce diff output via fallback. + test_grep "return 10" actual && + test_grep "return 20" actual && + # Sentinel prevents repeated warnings: only one, not one per file. + test_grep "diff process.*failed" stderr >warnings && + test_line_count = 1 warnings +' + + +test_expect_success 'diff process fallback on bad hunks' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-hunk" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual && + test_grep "^-OLD9" actual && + test_grep "^+NEW9" actual && + test_grep "exceeds.*lines" stderr +' + +test_expect_success 'diff process fallback on mismatched unchanged totals' ' + cat >synctest.c <<-\EOF && + line1 + line2 + line3 + EOF + git add synctest.c && + git commit -m "add synctest.c" && + + cat >synctest.c <<-\EOF && + line1 + changed + line3 + EOF + + # bad-sync reports hunk 1 2 1 1: marks 2 old lines and 1 new + # line as changed, leaving 1 unchanged old vs 2 unchanged new. + # The synchronization invariant fails and git falls back. + git -c diff.cdiff.process="$BACKEND --mode=bad-sync" \ + diff synctest.c >actual 2>stderr && + test_grep "changed" actual && + test_grep "unchanged line count mismatch" stderr +' + +test_expect_success 'diff process fallback on misaligned hunk gap' ' + # bad-gap reports hunk 1 1 3 1 on boundary.c: one changed line + # on each side, so the total unchanged counts match, but the + # unchanged run before the change differs (old line 1 vs new + # line 3). A global count check would accept this and emit a + # corrupt diff; the per-gap lockstep check rejects it and git + # falls back to the builtin algorithm. + git -c diff.cdiff.process="$BACKEND --mode=bad-gap" \ + diff boundary.c >actual 2>stderr && + # The builtin fallback shows both changed regions as additions + # (a corrupt-accepted hunk would show NEW5 only as context). + test_grep "^+NEW5" actual && + test_grep "^+NEW9" actual && + test_grep "unchanged run before hunk differs" stderr +' + +test_expect_success 'diff process fallback on overlapping hunks' ' + # boundary.c has 10 lines, so both hunks are in bounds + # but they overlap at lines 3-5, triggering the ordering check. + git -c diff.cdiff.process="$BACKEND --mode=overlap" \ + diff boundary.c >actual 2>stderr && + test_grep "NEW5" actual && + test_grep "overlaps with previous" stderr +' + +test_expect_success 'diff process fallback on malformed hunk line' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-parse" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "^+NEW5" actual +' + +test_expect_success 'diff process caps a flood of hunks and falls back' ' + # flood emits far more hunks than the file has lines. Git must + # stop accumulating and fall back to the builtin diff rather than + # grow memory without bound. + git -c diff.cdiff.process="$BACKEND --mode=flood" \ + diff boundary.c >actual 2>stderr && + test_grep "^-OLD5" actual && + test_grep "too many hunks" stderr +' + +test_expect_success 'diff process skipped when tool omits capability' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-cap --log=backend.log" \ + diff boundary.c >actual 2>stderr && + # Builtin diff runs: all changes appear, including lines 9-10 + # that a tool-provided hunk would have narrowed away. + test_grep "^-OLD5" actual && + test_grep "^-OLD9" actual && + # The process started (the handshake created the log) but was + # never sent a per-file request, so no hunks command is logged. + test_path_is_file backend.log && + test_grep ! "command=hunks" backend.log && + test_must_be_empty stderr +' + +test_done diff --git a/userdiff.h b/userdiff.h index 51c26e0d4190e5..a98eabe3770cc6 100644 --- a/userdiff.h +++ b/userdiff.h @@ -3,6 +3,7 @@ #include "notes-cache.h" +struct diff_subprocess; struct index_state; struct repository; @@ -33,6 +34,8 @@ struct userdiff_driver { int textconv_want_cache; const char *process; char *process_owned; + struct diff_subprocess *diff_subprocess; + unsigned diff_process_failed : 1; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, From 527acd60069b0f14d21c1d355d479ae976d91ff3 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 28 May 2026 15:09:25 -0700 Subject: [PATCH 6/9] diff: bypass diff process with --no-ext-diff and in format-patch Make --no-ext-diff disable diff..process in addition to diff..command. Although the two mechanisms work differently (command replaces Git's output, process feeds hunks back into the pipeline), both invoke external tools and --no-ext-diff means "no external tools." Replace the OPT_BOOL for --ext-diff with an OPT_CALLBACK that sets both allow_external and no_diff_process, so a single option controls both. Passing --ext-diff explicitly clears no_diff_process, so a later --ext-diff overrides an earlier --no-ext-diff. Disable the diff process unconditionally in format-patch so that generated patches are always based on the builtin diff algorithm and can be applied reliably by recipients who do not have the external tool. Document that --diff-algorithm also bypasses the diff process, since it forces the builtin algorithm. Signed-off-by: Michael Montalbo --- Documentation/diff-algorithm-option.adoc | 3 +++ Documentation/diff-options.adoc | 4 +++- Documentation/gitattributes.adoc | 6 +++--- builtin/log.c | 7 +++++++ diff.c | 16 ++++++++++++++-- diff.h | 4 +++- t/t4080-diff-process.sh | 16 ++++++++++++++++ 7 files changed, 49 insertions(+), 7 deletions(-) diff --git a/Documentation/diff-algorithm-option.adoc b/Documentation/diff-algorithm-option.adoc index 8e3a0b63d784d8..4d7e2ec35f97ea 100644 --- a/Documentation/diff-algorithm-option.adoc +++ b/Documentation/diff-algorithm-option.adoc @@ -18,3 +18,6 @@ For instance, if you configured the `diff.algorithm` variable to a non-default value and want to use the default one, then you have to use `--diff-algorithm=default` option. ++ +If you explicitly choose a diff algorithm, it also bypasses +`diff..process` (see linkgit:gitattributes[5]). diff --git a/Documentation/diff-options.adoc b/Documentation/diff-options.adoc index c8242e24627eef..a884445211ed8e 100644 --- a/Documentation/diff-options.adoc +++ b/Documentation/diff-options.adoc @@ -833,7 +833,9 @@ endif::git-format-patch[] to use this option with linkgit:git-log[1] and friends. `--no-ext-diff`:: - Disallow external diff drivers. + Disallow external diff helpers, including + `diff..command` and `diff..process` + (see linkgit:gitattributes[5]). `--textconv`:: `--no-textconv`:: diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index 568b5583a4d280..67a81e1aa3ef16 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -1052,9 +1052,9 @@ display, which is covered above); and combined diffs (`--cc` and merge diffs), whose protocol would have to be extended from a single old/new pair to one comparison per merge parent. -`--diff-algorithm` bypasses the process entirely, for every feature -listed above. The whitespace-ignoring options (`-w`, -`--ignore-space-change`, `--ignore-blank-lines`, and the like), +`--no-ext-diff` and `--diff-algorithm` bypass the process entirely, +for every feature listed above. The whitespace-ignoring options +(`-w`, `--ignore-space-change`, `--ignore-blank-lines`, and the like), `-I`, and `--anchored` also bypass it for the affected files: the tool is never told about these options, so it could not honor them, and Git falls back to the builtin diff, which does. diff --git a/builtin/log.c b/builtin/log.c index d027ce1e0bc833..7821a61143c9d5 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -2217,6 +2217,13 @@ int cmd_format_patch(int argc, if (argc > 1) die(_("unrecognized argument: %s"), argv[1]); + /* + * Disable diff..process so that patches generated by + * format-patch are always based on the builtin diff algorithm + * and can be applied reliably. + */ + rev.diffopt.flags.no_diff_process = 1; + if (rev.diffopt.output_format & DIFF_FORMAT_NAME) die(_("--name-only does not make sense")); if (rev.diffopt.output_format & DIFF_FORMAT_NAME_STATUS) diff --git a/diff.c b/diff.c index 68bee81f0f56df..baf1767618a27b 100644 --- a/diff.c +++ b/diff.c @@ -5940,6 +5940,17 @@ static int diff_opt_submodule(const struct option *opt, return 0; } +static int diff_opt_ext_diff(const struct option *opt, + const char *arg, int unset) +{ + struct diff_options *options = opt->value; + + BUG_ON_OPT_ARG(arg); + options->flags.allow_external = !unset; + options->flags.no_diff_process = unset; + return 0; +} + static int diff_opt_textconv(const struct option *opt, const char *arg, int unset) { @@ -6270,8 +6281,9 @@ struct option *add_diff_options(const struct option *opts, N_("exit with 1 if there were differences, 0 otherwise")), OPT_BOOL(0, "quiet", &options->flags.quick, N_("disable all output of the program")), - OPT_BOOL(0, "ext-diff", &options->flags.allow_external, - N_("allow an external diff helper to be executed")), + OPT_CALLBACK_F(0, "ext-diff", options, NULL, + N_("allow an external diff helper to be executed"), + PARSE_OPT_NOARG, diff_opt_ext_diff), OPT_CALLBACK_F(0, "textconv", options, NULL, N_("run external text conversion filters when comparing binary files"), PARSE_OPT_NOARG, diff_opt_textconv), diff --git a/diff.h b/diff.h index 7dc157968d69f9..bc7da6986a4aac 100644 --- a/diff.h +++ b/diff.h @@ -173,7 +173,9 @@ struct diff_flags { */ unsigned allow_external; - /** Disables diff..process. */ + /** + * Disables diff..process. Set by --no-ext-diff. + */ unsigned no_diff_process; /** diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index c101fc375e0c2a..779c0f1db5b784 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -343,6 +343,22 @@ test_expect_success 'diff process bypassed by --diff-algorithm' ' test_path_is_missing backend.log ' +test_expect_success 'diff process bypassed by --no-ext-diff' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --no-ext-diff worddiff.c >actual && + test_grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success 'diff process not used by format-patch' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + format-patch -1 --stdout -- logtest.c >actual && + test_grep "return 2" actual && + test_path_is_missing backend.log +' + test_expect_success 'diff process bypassed under whitespace-ignoring flags' ' test_when_finished "rm -f backend.log" && printf "a\nb\nc\n" >wsbypass.c && From 37d016a1a6dc7dd575f46e2dd1e2cb8a86fa64a2 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Thu, 28 May 2026 15:09:38 -0700 Subject: [PATCH 7/9] blame: consult diff process for no-hunk detection When a diff process is configured via diff..process, consult it during blame's per-commit diffing. If the process returns no hunks for a commit's changes to a file, treat the commit as having no changes, causing blame to attribute lines to earlier commits. The consultation happens at the pass_blame_to_parent() callsite using diff_process_fill_hunks(), matching how builtin_diff() in diff.c uses the same function. A new diff_hunks_xpp() variant accepts a pre-populated xpparam_t so callers can pass external hunks, while the existing diff_hunks() retains its original signature and behavior. The copy-detection callsite is unaffected since it does not use the diff process. The subprocess is long-running (one startup cost amortized across the blame traversal), but each commit in the file's history incurs a round-trip to the tool. Blame's -w option is not communicated to the process and it could not honor it, so blame consults the process only when -w is unset and otherwise uses the builtin diff, matching the bypass for "git diff". Signed-off-by: Michael Montalbo --- blame.c | 46 ++++++++++++--- t/t4080-diff-process.sh | 126 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 9 deletions(-) diff --git a/blame.c b/blame.c index 126e2324162353..a914430e0270b9 100644 --- a/blame.c +++ b/blame.c @@ -19,6 +19,8 @@ #include "tag.h" #include "trace2.h" #include "blame.h" +#include "diff-process.h" +#include "xdiff-interface.h" #include "alloc.h" #include "commit-slab.h" #include "bloom.h" @@ -314,17 +316,25 @@ static struct commit *fake_working_tree_commit(struct repository *r, -static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, - xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) +static int diff_hunks_xpp(mmfile_t *file_a, mmfile_t *file_b, + xdl_emit_hunk_consume_func_t hunk_func, + void *cb_data, xpparam_t *xpp) { - xpparam_t xpp = {0}; xdemitconf_t xecfg = {0}; xdemitcb_t ecb = {NULL}; - xpp.flags = xdl_opts; xecfg.hunk_func = hunk_func; ecb.priv = cb_data; - return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + return xdi_diff(file_a, file_b, xpp, &xecfg, &ecb); +} + +static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, + xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) +{ + xpparam_t xpp = {0}; + + xpp.flags = xdl_opts; + return diff_hunks_xpp(file_a, file_b, hunk_func, cb_data, &xpp); } static const char *get_next_line(const char *start, const char *end) @@ -1946,6 +1956,7 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, struct blame_origin *parent, int ignore_diffs) { mmfile_t file_p, file_o; + xpparam_t xpp = {0}; struct blame_chunk_cb_data d; struct blame_entry *newdest = NULL; @@ -1964,10 +1975,27 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, &sb->num_read_blob, ignore_diffs); sb->num_get_patch++; - if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts)) - die("unable to generate diff (%s -> %s)", - oid_to_hex(&parent->commit->object.oid), - oid_to_hex(&target->commit->object.oid)); + xpp.flags = sb->xdl_opts; + /* + * Whitespace-ignoring options are not communicated to the diff + * process, so it could not honor them; consult it only when none + * are set, and otherwise use the builtin diff (which does honor + * them via xpp.flags), matching the bypass in + * diff_process_fill_hunks() for "git diff". If the process + * considers the files equivalent, skip the diff so blame looks + * past this commit. + */ + if ((xpp.flags & (XDF_WHITESPACE_FLAGS | XDF_IGNORE_BLANK_LINES)) || + diff_process_fill_hunks(&sb->revs->diffopt, target->path, + &file_p, &file_o, &xpp) + != DIFF_PROCESS_EQUIVALENT) { + if (diff_hunks_xpp(&file_p, &file_o, blame_chunk_cb, + &d, &xpp)) + die("unable to generate diff (%s -> %s)", + oid_to_hex(&parent->commit->object.oid), + oid_to_hex(&target->commit->object.oid)); + } + free(xpp.external_hunks); /* The rest are the same as the parent */ blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, 0, parent, target, 0); diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index 779c0f1db5b784..db04f20edcbb51 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -542,4 +542,130 @@ test_expect_success 'diff process skipped when tool omits capability' ' test_must_be_empty stderr ' +# +# Blame integration. +# + +test_expect_success 'blame uses tool-provided hunks' ' + cat >blame-hunk.c <<-\EOF && + line1 + line2 + line3 + line4 + original5 + original6 + line7 + line8 + line9 + line10 + EOF + git add blame-hunk.c && + git commit -m "add blame-hunk.c" && + ORIG=$(git rev-parse --short HEAD) && + + cat >blame-hunk.c <<-\EOF && + line1 + line2 + line3 + line4 + changed5 + changed6 + line7 + line8 + changed9 + changed10 + EOF + git add blame-hunk.c && + git commit -m "change blame-hunk.c" && + CHANGE=$(git rev-parse --short HEAD) && + + # With fixed-hunk mode the tool reports only lines 5-6 as changed, + # so blame should attribute lines 9-10 to the original commit + # even though the builtin diff would show them as changed. + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + blame blame-hunk.c >actual && + sed -n "9p" actual >line9 && + sed -n "10p" actual >line10 && + test_grep "$ORIG" line9 && + test_grep "$ORIG" line10 && + sed -n "5p" actual >line5 && + sed -n "6p" actual >line6 && + test_grep "$CHANGE" line5 && + test_grep "$CHANGE" line6 +' + +test_expect_success 'blame skips commits with no hunks from diff process' ' + cat >blame.c <<-\EOF && + int main(void) { + return 0; + } + EOF + git add blame.c && + git commit -m "add blame.c" && + ORIG_COMMIT=$(git rev-parse --short HEAD) && + + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "reformat blame.c" && + BLAME_COMMIT=$(git rev-parse --short HEAD) && + + # Without no-hunks mode, blame attributes the change. + git blame blame.c >without && + test_grep "$BLAME_COMMIT" without && + + # With no-hunks mode, the process considers the files equivalent + # and blame skips the reformat commit, attributing to the original. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + blame blame.c >with && + test_grep ! "$BLAME_COMMIT" with && + test_grep "$ORIG_COMMIT" with +' + +test_expect_success 'blame --no-ext-diff bypasses diff process' ' + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + blame --no-ext-diff blame.c >actual && + # Without the process, blame attributes the reformat commit normally. + test_grep "$BLAME_COMMIT" actual && + test_path_is_missing backend.log +' + +test_expect_success 'blame --no-ext-diff uses builtin hunks' ' + # fixed-hunk mode would narrow blame to lines 5-6, but + # --no-ext-diff should bypass it and use the builtin diff. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk --log=backend.log" \ + blame --no-ext-diff blame-hunk.c >actual && + # Builtin diff attributes lines 9-10 to the change commit. + sed -n "9p" actual >line9 && + test_grep "$CHANGE" line9 && + test_path_is_missing backend.log +' + +test_expect_success 'blame -w bypasses diff process' ' + test_when_finished "rm -f backend.log" && + printf "alpha\nbeta\ngamma\n" >blamew.c && + git add blamew.c && + git commit -m "add blamew.c" && + orig=$(git rev-parse --short HEAD) && + printf "alpha\n beta \ngamma\n" >blamew.c && + git commit -am "reindent beta" && + reindent=$(git rev-parse --short HEAD) && + # blame -w must ignore the whitespace-only change and attribute + # beta to the original commit, not the reindent commit. The tool + # is never told about -w, so blame must bypass it (not let tool + # hunks override -w). + git -c diff.cdiff.process="$BACKEND --mode=whole-file --log=backend.log" \ + blame -w blamew.c >actual && + sed -n "2p" actual >line2 && + test_grep "$orig" line2 && + test_grep ! "$reindent" line2 && + test_path_is_missing backend.log +' + test_done From 883effb674993df83bae477a3202a4de3e0e3369 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Wed, 24 Jun 2026 20:38:45 -0700 Subject: [PATCH 8/9] diff: consult diff process for --stat counts builtin_diff() already consults a configured diff..process: a file the tool reports as equivalent emits no patch, and otherwise the tool's hunks drive the output. builtin_diffstat() ran its own xdiff and ignored the process, so "git diff --stat" still counted a byte-level change for a file that "git diff" showed as unchanged. Consult diff_process_fill_hunks() before the stat xdiff, mirroring the blame integration. On DIFF_PROCESS_EQUIVALENT, skip the xdiff so the file keeps its zero inserted and deleted counts and the existing "nothing changed" pruning drops it, matching the empty patch. Otherwise the tool's hunks, or the builtin fallback, feed the counts through the shared xpparam_t. Like the builtin summary path, builtin_diffstat() does not apply textconv, so the process is consulted on the raw blob content here, unlike builtin_diff() which sends textconv'd content. This keeps "git diff --stat" counting raw lines as it does today; the asymmetry between patch output and summary counts under textconv predates this change. Move the summary formats out of the "not yet wired" group of the "Which features consult the diff process" documentation and into the list of features that use the tool's hunks, noting the raw, non-textconv content they receive. Add tests covering counts that come from the tool's hunks (--numstat), an equivalent file producing no stat line, and the raw, non-textconv content the tool receives for --stat. Document that the line-counting --dirstat=lines follows these counts while the default --dirstat does not, and that summary formats and blame (only under --textconv) differ from patch output in whether they textconv the content the tool sees. Extend the tests to cover --shortstat, --stat --exit-code, a multi-file mix of equivalent and changed files, and a mode-only change. Signed-off-by: Michael Montalbo --- Documentation/gitattributes.adoc | 33 +++++--- diff.c | 17 +++- t/t4080-diff-process.sh | 128 +++++++++++++++++++++++++++++++ 3 files changed, 167 insertions(+), 11 deletions(-) diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index 67a81e1aa3ef16..d644c2956c1849 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -873,7 +873,10 @@ a flush packet, followed by the old and new file content as packetized data, each terminated with a flush packet. The pathname is relative to the repository root. When `diff..textconv` is also set, the tool receives the textconv-transformed content rather than the -raw blob. Git does not send binary files to the diff process. +raw blob, matching what the consuming feature itself diffs: patch +output is textconv'd, the summary formats (noted below) are not, and +`git blame` applies textconv only under `--textconv`. Git does not +send binary files to the diff process. ----------------------- packet: git> command=hunks @@ -944,8 +947,8 @@ still slide or regroup those changes against matching context for display, exactly as it compacts its own diffs, so the tool controls which lines are reported as changed, not the precise hunk boundaries. Patch output features (word diff, function context, color) work -normally. Summary formats such as `--stat` still compute their counts -with the builtin diff for now; see "Which features consult the diff +normally, as do summary formats like `--stat`. Not every feature +consults the process, though; see "Which features consult the diff process" below for the full picture and the reasoning behind it. If no hunk lines precede the flush, followed by "success", Git @@ -1019,6 +1022,17 @@ of the builtin algorithm: hunks without any further negotiation. - `git blame`: a commit whose change the tool reports as equivalent is skipped, and its lines are attributed to an earlier commit. +- `--stat`, `--numstat`, and `--shortstat`: the inserted and deleted + counts come from the tool's hunks, so a file the tool calls + equivalent contributes no stat line, matching the empty patch that + `git diff` produces for it. These summary formats do not apply + textconv (just as the builtin summary path does not), so the tool + is consulted on the raw blob content even when a `textconv` is also + configured for patch output; this mirrors how builtin `--stat` + already counts raw lines rather than the textconv'd view. The + line-counting `--dirstat=lines` uses these same counts; the default + `--dirstat`, which weighs byte changes, is computed on its own and + does not consult the tool. Features that ask a different question do not consult the process, by design: @@ -1044,13 +1058,12 @@ design: - `--raw`, `--name-only`, and `--name-status` compare object ids at the tree level and never run a line-level diff at all. -Some features ask "which lines changed" but still use the builtin -algorithm for now, and may consult the process in a later change: the -summary formats (`--stat`, `--numstat`, `--shortstat`); `git log -L`'s -commit selection and parent range propagation (as distinct from its -display, which is covered above); and combined diffs (`--cc` and merge -diffs), whose protocol would have to be extended from a single old/new -pair to one comparison per merge parent. +Two cases ask "which lines changed" but still use the builtin +algorithm, and may consult the process in a later change: `git log +-L`'s commit selection and parent range propagation (as distinct from +its display, which is covered above), and combined diffs (`--cc` and +merge diffs), whose protocol would have to be extended from a single +old/new pair to one comparison per merge parent. `--no-ext-diff` and `--diff-algorithm` bypass the process entirely, for every feature listed above. The whitespace-ignoring options diff --git a/diff.c b/diff.c index baf1767618a27b..a260ecfaf48e3c 100644 --- a/diff.c +++ b/diff.c @@ -4261,9 +4261,24 @@ static void builtin_diffstat(const char *name_a, const char *name_b, xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_NO_HUNK_HDR; - if (xdi_diff_outf(&mf1, &mf2, NULL, + /* + * Consult the diff process so --stat reflects the + * tool's view of which lines changed rather than the + * builtin line diff. As in the builtin path, --stat + * does not apply textconv, so the tool is fed the same + * raw mmfiles the stat itself diffs (unlike builtin_diff, + * which consults the process on textconv'd content). + * When the tool reports the files as equivalent we skip + * xdiff entirely, leaving added and deleted at zero so + * the file is pruned below, just as builtin_diff() emits + * no patch for an equivalent file. + */ + if (diff_process_fill_hunks(o, name_a, &mf1, &mf2, &xpp) + != DIFF_PROCESS_EQUIVALENT && + xdi_diff_outf(&mf1, &mf2, NULL, diffstat_consume, diffstat, &xpp, &xecfg)) die("unable to generate diffstat for %s", one->path); + free(xpp.external_hunks); if (DIFF_FILE_VALID(one) && DIFF_FILE_VALID(two)) { struct diffstat_file *file = diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index db04f20edcbb51..2c9613d1c4185e 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -248,6 +248,21 @@ test_expect_success 'diff process works alongside textconv' ' test_must_be_empty stderr ' +test_expect_success 'diff process --stat is fed raw, not textconv, content' ' + # Reuses textconv.c from the previous test (committed "hello + # world", modified to "goodbye world"). Unlike patch output, + # --stat does not apply textconv, so the tool sees raw lowercase + # content here even with a textconv configured. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.textconv="./uppercase-filter" \ + -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --stat -- textconv.c >actual 2>stderr && + test_grep "pathname=textconv.c" backend.log && + test_grep "old=hello world" backend.log && + test_grep "new=goodbye world" backend.log && + test_must_be_empty stderr +' + # # Downstream features: word diff, log, equivalent files, exit code. # @@ -331,6 +346,119 @@ test_expect_success 'diff process with --exit-code and hunks returns failure' ' diff --exit-code newfile.c ' +test_expect_success 'diff process feeds --numstat counts' ' + # fixed-hunk reports only lines 5-6 as changed, so the stat + # counts come from the tool (2/2), not the builtin diff (4/4). + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk --log=backend.log" \ + diff --numstat boundary.c >actual 2>stderr && + printf "2\t2\tboundary.c\n" >expect && + test_cmp expect actual && + test_grep "command=hunks pathname=boundary.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process --numstat sums multi-hunk counts' ' + # multi-hunk reports both 2-line regions (5-6 and 9-10), so the + # counts add up across both hunks: 4 inserted, 4 deleted. This + # exercises the two-region hunk path through builtin_diffstat. + git -c diff.cdiff.process="$BACKEND --mode=multi-hunk" \ + diff --numstat boundary.c >actual && + printf "4\t4\tboundary.c\n" >expect && + test_cmp expect actual +' + +test_expect_success 'diff process equivalent files produce no --stat line' ' + # A file the tool calls equivalent contributes no stat line, + # matching the empty patch that git diff produces for it. + test_when_finished "rm -f backend.log" && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --stat worddiff.c >actual 2>stderr && + test_must_be_empty actual && + test_grep "command=hunks pathname=worddiff.c" backend.log && + test_must_be_empty stderr +' + +test_expect_success 'diff process feeds --shortstat counts' ' + # fixed-hunk reports lines 5-6 only, so the summary counts come + # from the tool (2 insertions, 2 deletions), not builtin (4/4). + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + diff --shortstat boundary.c >actual && + test_grep "2 insertions" actual && + test_grep "2 deletions" actual +' + +test_expect_success 'diff process equivalent file makes --stat --exit-code succeed' ' + # The tool reports worddiff.c equivalent, so --exit-code reports + # no change (0); the builtin diff would report a change (1). + git -c diff.cdiff.process="$BACKEND --mode=no-hunks" \ + diff --stat --exit-code worddiff.c && + test_expect_code 1 git diff --no-ext-diff --stat --exit-code worddiff.c +' + +test_expect_success 'diff process --numstat with mixed equivalent and changed files' ' + test_when_finished "rm -f c.log h.log" && + # Self-contained fixtures: *.c uses whole-file (changed); *.mh + # uses no-hunks (equivalent). + echo "*.mh diff=hdiff" >>.gitattributes && + git add .gitattributes && + printf "int a(void) { return 1; }\n" >mixed.c && + printf "int b(void) { return 1; }\n" >mixed.mh && + git add mixed.c mixed.mh && + git commit -m "add mixed fixtures" && + printf "int a(void) { return 2; }\n" >mixed.c && + printf "int b(void) { return 2; }\n" >mixed.mh && + git -c diff.cdiff.process="$BACKEND --mode=whole-file --log=c.log" \ + -c diff.hdiff.process="$BACKEND --mode=no-hunks --log=h.log" \ + diff --numstat mixed.c mixed.mh >actual 2>stderr && + test_grep "mixed.c" actual && + test_grep ! "mixed.mh" actual && + test_grep "pathname=mixed.c" c.log && + test_grep "pathname=mixed.mh" h.log && + test_must_be_empty stderr +' + +test_expect_success POSIXPERM 'diff process keeps mode-only change in --stat' ' + test_when_finished "rm -f backend.log" && + cat >modeonly.c <<-\EOF && + int m(void) { return 1; } + EOF + git add modeonly.c && + git commit -m "add modeonly.c" && + cat >modeonly.c <<-\EOF && + int m(void) { return 2; } + EOF + git add modeonly.c && + test_chmod +x modeonly.c && + git commit -m "edit and chmod modeonly.c" && + # Content and mode both changed, but no-hunks reports the content + # equivalent. The tool is consulted (counts are zero, not the + # builtin 1/1), yet the mode change keeps the file from being + # pruned. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --stat HEAD^ HEAD >actual 2>stderr && + test_grep "modeonly.c" actual && + test_grep "command=hunks pathname=modeonly.c" backend.log && + test_grep ! "1 insertion" actual && + test_must_be_empty stderr +' + +test_expect_success 'diff process not consulted for default --dirstat' ' + # Default --dirstat counts via its own path and never contacts + # the tool, so the change is still reported even though no-hunks + # would call it equivalent. (--dirstat=lines uses the stat path.) + test_when_finished "rm -f backend.log" && + mkdir -p dsub && + printf "a\nb\nc\n" >dsub/d.c && + git add dsub/d.c && + git commit -m "add dsub/d.c" && + printf "a\nB\nc\n" >dsub/d.c && + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + diff --dirstat=0 dsub/d.c >actual && + test_grep "dsub" actual && + test_path_is_missing backend.log +' + # # Bypass mechanisms: flags and commands that skip the diff process. # From 079f4f0a0f3a45d003df331ec4b754da02b23e60 Mon Sep 17 00:00:00 2001 From: Michael Montalbo Date: Wed, 24 Jun 2026 21:43:48 -0700 Subject: [PATCH 9/9] line-log: consult diff process for range tracking git log -L tracks line ranges by diffing each commit against its parent in collect_diff(). This pass used the builtin diff while the displayed diff (builtin_diff()) consults a configured diff..process, so the two could disagree: a reformat-only commit selected by builtin tracking was then rendered with an empty diff because the tool reported the files equivalent. Consult the process in collect_diff() too, mirroring the blame integration. When the tool reports the files equivalent, collect no ranges; the tracked range then maps across unchanged and the commit drops out of the log, matching what is displayed. Like the summary formats, the tracking pass diffs raw content, so the tool is consulted on the raw blobs here. Signed-off-by: Michael Montalbo --- Documentation/gitattributes.adoc | 20 ++++++++++--------- line-log.c | 24 ++++++++++++++++++++--- t/t4080-diff-process.sh | 33 ++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index d644c2956c1849..8c8cb051c7075a 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -1016,12 +1016,16 @@ Features that ask "which lines changed" use the tool's hunks in place of the builtin algorithm: - `git diff` patch output, together with everything layered on it: - word diff, function context (`-W`), `--color-moved`, the `@@` hunk - headers, and the `-L` line-range display. These operate on the - lines the patch step already emitted, so they reflect the tool's - hunks without any further negotiation. + word diff, function context (`-W`), `--color-moved`, and the `@@` + hunk headers. These operate on the lines the patch step already + emitted, so they reflect the tool's hunks without any further + negotiation. - `git blame`: a commit whose change the tool reports as equivalent is skipped, and its lines are attributed to an earlier commit. +- `git log -L`: both the line-range display and the underlying range + tracking consult the tool, so a commit it reports as equivalent is + dropped from the log (its tracked range maps across unchanged) + rather than selected and then shown with an empty diff. - `--stat`, `--numstat`, and `--shortstat`: the inserted and deleted counts come from the tool's hunks, so a file the tool calls equivalent contributes no stat line, matching the empty patch that @@ -1058,11 +1062,9 @@ design: - `--raw`, `--name-only`, and `--name-status` compare object ids at the tree level and never run a line-level diff at all. -Two cases ask "which lines changed" but still use the builtin -algorithm, and may consult the process in a later change: `git log --L`'s commit selection and parent range propagation (as distinct from -its display, which is covered above), and combined diffs (`--cc` and -merge diffs), whose protocol would have to be extended from a single +Combined diffs (`--cc` and merge diffs) ask "which lines changed" but +still use the builtin algorithm, and may consult the process in a +later change; their protocol would have to be extended from a single old/new pair to one comparison per merge parent. `--no-ext-diff` and `--diff-algorithm` bypass the process entirely, diff --git a/line-log.c b/line-log.c index 5fc75ae275e03a..5da7bcdaeb330b 100644 --- a/line-log.c +++ b/line-log.c @@ -7,6 +7,7 @@ #include "tag.h" #include "tree.h" #include "diff.h" +#include "diff-process.h" #include "commit.h" #include "decorate.h" #include "repository.h" @@ -330,12 +331,15 @@ static int collect_diff_cb(long start_a, long count_a, return 0; } -static int collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges *out) +static int collect_diff(struct diff_options *diffopt, const char *path, + mmfile_t *parent, mmfile_t *target, + struct diff_ranges *out) { struct collect_diff_cbdata cbdata = {NULL}; xpparam_t xpp; xdemitconf_t xecfg; xdemitcb_t ecb; + int ret = 0; memset(&xpp, 0, sizeof(xpp)); memset(&xecfg, 0, sizeof(xecfg)); @@ -345,7 +349,20 @@ static int collect_diff(mmfile_t *parent, mmfile_t *target, struct diff_ranges * xecfg.hunk_func = collect_diff_cb; memset(&ecb, 0, sizeof(ecb)); ecb.priv = &cbdata; - return xdi_diff(parent, target, &xpp, &xecfg, &ecb); + + /* + * Consult the diff process so range tracking agrees with the + * diff that will be shown. When the tool reports the files as + * equivalent we collect no ranges, so the tracked range maps + * across unchanged and the commit drops out of the log, rather + * than being selected here but rendered with an empty diff by + * the process-aware builtin_diff(). + */ + if (diff_process_fill_hunks(diffopt, path, parent, target, &xpp) + != DIFF_PROCESS_EQUIVALENT) + ret = xdi_diff(parent, target, &xpp, &xecfg, &ecb); + free(xpp.external_hunks); + return ret; } /* @@ -927,7 +944,8 @@ static int process_diff_filepair(struct rev_info *rev, } diff_ranges_init(&diff); - if (collect_diff(&file_parent, &file_target, &diff)) + if (collect_diff(&rev->diffopt, pair->two->path, + &file_parent, &file_target, &diff)) die("unable to generate diff for %s", pair->one->path); /* NEEDSWORK should apply some heuristics to prevent mismatches */ diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh index 2c9613d1c4185e..07a2ad7403706e 100755 --- a/t/t4080-diff-process.sh +++ b/t/t4080-diff-process.sh @@ -796,4 +796,37 @@ test_expect_success 'blame -w bypasses diff process' ' test_path_is_missing backend.log ' +# +# Line-log (git log -L) range tracking. +# + +test_expect_success 'diff process drops equivalent commit from log -L' ' + test_when_finished "rm -f backend.log" && + cat >linelog.c <<-\EOF && + int tracked(void) { return 1; } + EOF + git add linelog.c && + git commit -m "add linelog.c" && + + cat >linelog.c <<-\EOF && + int tracked(void) { return 2; } + EOF + git commit -am "change tracked line" && + + # Builtin line tracking selects the change commit. + git log --no-ext-diff -L1,1:linelog.c --format="%s" >builtin && + test_grep "change tracked line" builtin && + + # With the tool reporting the change as equivalent, tracking + # drops the commit (the range maps across unchanged) instead of + # selecting it and rendering an empty diff. + git -c diff.cdiff.process="$BACKEND --mode=no-hunks --log=backend.log" \ + log -L1,1:linelog.c --format="%s" >actual && + test_grep ! "change tracked line" actual && + # The creating commit still appears, so the change commit was + # selectively dropped rather than the whole log going empty. + test_grep "add linelog.c" actual && + test_grep "command=hunks pathname=linelog.c" backend.log +' + test_done