Skip to content

[RegexDiff X64] [stephentoub] Reduce backtracking for greedy loops followed ... #1822

@MihuBot

Description

@MihuBot

Job completed in 11 minutes 5 seconds (remote runner delay: 0 seconds).

Using arguments: regexdiff -NoPRLink
Main commit: dotnet/runtime@93ec72d
PR commit: stephentoub/runtime@727885a

69 out of 18857 patterns have generated source code changes.

Examples of GeneratedRegex source diffs
"([A-Z]+)([A-Z][a-z])" (1481 uses)
[GeneratedRegex("([A-Z]+)([A-Z][a-z])")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOfAnyInRange('A', 'Z')) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       UncaptureUntil(0);
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (!char.IsAsciiLetterUpper(inputSpan[charloop_ending_pos]))
  {
      UncaptureUntil(0);
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"([0-9\\w\\+]+\\.)|([0-9\\w\\+]+\\+)([\\(\\)]*)" (582 uses)
[GeneratedRegex("([0-9\\w\\+]+\\.)|([0-9\\w\\+]+\\+)([\\(\\)]*)")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf('+')) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       UncaptureUntil(0);
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != '+')
  {
      UncaptureUntil(0);
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"^[a-z0-9][a-z0-9.-]+[a-z0-9]$" (452 uses)
[GeneratedRegex("^[a-z0-9][a-z0-9.-]+[a-z0-9]$")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOfAny(Utilities.s_asciiLettersLowerAndDigits)) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (!(((uint)((ch = inputSpan[charloop_ending_pos]) - '0') <= (uint)('9' - '0')) | ((uint)(ch - 'a') <= (uint)('z' - 'a'))))
  {
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"((([^\"]*\\\\\\\")*)|([^\"]*))[^\"]*(\\\"|$)" (193 uses)
[GeneratedRegex("((([^\"]*\\\\\\\")*)|([^\"]*))[^\"]*(\\\"|$)")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, Math.Min(inputSpan.Length, charloop_ending_pos + 1) - charloop_starting_pos).LastIndexOf("\\\"")) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       goto LoopIterationNoMatch;
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != '\\')
  {
      goto LoopIterationNoMatch;
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+) \\s+ [+- ..." (169 uses)
[GeneratedRegex("([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+) \\s+ [+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) (?:\\s+[+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+))+")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos2 >= charloop_ending_pos2 ||
-       (charloop_ending_pos2 = inputSpan.Slice(charloop_starting_pos2, charloop_ending_pos2 - charloop_starting_pos2).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos2 >= charloop_ending_pos2)
+   {
+       goto AlternationBacktrack;
+   }
+   charloop_ending_pos2--;
+   if (inputSpan[charloop_ending_pos2] != ' ')
  {
      goto AlternationBacktrack;
  }
-   charloop_ending_pos2 += charloop_starting_pos2;
  pos = charloop_ending_pos2;
+   charloop_ending_pos2 = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd2:
"\\s+ (?:z|m|zm|Z|M|ZM) \\s* \\(" (169 uses)
[GeneratedRegex("\\s+ (?:z|m|zm|Z|M|ZM) \\s* \\(")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != ' ')
  {
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos1 >= charloop_ending_pos1 ||
-       (charloop_ending_pos1 = inputSpan.Slice(charloop_starting_pos1, Math.Min(inputSpan.Length, charloop_ending_pos1 + 1) - charloop_starting_pos1).LastIndexOf(" (")) < 0)
+   if (charloop_starting_pos1 >= charloop_ending_pos1)
+   {
+       goto AlternationBacktrack;
+   }
+   charloop_ending_pos1--;
+   if (inputSpan[charloop_ending_pos1] != ' ')
  {
      goto AlternationBacktrack;
  }
-   charloop_ending_pos1 += charloop_starting_pos1;
  pos = charloop_ending_pos1;
+   charloop_ending_pos1 = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd1:
"^ \\s* ([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) ..." (169 uses)
[GeneratedRegex("^ \\s* ([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) \\s* ([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) \\s* ([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) \\s* ([+-]?(?:\\d+\\.?\\d*|\\d*\\.?\\d+)) \\s* $")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       UncaptureUntil(0);
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != ' ')
  {
      UncaptureUntil(0);
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos3 >= charloop_ending_pos3 ||
-       (charloop_ending_pos3 = inputSpan.Slice(charloop_starting_pos3, charloop_ending_pos3 - charloop_starting_pos3).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos3 >= charloop_ending_pos3)
+   {
+       goto CaptureBacktrack;
+   }
+   charloop_ending_pos3--;
+   if (inputSpan[charloop_ending_pos3] != ' ')
  {
      goto CaptureBacktrack;
  }
-   charloop_ending_pos3 += charloop_starting_pos3;
  pos = charloop_ending_pos3;
+   charloop_ending_pos3 = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd3:
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos6 >= charloop_ending_pos6 ||
-       (charloop_ending_pos6 = inputSpan.Slice(charloop_starting_pos6, charloop_ending_pos6 - charloop_starting_pos6).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos6 >= charloop_ending_pos6)
+   {
+       goto CaptureBacktrack1;
+   }
+   charloop_ending_pos6--;
+   if (inputSpan[charloop_ending_pos6] != ' ')
  {
      goto CaptureBacktrack1;
  }
-   charloop_ending_pos6 += charloop_starting_pos6;
  pos = charloop_ending_pos6;
+   charloop_ending_pos6 = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd6:
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos9 >= charloop_ending_pos9 ||
-       (charloop_ending_pos9 = inputSpan.Slice(charloop_starting_pos9, charloop_ending_pos9 - charloop_starting_pos9).LastIndexOf(' ')) < 0)
+   if (charloop_starting_pos9 >= charloop_ending_pos9)
+   {
+       goto CaptureBacktrack2;
+   }
+   charloop_ending_pos9--;
+   if (inputSpan[charloop_ending_pos9] != ' ')
  {
      goto CaptureBacktrack2;
  }
-   charloop_ending_pos9 += charloop_starting_pos9;
  pos = charloop_ending_pos9;
+   charloop_ending_pos9 = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd9:
"(?<ServiceName>^[^ ]+): " (164 uses)
[GeneratedRegex("(?<ServiceName>^[^ ]+): ")]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, Math.Min(inputSpan.Length, charloop_ending_pos + 1) - charloop_starting_pos).LastIndexOf(": ")) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       UncaptureUntil(0);
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != ':')
  {
      UncaptureUntil(0);
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"Bind<(?<contractTypes>[\\w.,\\s<>]+)>\\(\\)( ..." (125 uses)
[GeneratedRegex("Bind<(?<contractTypes>[\\w.,\\s<>]+)>\\(\\)(?<config>.*)\\.To<\\s*(?<instanceType>[\\w.<>]+)\\s*>\\(\\)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant)]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, Math.Min(inputSpan.Length, charloop_ending_pos + 2) - charloop_starting_pos).LastIndexOf(">()")) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       UncaptureUntil(0);
+       return false; // The input didn't match.
+   }
+   charloop_ending_pos--;
+   if (inputSpan[charloop_ending_pos] != '>')
  {
      UncaptureUntil(0);
      return false; // The input didn't match.
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:
"(?<link>[a-z]*?:\\/\\/(?<domain>(?:[a-z0-9]\ ..." (124 uses)
[GeneratedRegex("(?<link>[a-z]*?:\\/\\/(?<domain>(?:[a-z0-9]\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\.)*[a-z0-9-]*[a-z0-9](?::\\d+)?)(?<path>(?:(?:\\/+(?:[a-z0-9$_\\.\\+!\\*\\',;:\\(\\)@&~=-]|%[0-9a-f]{2})*)*(?:\\?(?:[a-z0-9$_\\+!\\*\\',;:\\(\\)@&=\\/~-]|%[0-9a-f]{2})*)?)?(?:#(?:[a-z0-9$_\\+!\\*\\',;:\\(\\)@&=\\/~-]|%[0-9a-f]{2})*)?)?)", RegexOptions.IgnoreCase)]
      base.CheckTimeout();
  }
  
-   if (charloop_starting_pos >= charloop_ending_pos ||
-       (charloop_ending_pos = inputSpan.Slice(charloop_starting_pos, charloop_ending_pos - charloop_starting_pos).LastIndexOfAny(Utilities.s_asciiLettersAndDigitsAndKelvinSign)) < 0)
+   if (charloop_starting_pos >= charloop_ending_pos)
+   {
+       goto LoopIterationNoMatch;
+   }
+   charloop_ending_pos--;
+   if (!((ch = inputSpan[charloop_ending_pos]) < 128 ? char.IsAsciiLetterOrDigit(ch) : RegexRunner.CharInClass((char)ch, "\0\b\00:A[a{KÅ")))
  {
      goto LoopIterationNoMatch;
  }
-   charloop_ending_pos += charloop_starting_pos;
  pos = charloop_ending_pos;
+   charloop_ending_pos = 0;
  slice = inputSpan.Slice(pos);
  
  CharLoopEnd:

For more diff examples, see https://gist.github.com/MihuBot/495ff26669f2a814ba0dd01d2ab7448c

Sample source code for further analysis
const string JsonPath = "RegexResults-1822.json";
if (!File.Exists(JsonPath))
{
    await using var archiveStream = await new HttpClient().GetStreamAsync("https://mihubot.xyz/r/FJaS88VA");
    using var archive = new ZipArchive(archiveStream, ZipArchiveMode.Read);
    archive.Entries.First(e => e.Name == "Results.json").ExtractToFile(JsonPath);
}

using FileStream jsonFileStream = File.OpenRead(JsonPath);
RegexEntry[] entries = JsonSerializer.Deserialize<RegexEntry[]>(jsonFileStream, new JsonSerializerOptions { IncludeFields = true })!;
Console.WriteLine($"Working with {entries.Length} patterns");



record KnownPattern(string Pattern, RegexOptions Options, int Count);

sealed class RegexEntry
{
    public required KnownPattern Regex { get; set; }
    public required string MainSource { get; set; }
    public required string PrSource { get; set; }
    public string? FullDiff { get; set; }
    public string? ShortDiff { get; set; }
    public (string Name, string Values)[]? SearchValuesOfChar { get; set; }
    public (string[] Values, StringComparison ComparisonType)[]? SearchValuesOfString { get; set; }
}

Artifacts:

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions