Skip to content

JIT: Add a runtime async optimization to skip saving unmutated locals into reused continuations #125615

Draft
jakobbotsch wants to merge 19 commits into dotnet:main from
jakobbotsch:reuse-continuation-fields
Draft

JIT: Add a runtime async optimization to skip saving unmutated locals into reused continuations #125615
jakobbotsch wants to merge 19 commits into dotnet:main from
jakobbotsch:reuse-continuation-fields

Conversation

@jakobbotsch
Copy link
Member

@jakobbotsch jakobbotsch commented Mar 16, 2026

With #125556 we learn something whenever we reuse a continuation -- specifically that the continuation was created at one of the other suspension points that can reach the current suspension point. We can use that knowledge to skip saving all locals that cannot possibly have been mutated since any previous suspension point. This saves a lot of write barriers when we reuse continuations.

Micro benchmark with warmup
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;

namespace AsyncMicro;

/// <summary>
/// Micro benchmark: drives an async method through many awaits of an awaiter that
/// never reports completion, resuming it manually from <see cref="Main"/>, and
/// prints how long the await loop took.
/// </summary>
public class Program
{
    /// <summary>
    /// Runs a warmup phase of short <see cref="Foo"/> calls followed by five long
    /// calls; only the long calls print a timing (see the <c>n &gt; 100</c> check in
    /// <see cref="Foo"/>).
    /// </summary>
    static void Main()
    {
        NullAwaiter na = new NullAwaiter();

        // Warmup: 10 rounds of 100 runs with n == 100, sleeping 100 ms after each round.
        // NOTE(review): the sleeps presumably give the runtime time to tier up Foo
        // before the measured runs -- confirm.
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 100; j++)
            {
                Task t = Foo(100, na);
                // The awaiter never completes on its own, so keep invoking the
                // continuation it stored until the task finishes.
                while (!t.IsCompleted)
                {
                    na.Continue();
                }
            }

            Thread.Sleep(100);
        }

        // Measured runs: five runs of 10,000,000 awaits each; Foo prints the elapsed time.
        for (int i = 0; i < 5; i++)
        {
            Task t = Foo(10_000_000, na);
            while (!t.IsCompleted)
            {
                na.Continue();
            }
        }
    }

    // Static accumulator written by the first loop in Foo.
    static int s_value;

    /// <summary>
    /// Adds 0..n-1 to <see cref="s_value"/>, then awaits <paramref name="na"/>
    /// <paramref name="n"/> times while a <see cref="Stopwatch"/> is running.
    /// </summary>
    /// <param name="n">Iteration count for both the accumulation loop and the await loop.</param>
    /// <param name="na">Awaitable to await on every iteration of the second loop.</param>
    /// <returns>A task that completes once all awaits have been resumed.</returns>
    static async Task Foo(int n, NullAwaiter na)
    {
        // Non-awaiting loop that runs before the timed section.
        for (int i = 0; i < n; i++)
        {
            s_value += i;
        }

        // Timed section: n awaits. NullAwaiter.IsCompleted is always false, so each
        // await goes through the awaiter's completion callback.
        Stopwatch timer = Stopwatch.StartNew();
        for (int i = 0; i < n; i++)
        {
            await na;
        }

        // The warmup calls in Main pass n == 100 and therefore print nothing.
        if (n > 100)
            Console.WriteLine("Took {0:F1} ms", timer.Elapsed.TotalMilliseconds);
    }

    /// <summary>
    /// Object that is both awaitable and its own awaiter. It always reports
    /// "not completed" and records the continuation it is handed instead of
    /// scheduling it, so the caller decides when to resume.
    /// </summary>
    private class NullAwaiter : ICriticalNotifyCompletion
    {
        // Most recent continuation passed to UnsafeOnCompleted; Main invokes it directly.
        public Action Continue;

        // Returns this same instance as the awaiter.
        public NullAwaiter GetAwaiter() => this;

        // Always false.
        public bool IsCompleted => false;

        // Nothing to return or throw.
        public void GetResult()
        {
        }

        // Stores the continuation without invoking or scheduling it.
        public void UnsafeOnCompleted(Action continuation)
        {
            Continue = continuation;
        }

        // Not supported by this awaiter: always throws NotImplementedException.
        public void OnCompleted(Action continuation)
        {
            throw new NotImplementedException();
        }
    }
}

With DOTNET_TC_OnStackReplacement=0 (set because of #120865), this improves performance by about 10%.

Codegen diff
@@ -26,7 +26,7 @@ G_M000_IG01:                ;; offset=0x0000
        mov      qword ptr [rbp-0x30], rax
        mov      gword ptr [rbp+0x10], rcx
        mov      gword ptr [rbp+0x20], r8
-       mov      esi, edx
+       mov      ebx, edx
  
 G_M000_IG02:                ;; offset=0x0034
        test     rcx, rcx
@@ -49,7 +49,7 @@ G_M000_IG03:                ;; offset=0x006B
  
 G_M000_IG04:                ;; offset=0x007F
        xor      eax, eax
-       test     esi, esi
+       test     ebx, ebx
        jle      SHORT G_M000_IG07
  
 G_M000_IG05:                ;; offset=0x0085
@@ -59,7 +59,7 @@ G_M000_IG05:                ;; offset=0x0085
 G_M000_IG06:                ;; offset=0x008F
        add      dword ptr [rdx], eax
        inc      eax
-       cmp      eax, esi
+       cmp      eax, ebx
        jl       SHORT G_M000_IG06
  
 G_M000_IG07:                ;; offset=0x0097
@@ -83,36 +83,36 @@ G_M000_IG08:                ;; offset=0x00C1
  
 G_M000_IG09:                ;; offset=0x00DC
        mov      rdi, gword ptr [rbp-0x50]
-       test     esi, esi
+       test     ebx, ebx
        jle      SHORT G_M000_IG13
  
 G_M000_IG10:                ;; offset=0x00E4
-       mov      rbx, gword ptr [rbp+0x20]
-       cmp      byte  ptr [rbx], bl
-       mov      r14d, esi
+       mov      rsi, gword ptr [rbp+0x20]
+       cmp      byte  ptr [rsi], sil
+       mov      r14d, ebx
  
-G_M000_IG11:                ;; offset=0x00ED
-       mov      r8, rbx
+G_M000_IG11:                ;; offset=0x00EE
+       mov      r8, rsi
        mov      rcx, 0x7FFB4D3458F0
        xor      rdx, rdx
        call     [System.Runtime.CompilerServices.AsyncHelpers:UnsafeAwaitAwaiter[System.__Canon](System.__Canon)]
        test     rcx, rcx
        jne      G_M000_IG22
  
-G_M000_IG12:                ;; offset=0x010B
+G_M000_IG12:                ;; offset=0x010C
        dec      r14d
        jne      SHORT G_M000_IG11
  
-G_M000_IG13:                ;; offset=0x0110
-       cmp      esi, 100
+G_M000_IG13:                ;; offset=0x0111
+       cmp      ebx, 100
        jle      G_M000_IG16
        jmp      SHORT G_M000_IG15
  
-G_M000_IG14:                ;; offset=0x011B
+G_M000_IG14:                ;; offset=0x011C
        call     CORINFO_HELP_POLL_GC
        jmp      SHORT G_M000_IG09
  
-G_M000_IG15:                ;; offset=0x0122
+G_M000_IG15:                ;; offset=0x0123
        mov      rcx, rdi
        call     [System.Diagnostics.Stopwatch:get_ElapsedTicks():long:this]
        vxorps   xmm0, xmm0, xmm0
@@ -136,18 +136,18 @@ G_M000_IG15:                ;; offset=0x0122
        call     [System.TimeSpan:get_TotalMilliseconds():double:this]
        vmovsd   qword ptr [rbx+0x08], xmm0
        mov      rdx, rbx
-       mov      rcx, 0x209A15C05B8
+       mov      rcx, 0x1AC6C9305B8
        call     [System.Console:WriteLine(System.String,System.Object)]
        nop      
  
-G_M000_IG16:                ;; offset=0x01B3
+G_M000_IG16:                ;; offset=0x01B4
        cmp      gword ptr [rbp+0x10], 0
        je       SHORT G_M000_IG20
  
-G_M000_IG17:                ;; offset=0x01BA
+G_M000_IG17:                ;; offset=0x01BB
        xor      ecx, ecx
  
-G_M000_IG18:                ;; offset=0x01BC
+G_M000_IG18:                ;; offset=0x01BD
        add      rsp, 88
        pop      rbx
        pop      rsi
@@ -157,12 +157,12 @@ G_M000_IG18:                ;; offset=0x01BC
        pop      rbp
        ret      
  
-G_M000_IG19:                ;; offset=0x01C9
+G_M000_IG19:                ;; offset=0x01CA
        mov      ecx, 2
        call     CORINFO_HELP_GETDYNAMIC_GCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED
        jmp      G_M000_IG03
  
-G_M000_IG20:                ;; offset=0x01D8
+G_M000_IG20:                ;; offset=0x01D9
        mov      ecx, 2
        call     CORINFO_HELP_GETDYNAMIC_GCTHREADSTATIC_BASE_NOCTOR_OPTIMIZED
        mov      rbx, gword ptr [rax+0x10]
@@ -173,7 +173,7 @@ G_M000_IG20:                ;; offset=0x01D8
        mov      rdx, r8
        call     CORINFO_HELP_ASSIGN_REF
  
-G_M000_IG21:                ;; offset=0x01FC
+G_M000_IG21:                ;; offset=0x01FD
        mov      r8, gword ptr [rbx+0x08]
        mov      rdx, gword ptr [rbp-0x30]
        cmp      rdx, r8
@@ -182,7 +182,7 @@ G_M000_IG21:                ;; offset=0x01FC
        call     [System.Threading.ExecutionContext:RestoreChangedContextToThread(System.Threading.Thread,System.Threading.ExecutionContext,System.Threading.ExecutionContext)]
        jmp      SHORT G_M000_IG17
  
-G_M000_IG22:                ;; offset=0x0214
+G_M000_IG22:                ;; offset=0x0215
        mov      rax, rcx
        mov      rcx, gword ptr [rbp+0x10]
        mov      r15, rcx
@@ -193,24 +193,24 @@ G_M000_IG22:                ;; offset=0x0214
        call     CORINFO_HELP_ASSIGN_REF
        jmp      SHORT G_M000_IG24
  
-G_M000_IG23:                ;; offset=0x0231
+G_M000_IG23:                ;; offset=0x0232
        mov      rcx, rax
        mov      rdx, 0x7FFB4D38C788
        call     [CORINFO_HELP_ALLOC_CONTINUATION]
        mov      r15, rax
- 
-G_M000_IG24:                ;; offset=0x0247
-       lea      rcx, [reloc @RWD08]
-       mov      qword ptr [r15+0x10], rcx
-       xor      ecx, ecx
-       mov      qword ptr [r15+0x18], rcx
        lea      rcx, bword ptr [r15+0x28]
-       mov      rdx, rbx
+       mov      rdx, rsi
        call     CORINFO_HELP_ASSIGN_REF
        lea      rcx, bword ptr [r15+0x30]
        mov      rdx, rdi
        call     CORINFO_HELP_ASSIGN_REF
-       mov      dword ptr [r15+0x38], esi
+       mov      dword ptr [r15+0x38], ebx
+ 
+G_M000_IG24:                ;; offset=0x0264
+       lea      rax, [reloc @RWD08]
+       mov      qword ptr [r15+0x10], rax
+       xor      eax, eax
+       mov      qword ptr [r15+0x18], rax
        mov      dword ptr [r15+0x3C], r14d
        call     [System.Runtime.CompilerServices.AsyncHelpers:CaptureExecutionContext():System.Threading.ExecutionContext]
        lea      rcx, bword ptr [r15+0x20]
@@ -224,7 +224,7 @@ G_M000_IG24:                ;; offset=0x0247
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreContextsOnSuspension(bool,System.Threading.ExecutionContext,System.Threading.SynchronizationContext)]
        mov      rcx, r15
  
-G_M000_IG25:                ;; offset=0x02A6
+G_M000_IG25:                ;; offset=0x02A7
        add      rsp, 88
        pop      rbx
        pop      rsi
@@ -234,22 +234,22 @@ G_M000_IG25:                ;; offset=0x02A6
        pop      rbp
        ret      
  
-G_M000_IG26:                ;; offset=0x02B3
+G_M000_IG26:                ;; offset=0x02B4
        mov      rcx, gword ptr [rbp+0x10]
        mov      rcx, gword ptr [rcx+0x20]
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreExecutionContext(System.Threading.ExecutionContext)]
        mov      rcx, gword ptr [rbp+0x10]
-       mov      rbx, gword ptr [rcx+0x28]
+       mov      rsi, gword ptr [rcx+0x28]
        mov      rdi, gword ptr [rcx+0x30]
-       mov      esi, dword ptr [rcx+0x38]
+       mov      ebx, dword ptr [rcx+0x38]
        mov      r14d, dword ptr [rcx+0x3C]
        jmp      G_M000_IG12
  
-G_M000_IG27:                ;; offset=0x02D9
+G_M000_IG27:                ;; offset=0x02DA
        sub      rsp, 40
        vzeroupper 
  
-G_M000_IG28:                ;; offset=0x02E0
+G_M000_IG28:                ;; offset=0x02E1
        cmp      gword ptr [rbp+0x10], 0
        setne    cl
        movzx    rcx, cl
@@ -258,7 +258,7 @@ G_M000_IG28:                ;; offset=0x02E0
        call     [System.Runtime.CompilerServices.AsyncHelpers:RestoreContexts(bool,System.Threading.ExecutionContext,System.Threading.SynchronizationContext)]
        nop      
  
-G_M000_IG29:                ;; offset=0x02FA
+G_M000_IG29:                ;; offset=0x02FB
        add      rsp, 40
        ret      
  
@@ -266,10 +266,10 @@ RWD00  	dq	43E0000000000000h
 RWD08  	dq	(dynamicClass):IL_STUB_AsyncResume(System.Object,byref):System.Object
 	dq	G_M000_IG22 + 3
 
-; Total bytes of code 767
+; Total bytes of code 768
 
-Took 409.6 ms
-Took 407.4 ms
-Took 411.7 ms
-Took 412.4 ms
-Took 407.6 ms
+Took 375.8 ms
+Took 376.3 ms
+Took 376.5 ms
+Took 374.9 ms
+Took 377.9 ms

Based on #125556

Copilot AI review requested due to automatic review settings March 16, 2026 14:15
@github-actions github-actions bot added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Mar 16, 2026
@dotnet-policy-service
Copy link
Contributor

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

This PR adds a runtime async optimization to skip saving locals that haven't been mutated since the last resumption point when reusing continuation objects. It builds on PR #125556 (which added continuation reuse) by leveraging the knowledge that a reused continuation already holds correct values for unmutated locals, thus eliminating unnecessary write barriers and improving performance by ~10%.

Changes:

  • Introduces PreservedValueAnalysis, a forward dataflow analysis that computes which tracked locals may have been mutated since the previous resumption point, enabling the optimization to skip saving unchanged locals.
  • Restructures continuation layout handling: replaces the old per-call ContinuationLayout with a ContinuationLayoutBuilder/ContinuationLayout split where a shared layout can be computed across all suspension points, and switches flag encoding from HAS_* bitmasks to index-based encoding of exception/context/result offsets.
  • Splits CreateSuspension and CreateResumption into block-creation and IR-population phases, with the new CreateResumptionsAndSuspensions method driving the two-phase approach and handling shared vs per-call layouts.

Reviewed changes

Copilot reviewed 9 out of 9 changed files in this pull request and generated 4 comments.

Show a summary per file
File Description
src/coreclr/inc/corinfo.h Replaces HAS_* flag bits with index-based encoding for exception, context, and result offsets
src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/AsyncHelpers.CoreCLR.cs Updates managed ContinuationFlags enum and access methods to use new index-based encoding
src/coreclr/vm/object.h Updates GetResultStorage and GetExceptionObjectStorage to use index-based decoding
src/coreclr/vm/interpexec.cpp Updates interpreter suspension/resumption to use index-based flag encoding
src/coreclr/interpreter/compiler.cpp Updates interpreter compiler to emit index-based flag encoding
src/coreclr/jit/async.h Introduces ReturnTypeInfo, ReturnInfo, ContinuationLayoutBuilder, AsyncState, SaveSet; restructures ContinuationLayout and AsyncTransformation
src/coreclr/jit/async.cpp Core implementation: PreservedValueAnalysis, CreateSharedLayout, continuation reuse logic, split save sets
src/coreclr/jit/jitconfigvalues.h Adds JitAsyncReuseContinuations and JitAsyncPreservedValueAnalysisRange config knobs
src/coreclr/jit/jitstd/vector.h Adds const overload of data() to support ContainsLocal const method

}

if (layout.ReturnSize > 0)
// Now allocate all returns
ContinuationContextIndexNumBits = 2,

ResultIndexFirstBit = 7,
ResultIndexBits = 25,
, Call(call)
, CallDefInfo(callDefInfo)
, SuspensionBB(suspensionBB)
, ResumptionBB(resumptionBB)
Comment on lines 5858 to 5862
returnValueDataStartOffset = currentOffset;
encodeIndex(currentOffset, CORINFO_CONTINUATION_RESULT_INDEX_FIRST_BIT, CORINFO_CONTINUATION_RESULT_INDEX_NUM_BITS);
// Handle return value first
if (returnValueVar == -1)
continue;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI runtime-async

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants