Implement streaming approach

ChrisBr · ChrisBr · commit cf8e2414f3eb · 2025-12-08T10:28:10.000Z
diff --git a/ruby/lib/ci/queue/configuration.rb b/ruby/lib/ci/queue/configuration.rb
@@ -6,6 +6,7 @@ class Configuration
       attr_accessor :requeue_tolerance, :namespace, :failing_test, :statsd_endpoint
       attr_accessor :max_test_duration, :max_test_duration_percentile, :track_test_duration
       attr_accessor :max_test_failed, :redis_ttl, :warnings_file, :debug_log, :max_missed_heartbeat_seconds
+      attr_accessor :batch_upload, :batch_size
       attr_reader :circuit_breakers
       attr_writer :seed, :build_id
       attr_writer :queue_init_timeout, :report_timeout, :inactive_workers_timeout
@@ -46,7 +47,8 @@ def initialize(
         grind_count: nil, max_duration: nil, failure_file: nil, max_test_duration: nil,
         max_test_duration_percentile: 0.5, track_test_duration: false, max_test_failed: nil,
         queue_init_timeout: nil, redis_ttl: 8 * 60 * 60, report_timeout: nil, inactive_workers_timeout: nil,
-        export_flaky_tests_file: nil, warnings_file: nil, debug_log: nil, max_missed_heartbeat_seconds: nil)
+        export_flaky_tests_file: nil, warnings_file: nil, debug_log: nil, max_missed_heartbeat_seconds: nil,
+        batch_upload: false, batch_size: 100)
         @build_id = build_id
         @circuit_breakers = [CircuitBreaker::Disabled]
         @failure_file = failure_file
@@ -73,6 +75,8 @@ def initialize(
         @warnings_file = warnings_file
         @debug_log = debug_log
         @max_missed_heartbeat_seconds = max_missed_heartbeat_seconds
+        @batch_upload = batch_upload
+        @batch_size = batch_size
       end
 
       def queue_init_timeout
diff --git a/ruby/lib/ci/queue/redis/base.rb b/ruby/lib/ci/queue/redis/base.rb
@@ -155,15 +155,33 @@ def wait_for_master(timeout: 30)
           return true if master?
           return true if queue_initialized?
 
-          (timeout * 10 + 1).to_i.times do
-            if queue_initialized?
-              return true
-            else
-              sleep 0.1
+          if config.batch_upload
+            return wait_for_streaming(timeout: timeout)
+          else
+            (timeout * 10 + 1).to_i.times do
+              if queue_initialized?
+                return true
+              else
+                sleep 0.1
+              end
             end
+
+            raise LostMaster, "The master worker is still `#{master_status}` after #{timeout} seconds waiting."
+          end
+        end
+
+        # Block until the master has begun (or finished) pushing tests.
+        #
+        # Polls master_status up to (timeout * 10 + 1) times, sleeping 0.1s
+        # after each unsuccessful check.
+        #
+        # timeout - number of seconds to wait (keyword, required).
+        #
+        # Returns true as soon as the status is 'streaming', 'ready' or 'finished'.
+        # Raises LostMaster when none of those statuses is seen within the polling budget.
+        def wait_for_streaming(timeout:)
+          (timeout * 10 + 1).to_i.times do
+            status = master_status
+
+            # Ready to work if streaming or complete
+            return true if status == 'streaming' || status == 'ready' || status == 'finished'
+
+            # Master hasn't started yet
+            sleep 0.1
           end
 
-          raise LostMaster, "The master worker is still `#{master_status}` after #{timeout} seconds waiting."
+          raise LostMaster, "The master worker didn't start streaming after #{timeout} seconds waiting."
         end
 
         def workers_count
@@ -173,7 +191,11 @@ def workers_count
+        # Whether the master status says the queue can be consumed.
+        #
+        # With config.batch_upload enabled, 'streaming' counts as initialized in
+        # addition to 'ready' and 'finished'.
+        # `||=` only memoizes a truthy answer, so a false result re-reads
+        # master_status on the next call.
         def queue_initialized?
           @queue_initialized ||= begin
             status = master_status
-            status == 'ready' || status == 'finished'
+            if config.batch_upload
+              status == 'streaming' || status == 'ready' || status == 'finished'
+            else
+              status == 'ready' || status == 'finished'
+            end
           end
         end
 
diff --git a/ruby/lib/ci/queue/redis/worker.rb b/ruby/lib/ci/queue/redis/worker.rb
@@ -26,12 +26,25 @@ def distributed?
         end
 
+        # Index the given tests, shuffle them and push their ids to the queue.
+        #
+        # With config.batch_upload enabled the index starts empty (poll fills
+        # entries in on demand) and the set of loaded source files is reset;
+        # otherwise every test is indexed by id up front.
+        #
+        # tests  - collection of test objects responding to #id.
+        # random - random source passed to Queue.shuffle.
+        #
+        # Returns self.
         def populate(tests, random: Random.new)
-          @index = tests.map { |t| [t.id, t] }.to_h
+          if config.batch_upload
+            @index = {}
+            @source_files_loaded = Set.new
+          else
+            @index = tests.map { |t| [t.id, t] }.to_h
+          end
           tests = Queue.shuffle(tests, random)
           push(tests.map(&:id))
           self
         end
 
+        def populate_from_files(file_paths, random: Random.new)
+          @file_paths = file_paths.sort
+          @index = {}
+          @source_files_loaded = Set.new
+          push_files_in_batches(@file_paths, random)
+          self
+        end
+
         def populated?
           !!defined?(@index)
         end
@@ -54,9 +67,17 @@ def poll
           wait_for_master
           attempt = 0
           until shutdown_required? || config.circuit_breakers.any?(&:open?) || exhausted? || max_test_failed?
-            if test = reserve
+            if test_id = reserve
               attempt = 0
-              yield index.fetch(test)
+
+              # Lazy load test if needed (batch mode)
+              test = if config.batch_upload && !@index.key?(test_id)
+                @index[test_id] = build_index_entry(test_id)
+              else
+                index.fetch(test_id)
+              end
+
+              yield test
             else
               # Adding exponential backoff to avoid hammering Redis
               # we just stay online here in case a test gets retried or times out so we can afford to wait
@@ -153,6 +174,120 @@ def release!
 
         attr_reader :index
 
+        # Elect a master and, on the master only, require the test files slice by
+        # slice and push the tests found after each slice to Redis, so that other
+        # workers can start consuming while the upload is still in progress.
+        #
+        # file_paths - Array of test file paths, required in the given order.
+        # random     - random source handed to Queue.shuffle for every batch.
+        #
+        # Every worker, master or not, finishes by calling register and refreshing
+        # the TTL of the 'workers' key. Connection errors are re-raised on the
+        # master and swallowed on all other workers.
+        def push_files_in_batches(file_paths, random)
+          # Elect master: SET NX only succeeds for the first worker, so reading the
+          # key back tells us whether our own value won.
+          value = key('setup', worker_id)
+          _, status = redis.pipelined do |pipeline|
+            pipeline.set(key('master-status'), value, nx: true)
+            pipeline.get(key('master-status'))
+          end
+
+          if @master = (value == status)
+            puts "Worker elected as leader, loading and pushing tests in batches..."
+            puts
+
+            # Set status to 'streaming' to signal workers can start
+            redis.set(key('master-status'), 'streaming')
+
+            # Group files into batches based on batch_size
+            # Since we're batching by files, calculate files per batch to approximate tests per batch
+            files_per_batch = [config.batch_size / 10, 1].max # Estimate ~10 tests per file
+
+            tests_uploaded = 0
+
+            duration = measure do
+              file_paths.each_slice(files_per_batch).with_index do |file_batch, batch_num|
+                # The retry budget is per batch: a flaky early batch must not use
+                # up the retries of every batch that follows.
+                attempts = 0
+
+                # Load files in this batch
+                batch_tests = []
+                file_batch.each do |file_path|
+                  abs_path = ::File.expand_path(file_path)
+                  require abs_path
+                  @source_files_loaded.add(abs_path)
+                end
+
+                # Extract tests from newly loaded files: anything not yet in the
+                # index was defined by the files required just above.
+                if defined?(Minitest)
+                  Minitest::Test.runnables.each do |runnable|
+                    runnable.runnable_methods.each do |method_name|
+                      test = Minitest::Queue::SingleExample.new(runnable, method_name)
+                      unless @index.key?(test.id)
+                        batch_tests << test
+                        @index[test.id] = test
+                      end
+                    end
+                  end
+                end
+
+                # Shuffle tests in this batch
+                batch_tests = Queue.shuffle(batch_tests, random)
+
+                unless batch_tests.empty?
+                  # Extract metadata
+                  test_ids = []
+                  metadata = {}
+
+                  batch_tests.each do |test|
+                    test_ids << test.id
+                    if test.respond_to?(:source_location) && (location = test.source_location)
+                      metadata[test.id] = location[0] # file path
+                    end
+                  end
+
+                  # Upload batch to Redis
+                  # NOTE(review): a retry re-sends the whole pipeline; if an earlier
+                  # attempt was partially applied the ids could be queued twice — confirm.
+                  with_redis_timeout(5) do
+                    redis.without_reconnect do
+                      redis.pipelined do |pipeline|
+                        pipeline.lpush(key('queue'), test_ids)
+                        pipeline.mapped_hmset(key('test-metadata'), metadata) unless metadata.empty?
+                        pipeline.incr(key('batch-count'))
+                        pipeline.expire(key('queue'), config.redis_ttl)
+                        pipeline.expire(key('test-metadata'), config.redis_ttl)
+                        pipeline.expire(key('batch-count'), config.redis_ttl)
+                      end
+                    end
+                  rescue ::Redis::BaseError => error
+                    if attempts < 3
+                      puts "Retrying batch upload... (#{error})"
+                      attempts += 1
+                      retry
+                    end
+                    raise
+                  end
+
+                  tests_uploaded += test_ids.size
+
+                  # Progress reporting
+                  if (batch_num + 1) % 10 == 0 || batch_num == 0
+                    # The last slice may be short, so cap at the real file count.
+                    files_processed = [(batch_num + 1) * files_per_batch, file_paths.size].min
+                    puts "Uploaded #{tests_uploaded} tests from #{files_processed} files..."
+                  end
+                end
+              end
+            end
+
+            # The running counter already equals the number of tests pushed, so
+            # there is no need to keep every test object alive just to count them.
+            @total = tests_uploaded
+
+            # Mark upload complete
+            redis.multi do |transaction|
+              transaction.set(key('total'), @total)
+              transaction.set(key('master-status'), 'ready')
+              transaction.expire(key('total'), config.redis_ttl)
+              transaction.expire(key('master-status'), config.redis_ttl)
+            end
+
+            puts
+            puts "Finished pushing #{@total} tests to the queue in #{duration.round(2)}s."
+          end
+
+          register
+          redis.expire(key('workers'), config.redis_ttl)
+        rescue *CONNECTION_ERRORS
+          raise if @master
+        end
+
         def reserved_tests
           @reserved_tests ||= Concurrent::Set.new
         end
@@ -161,6 +296,54 @@ def worker_id
           config.worker_id
         end
 
+        def build_index_entry(test_id)
+          # Try to load from metadata
+          file_path = redis.hget(key('test-metadata'), test_id)
+
+          if file_path && !@source_files_loaded.include?(file_path)
+            # Lazy load the test file
+            require_test_file(file_path)
+            @source_files_loaded.add(file_path)
+          end
+
+          # Find the test in loaded runnables
+          find_test_object(test_id)
+        end
+
+        def require_test_file(file_path)
+          # Make path absolute if needed
+          abs_path = if file_path.start_with?('/')
+            file_path
+          else
+            ::File.expand_path(file_path)
+          end
+
+          # Require the file
+          require abs_path
+        rescue LoadError => e
+          # Log warning but continue
+          warn "Warning: Could not load test file #{file_path}: #{e.message}"
+        end
+
+        def find_test_object(test_id)
+          # For Minitest
+          if defined?(Minitest)
+            Minitest::Test.runnables.each do |runnable|
+              runnable.runnable_methods.each do |method_name|
+                candidate_id = "#{runnable}##{method_name}"
+                if candidate_id == test_id
+                  return Minitest::Queue::SingleExample.new(runnable, method_name)
+                end
+              end
+            end
+          end
+
+          # Fallback: create a test object that will report an error
+          warn "Warning: Test #{test_id} not found after loading file. Ensure all dependencies are explicitly required in test_helper.rb"
+          # Return nil and let index.fetch handle the KeyError
+          nil
+        end
+
         def raise_on_mismatching_test(test)
           unless reserved_tests.delete?(test)
             raise ReservationError, "Acknowledged #{test.inspect} but only #{reserved_tests.map(&:inspect).join(", ")} reserved"
diff --git a/ruby/lib/minitest/queue/runner.rb b/ruby/lib/minitest/queue/runner.rb
@@ -97,12 +97,10 @@ def run_command
             if remaining <= running
               puts green("Queue almost empty, exiting early...")
             else
-              load_tests
-              populate_queue
+              load_tests_and_populate
             end
           else
-            load_tests
-            populate_queue
+            load_tests_and_populate
           end
         end
 
@@ -357,6 +355,19 @@ def reset_counters
         queue.build.reset_worker_error
       end
 
+      def load_tests_and_populate
+        if queue_config.batch_upload && queue.respond_to?(:populate_from_files)
+          # In batch mode, pass file paths directly to the queue
+          # The master will load files in batches as it uploads
+          # Workers will load files lazily as needed
+          Minitest.queue.populate_from_files(argv, random: ordering_seed)
+        else
+          # Traditional mode: load all tests upfront
+          load_tests
+          populate_queue
+        end
+      end
+
       def populate_queue
         Minitest.queue.populate(Minitest.loaded_tests, random: ordering_seed)
       end
@@ -636,6 +647,32 @@ def parser
             queue_config.debug_log = path
           end
 
+          help = <<~EOS
+            Enable batch/streaming upload mode. In this mode, the master worker will load test files
+            and push tests to the queue in batches, allowing other workers to start processing tests
+            immediately without waiting for all tests to be uploaded. This significantly reduces
+            startup time for large test suites.
+
+            IMPORTANT: When using this mode, test files are loaded lazily on-demand on workers.
+            You MUST explicitly require all dependencies (models, helpers, etc.) in your test_helper.rb.
+            Autoloading may not work as expected since not all test files are loaded upfront.
+          EOS
+          opts.separator ""
+          opts.on('--batch-upload', help) do
+            queue_config.batch_upload = true
+          end
+
+          help = <<~EOS
+            Specify the number of tests to upload in each batch when --batch-upload is enabled.
+            Smaller batches allow workers to start sooner but may increase overhead.
+            Larger batches reduce overhead but increase initial wait time.
+            Defaults to 100.
+          EOS
+          opts.separator ""
+          opts.on('--batch-size SIZE', Integer, help) do |size|
+            queue_config.batch_size = size
+          end
+
           opts.separator ""
           opts.separator "    retry: Replays a previous run in the same order."
 
diff --git a/ruby/lib/rspec/queue.rb b/ruby/lib/rspec/queue.rb
diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb