diff --git a/ruby/Gemfile.lock b/ruby/Gemfile.lock index c916e87e..6e074361 100644 --- a/ruby/Gemfile.lock +++ b/ruby/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - ci-queue (0.89.0) + ci-queue (0.90.0) logger GEM diff --git a/ruby/lib/ci/queue/configuration.rb b/ruby/lib/ci/queue/configuration.rb index e785673c..73341a6b 100644 --- a/ruby/lib/ci/queue/configuration.rb +++ b/ruby/lib/ci/queue/configuration.rb @@ -99,6 +99,11 @@ def lazy_load_test_helper_paths @lazy_load_test_helpers.split(',').map(&:strip) end + def retry? + ENV.fetch("BUILDKITE_RETRY_COUNT", "0").to_i > 0 || + ENV["SEMAPHORE_PIPELINE_RERUN"] == "true" + end + def queue_init_timeout @queue_init_timeout || timeout end diff --git a/ruby/lib/ci/queue/redis/supervisor.rb b/ruby/lib/ci/queue/redis/supervisor.rb index d16de443..a5ea8b76 100644 --- a/ruby/lib/ci/queue/redis/supervisor.rb +++ b/ruby/lib/ci/queue/redis/supervisor.rb @@ -39,6 +39,23 @@ def wait_for_workers yield if block_given? end + # On retry runs (BUILDKITE_RETRY_COUNT > 0), the main queue is already + # exhausted from the original run. A retry worker may have found unresolved + # failures via the error-reports fallback and be running them via the Retry + # queue — but those tests are NOT in the Redis running set so active_workers? + # returns false and the loop above exits immediately. + # + # Wait up to inactive_workers_timeout for retry workers to clear error-reports. + # This prevents the summary from canceling retry workers before they finish. + if exhausted? && config.retry? && !rescue_connection_errors { build.failed_tests }.empty? + @time_left_with_no_workers = config.inactive_workers_timeout + until rescue_connection_errors { build.failed_tests }.empty? || + @time_left_with_no_workers <= 0 + sleep 1 + @time_left_with_no_workers -= 1 + end + end + exhausted? rescue CI::Queue::Redis::LostMaster false diff --git a/ruby/lib/ci/queue/version.rb b/ruby/lib/ci/queue/version.rb index e71afe8c..9aea925b 100644 --- a/ruby/lib/ci/queue/version.rb +++ b/ruby/lib/ci/queue/version.rb @@ -2,7 +2,7 @@ module CI module Queue - VERSION = '0.89.0' + VERSION = '0.90.0' DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../redis', __FILE__) RELEASE_SCRIPTS_ROOT = ::File.expand_path('../redis', __FILE__) end diff --git a/ruby/lib/minitest/queue/build_status_reporter.rb b/ruby/lib/minitest/queue/build_status_reporter.rb index 43b5c3f7..92731253 100644 --- a/ruby/lib/minitest/queue/build_status_reporter.rb +++ b/ruby/lib/minitest/queue/build_status_reporter.rb @@ -267,8 +267,16 @@ def all_workers_died? end def aggregates - success = failures.zero? && errors.zero? - failures_count = "#{failures} failures, #{errors} errors," + # error-reports is authoritative when workers die before flushing per-test stats. + # Floor the displayed count so the summary line is never misleadingly green. + known_error_count = error_reports.size + effective_total = [failures + errors, known_error_count].max + success = effective_total.zero? + failures_count = if failures + errors >= known_error_count + "#{failures} failures, #{errors} errors," + else + "#{effective_total} failures," + end step([ 'Ran %d tests, %d assertions,' % [progress, assertions], diff --git a/ruby/test/ci/queue/redis_supervisor_test.rb b/ruby/test/ci/queue/redis_supervisor_test.rb index f46baa31..9f3f943a 100644 --- a/ruby/test/ci/queue/redis_supervisor_test.rb +++ b/ruby/test/ci/queue/redis_supervisor_test.rb @@ -39,6 +39,50 @@ def test_wait_for_workers assert_equal true, workers_done end + def test_wait_for_workers_waits_for_retry_workers_to_clear_failures + # Simulate a rebuild: queue is already exhausted from the original run + poll(worker(1)) + + # Inject an unresolved failure into error-reports (as if the original run + # recorded a failure but the retry worker hasn't re-run it yet) + entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb") + @redis.hset("build:42:error-reports", entry, "{}") + + sup = supervisor(timeout: 2, inactive_workers_timeout: 2) + + with_retry_env do + # Simulate a retry worker clearing the failure after a short delay + thread = Thread.start do + sleep 0.5 + @redis.hdel("build:42:error-reports", entry) + end + + result = sup.wait_for_workers + thread.join + + assert_equal true, result + assert @redis.hkeys("build:42:error-reports").empty?, + "error-reports should be empty after retry worker cleared the failure" + end + end + + def test_wait_for_workers_does_not_wait_on_non_retry + # Same setup as above but WITHOUT retry env set + poll(worker(1)) + + entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb") + @redis.hset("build:42:error-reports", entry, "{}") + + sup = supervisor(timeout: 30, inactive_workers_timeout: 30) + + started_at = CI::Queue.time_now + result = sup.wait_for_workers + elapsed = CI::Queue.time_now - started_at + + assert_equal true, result + assert_operator elapsed, :<, 2.0, "should return immediately without the retry wait" + end + def test_num_workers assert_equal 0, @supervisor.workers_count worker(1) @@ -58,14 +102,27 @@ def worker(id) ).populate(SharedQueueAssertions::TEST_LIST) end - def supervisor(timeout: 30, queue_init_timeout: nil) + def supervisor(timeout: 30, queue_init_timeout: nil, inactive_workers_timeout: nil) CI::Queue::Redis::Supervisor.new( @redis_url, CI::Queue::Configuration.new( build_id: '42', timeout: timeout, - queue_init_timeout: queue_init_timeout + queue_init_timeout: queue_init_timeout, + inactive_workers_timeout: inactive_workers_timeout, ), ) end + + def with_retry_env + original = ENV['BUILDKITE_RETRY_COUNT'] + ENV['BUILDKITE_RETRY_COUNT'] = '1' + yield + ensure + if original.nil? + ENV.delete('BUILDKITE_RETRY_COUNT') + else + ENV['BUILDKITE_RETRY_COUNT'] = original + end + end end diff --git a/ruby/test/integration/minitest_redis_test.rb b/ruby/test/integration/minitest_redis_test.rb index 6d95ea93..ea56442d 100644 --- a/ruby/test/integration/minitest_redis_test.rb +++ b/ruby/test/integration/minitest_redis_test.rb @@ -977,6 +977,74 @@ def test_rebuild_different_worker_with_no_failures_exits_cleanly assert_match(/All tests were ran already/, out) end + def test_report_waits_for_retry_worker_to_clear_failures + # Simulates the race condition seen in build 900737: + # - Report step starts (BUILDKITE_RETRY_COUNT=1), sees queue exhausted immediately, + # but error-reports still has a failure from the original run. + # - A retry worker is concurrently running the failed test. + # - Without the fix, report exits immediately and cancels the retry worker. + # - With the fix, report waits up to inactive_workers_timeout for + # retry workers to clear error-reports before reporting. + + # First run: worker 1 fails a test + out, err = capture_subprocess_io do + system( + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '1', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + assert_match(/1 failures/, normalize(out)) + + # Start the report concurrently — it should block waiting for retry workers + report_out = nil + report_err = nil + report_thread = Thread.new do + report_out, report_err = capture_subprocess_io do + system( + { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual' }, + @exe, 'report', + '--queue', @redis_url, + '--build', '1', + '--timeout', '1', + '--inactive-workers-timeout', '10', + chdir: 'test/fixtures/', + ) + end + end + + # Give the report a moment to start, then run the retry worker which + # re-runs the failed test and clears error-reports + sleep 0.3 + out, err = capture_subprocess_io do + system( + { 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual', 'FLAKY_TEST_PASS' => '1' }, + @exe, 'run', + '--queue', @redis_url, + '--seed', 'foobar', + '--build', '1', + '--worker', '2', + '--timeout', '1', + '-Itest', + 'test/flaky_test.rb', + chdir: 'test/fixtures/', + ) + end + assert_empty filter_deprecation_warnings(err) + assert_match(/Retrying failed tests/, out) + + report_thread.join(15) + assert_empty filter_deprecation_warnings(report_err || '') + assert_match(/0 failures/, normalize(report_out || '')) + end + def test_retry_fails_when_test_run_is_expired out, err = capture_subprocess_io do system( diff --git a/ruby/test/minitest/queue/build_status_recorder_test.rb b/ruby/test/minitest/queue/build_status_recorder_test.rb index dbfa9d1a..386e6586 100644 --- a/ruby/test/minitest/queue/build_status_recorder_test.rb +++ b/ruby/test/minitest/queue/build_status_recorder_test.rb @@ -3,17 +3,6 @@ require 'concurrent/set' module Minitest::Queue - # Lightweight stand-in for a test object in unit tests that don't run real tests. - # Holds test_id and file_path directly so no source_location lookup is needed. - FakeEntry = Struct.new(:id, :queue_entry, :method_name) - - def self.fake_entry(method_name) - test_id = "Minitest::Test##{method_name}" - # Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls - file_path = "#{Minitest::Queue.project_root}/test/my_test.rb" - FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name) - end - class BuildStatusRecorderTest < Minitest::Test include ReporterTestHelper diff --git a/ruby/test/minitest/queue/build_status_reporter_test.rb b/ruby/test/minitest/queue/build_status_reporter_test.rb index b5f14a32..5eb312b6 100644 --- a/ruby/test/minitest/queue/build_status_reporter_test.rb +++ b/ruby/test/minitest/queue/build_status_reporter_test.rb @@ -52,6 +52,34 @@ def test_all_workers_died assert_includes out, "All workers died." end + def test_aggregate_floors_failures_to_error_reports_when_stats_are_zero + # Simulate a worker that recorded a test failure in error-reports but died + # before flushing its per-worker stats (stats show 0 failures). + queue = worker(1) + queue.poll { |_test| queue.shutdown! } + + entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb") + payload = Minitest::Queue::ErrorReport.new( + test_name: "test_failure", + test_suite: "FakeTest", + test_file: "/tmp/fake_test.rb", + test_line: 1, + error_class: "RuntimeError", + output: "FakeTest#test_failure: failed", + ).dump + @redis.hset("build:42:error-reports", entry, payload) + + @supervisor.instance_variable_set(:@time_left, 1) + @supervisor.instance_variable_set(:@time_left_with_no_workers, 0) + + out, _ = capture_subprocess_io do + @reporter.report + end + + assert_includes out, "1 failures,", "should floor failure count to error_reports.size" + refute_includes out, "0 failures, 0 errors,", "should not show misleadingly green zero-failure line" + end + private def worker(id) diff --git a/ruby/test/support/fake_test_entry.rb b/ruby/test/support/fake_test_entry.rb new file mode 100644 index 00000000..262e916d --- /dev/null +++ b/ruby/test/support/fake_test_entry.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true +module Minitest + module Queue + # Lightweight stand-in for a test object in unit tests that don't run real tests. + # Holds test_id and file_path directly so no source_location lookup is needed. + FakeEntry = Struct.new(:id, :queue_entry, :method_name) + + def self.fake_entry(method_name) + test_id = "Minitest::Test##{method_name}" + # Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls + file_path = "#{Minitest::Queue.project_root}/test/my_test.rb" + FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name) + end + end +end