Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ruby/Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
ci-queue (0.89.0)
ci-queue (0.90.0)
logger

GEM
Expand Down
5 changes: 5 additions & 0 deletions ruby/lib/ci/queue/configuration.rb
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ def lazy_load_test_helper_paths
@lazy_load_test_helpers.split(',').map(&:strip)
end

def retry?
ENV.fetch("BUILDKITE_RETRY_COUNT", "0").to_i > 0 ||
ENV["SEMAPHORE_PIPELINE_RERUN"] == "true"
end

def queue_init_timeout
@queue_init_timeout || timeout
end
Expand Down
17 changes: 17 additions & 0 deletions ruby/lib/ci/queue/redis/supervisor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,23 @@ def wait_for_workers
yield if block_given?
end

# On retry runs (BUILDKITE_RETRY_COUNT > 0), the main queue is already
# exhausted from the original run. A retry worker may have found unresolved
# failures via the error-reports fallback and be running them via the Retry
# queue — but those tests are NOT in the Redis running set so active_workers?
# returns false and the loop above exits immediately.
#
# Wait up to inactive_workers_timeout for retry workers to clear error-reports.
# This prevents the summary from canceling retry workers before they finish.
if exhausted? && config.retry? && !rescue_connection_errors { build.failed_tests }.empty?
@time_left_with_no_workers = config.inactive_workers_timeout
until rescue_connection_errors { build.failed_tests }.empty? ||
@time_left_with_no_workers <= 0
sleep 1
@time_left_with_no_workers -= 1
end
end

exhausted?
rescue CI::Queue::Redis::LostMaster
false
Expand Down
2 changes: 1 addition & 1 deletion ruby/lib/ci/queue/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module CI
module Queue
VERSION = '0.89.0'
VERSION = '0.90.0'
DEV_SCRIPTS_ROOT = ::File.expand_path('../../../../../redis', __FILE__)
RELEASE_SCRIPTS_ROOT = ::File.expand_path('../redis', __FILE__)
end
Expand Down
12 changes: 10 additions & 2 deletions ruby/lib/minitest/queue/build_status_reporter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,16 @@ def all_workers_died?
end

def aggregates
success = failures.zero? && errors.zero?
failures_count = "#{failures} failures, #{errors} errors,"
# error-reports is authoritative when workers die before flushing per-test stats.
# Floor the displayed count so the summary line is never misleadingly green.
known_error_count = error_reports.size
effective_total = [failures + errors, known_error_count].max
success = effective_total.zero?
failures_count = if failures + errors >= known_error_count
"#{failures} failures, #{errors} errors,"
else
"#{effective_total} failures,"
end

step([
'Ran %d tests, %d assertions,' % [progress, assertions],
Expand Down
61 changes: 59 additions & 2 deletions ruby/test/ci/queue/redis_supervisor_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,50 @@ def test_wait_for_workers
assert_equal true, workers_done
end

def test_wait_for_workers_waits_for_retry_workers_to_clear_failures
# Simulate a rebuild: queue is already exhausted from the original run
poll(worker(1))

# Inject an unresolved failure into error-reports (as if the original run
# recorded a failure but the retry worker hasn't re-run it yet)
entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
@redis.hset("build:42:error-reports", entry, "{}")

sup = supervisor(timeout: 2, inactive_workers_timeout: 2)

with_retry_env do
# Simulate a retry worker clearing the failure after a short delay
thread = Thread.start do
sleep 0.5
@redis.hdel("build:42:error-reports", entry)
end

result = sup.wait_for_workers
thread.join

assert_equal true, result
assert @redis.hkeys("build:42:error-reports").empty?,
"error-reports should be empty after retry worker cleared the failure"
end
end

def test_wait_for_workers_does_not_wait_on_non_retry
# Same setup as above but WITHOUT retry env set
poll(worker(1))

entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
@redis.hset("build:42:error-reports", entry, "{}")

sup = supervisor(timeout: 30, inactive_workers_timeout: 30)

started_at = CI::Queue.time_now
result = sup.wait_for_workers
elapsed = CI::Queue.time_now - started_at

assert_equal true, result
assert_operator elapsed, :<, 2.0, "should return immediately without the retry wait"
end

def test_num_workers
assert_equal 0, @supervisor.workers_count
worker(1)
Expand All @@ -58,14 +102,27 @@ def worker(id)
).populate(SharedQueueAssertions::TEST_LIST)
end

def supervisor(timeout: 30, queue_init_timeout: nil)
def supervisor(timeout: 30, queue_init_timeout: nil, inactive_workers_timeout: nil)
CI::Queue::Redis::Supervisor.new(
@redis_url,
CI::Queue::Configuration.new(
build_id: '42',
timeout: timeout,
queue_init_timeout: queue_init_timeout
queue_init_timeout: queue_init_timeout,
inactive_workers_timeout: inactive_workers_timeout,
),
)
end

def with_retry_env
original = ENV['BUILDKITE_RETRY_COUNT']
ENV['BUILDKITE_RETRY_COUNT'] = '1'
yield
ensure
if original.nil?
ENV.delete('BUILDKITE_RETRY_COUNT')
else
ENV['BUILDKITE_RETRY_COUNT'] = original
end
end
end
68 changes: 68 additions & 0 deletions ruby/test/integration/minitest_redis_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -977,6 +977,74 @@ def test_rebuild_different_worker_with_no_failures_exits_cleanly
assert_match(/All tests were ran already/, out)
end

def test_report_waits_for_retry_worker_to_clear_failures
# Simulates the race condition seen in build 900737:
# - Report step starts (BUILDKITE_RETRY_COUNT=1), sees queue exhausted immediately,
# but error-reports still has a failure from the original run.
# - A retry worker is concurrently running the failed test.
# - Without the fix, report exits immediately and cancels the retry worker.
# - With the fix, report waits up to inactive_workers_timeout for
# retry workers to clear error-reports before reporting.

# First run: worker 1 fails a test
out, err = capture_subprocess_io do
system(
@exe, 'run',
'--queue', @redis_url,
'--seed', 'foobar',
'--build', '1',
'--worker', '1',
'--timeout', '1',
'-Itest',
'test/flaky_test.rb',
chdir: 'test/fixtures/',
)
end
assert_empty filter_deprecation_warnings(err)
assert_match(/1 failures/, normalize(out))

# Start the report concurrently — it should block waiting for retry workers
report_out = nil
report_err = nil
report_thread = Thread.new do
report_out, report_err = capture_subprocess_io do
system(
{ 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual' },
@exe, 'report',
'--queue', @redis_url,
'--build', '1',
'--timeout', '1',
'--inactive-workers-timeout', '10',
chdir: 'test/fixtures/',
)
end
end

# Give the report a moment to start, then run the retry worker which
# re-runs the failed test and clears error-reports
sleep 0.3
out, err = capture_subprocess_io do
system(
{ 'BUILDKITE_RETRY_COUNT' => '1', 'BUILDKITE_RETRY_TYPE' => 'manual', 'FLAKY_TEST_PASS' => '1' },
@exe, 'run',
'--queue', @redis_url,
'--seed', 'foobar',
'--build', '1',
'--worker', '2',
'--timeout', '1',
'-Itest',
'test/flaky_test.rb',
chdir: 'test/fixtures/',
)
end
assert_empty filter_deprecation_warnings(err)
assert_match(/Retrying failed tests/, out)

report_thread.join(15)
assert_empty filter_deprecation_warnings(report_err || '')
assert_match(/0 failures/, normalize(report_out || ''))
end

def test_retry_fails_when_test_run_is_expired
out, err = capture_subprocess_io do
system(
Expand Down
11 changes: 0 additions & 11 deletions ruby/test/minitest/queue/build_status_recorder_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,6 @@
require 'concurrent/set'

module Minitest::Queue
# Lightweight stand-in for a test object in unit tests that don't run real tests.
# Holds test_id and file_path directly so no source_location lookup is needed.
FakeEntry = Struct.new(:id, :queue_entry, :method_name)

def self.fake_entry(method_name)
test_id = "Minitest::Test##{method_name}"
# Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
end

class BuildStatusRecorderTest < Minitest::Test
include ReporterTestHelper

Expand Down
28 changes: 28 additions & 0 deletions ruby/test/minitest/queue/build_status_reporter_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,34 @@ def test_all_workers_died
assert_includes out, "All workers died."
end

def test_aggregate_floors_failures_to_error_reports_when_stats_are_zero
# Simulate a worker that recorded a test failure in error-reports but died
# before flushing its per-worker stats (stats show 0 failures).
queue = worker(1)
queue.poll { |_test| queue.shutdown! }

entry = CI::Queue::QueueEntry.format("FakeTest#test_failure", "/tmp/fake_test.rb")
payload = Minitest::Queue::ErrorReport.new(
test_name: "test_failure",
test_suite: "FakeTest",
test_file: "/tmp/fake_test.rb",
test_line: 1,
error_class: "RuntimeError",
output: "FakeTest#test_failure: failed",
).dump
@redis.hset("build:42:error-reports", entry, payload)

@supervisor.instance_variable_set(:@time_left, 1)
@supervisor.instance_variable_set(:@time_left_with_no_workers, 0)

out, _ = capture_subprocess_io do
@reporter.report
end

assert_includes out, "1 failures,", "should floor failure count to error_reports.size"
refute_includes out, "0 failures, 0 errors,", "should not show misleadingly green zero-failure line"
end

private

def worker(id)
Expand Down
15 changes: 15 additions & 0 deletions ruby/test/support/fake_test_entry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# frozen_string_literal: true
module Minitest
module Queue
# Lightweight stand-in for a test object in unit tests that don't run real tests.
# Holds test_id and file_path directly so no source_location lookup is needed.
FakeEntry = Struct.new(:id, :queue_entry, :method_name)

def self.fake_entry(method_name)
test_id = "Minitest::Test##{method_name}"
# Use the same file_path as ReporterTestHelper#result so entries match across reserve/record calls
file_path = "#{Minitest::Queue.project_root}/test/my_test.rb"
FakeEntry.new(test_id, CI::Queue::QueueEntry.format(test_id, file_path), method_name)
end
end
end
Loading