From c8f1836c27cb1cc5a00aa521d1dbeca41b4fed1b Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Thu, 11 Jun 2026 09:06:52 +0900 Subject: [PATCH] fix(compactor): de-flake sharded-ownership compactor tests TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning and its partition twin seed one shared bucket.ClientMock with 100 tenants (~3,208/~3,707 registered testify expectations). Every bucket operation from both compactors and both blocks cleaners holds the mock's single mutex through an O(expectations) reflective scan (~38% of CPU in two independent profiles), so the first compaction cycle alone takes ~20s under -race on an idle machine and 6-7x longer on starved CI runners. Each per-tenant meta sync additionally runs under a deadline equal to -compactor.compaction-interval (5s in these tests, pkg/compactor/compactor.go:1085), so starved syncs turn whole runs into failures and CompactionRunsCompleted can stay pinned at 0 longer than ANY poll budget: in CI run 26632776611 the partition test failed its 60s poll AND the non-partition sibling burned its full 120s poll, in both attempts. The ring itself was ACTIVE and stable before the poll started (starting() waits for both), correcting the issue's original ring-convergence framing. Fix (test-only): - Cut numUsers from 100 to 20 in both tests (~25x less quadratic mock-matching work; the per-user ownership assertions are tenant-count independent). - Align the partition test's run-completion poll budget with its non-partition sibling (60s -> 120s) as a seatbelt. - Set WaitActiveInstanceTimeout=30s in both tests (same hardening as #7503): prepareConfig's 5s ACTIVE ceiling is within reach of the observed CI starvation envelope. The shuffle-sharding twins are deliberately untouched: their live arm64 failure in the same run is an ownership-exclusivity violation (compactor_test.go:1318), a different failure class that needs its own root-causing. Before/after on Apple Silicon, -race: 21.00s/24.01s -> 3.57s/3.55s. Fixes #7607 Co-Authored-By: Claude Fable 5 Signed-off-by: Sandy Chen --- pkg/compactor/compactor_paritioning_test.go | 5 +++-- pkg/compactor/compactor_test.go | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/compactor/compactor_paritioning_test.go b/pkg/compactor/compactor_paritioning_test.go index 0002a131b5..4398a1da36 100644 --- a/pkg/compactor/compactor_paritioning_test.go +++ b/pkg/compactor/compactor_paritioning_test.go @@ -1142,7 +1142,7 @@ func TestPartitionCompactor_ShouldCompactAllUsersOnShardingEnabledButOnlyOneInst func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning(t *testing.T) { - numUsers := 100 + numUsers := 20 // Setup user IDs userIDs := make([]string, 0, numUsers) @@ -1192,6 +1192,7 @@ func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEn cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) cfg.ShardingRing.WaitStabilityMinDuration = time.Second cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.WaitActiveInstanceTimeout = 30 * time.Second // Give the ring ACTIVE wait headroom on slow CI (#7503). cfg.ShardingRing.KVStore.Mock = kvstore c, _, tsdbPlanner, l, _ := prepareForPartitioning(t, cfg, bucketClient, nil, nil) @@ -1214,7 +1215,7 @@ func TestPartitionCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEn // Wait until a run has been completed on each compactor for _, c := range compactors { - cortex_testutil.Poll(t, 60*time.Second, true, func() any { + cortex_testutil.Poll(t, 120*time.Second, true, func() any { return prom_testutil.ToFloat64(c.CompactionRunsCompleted) >= 1 }) } diff --git a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index d1d4460383..15b6c1adf1 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -1091,7 +1091,7 @@ func TestCompactor_ShouldCompactAllUsersOnShardingEnabledButOnlyOneInstanceRunni func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndMultipleInstancesRunning(t *testing.T) { t.Parallel() - numUsers := 100 + numUsers := 20 // Setup user IDs userIDs := make([]string, 0, numUsers) @@ -1139,6 +1139,7 @@ func TestCompactor_ShouldCompactOnlyUsersOwnedByTheInstanceOnShardingEnabledAndM cfg.ShardingRing.InstanceAddr = fmt.Sprintf("127.0.0.%d", i) cfg.ShardingRing.WaitStabilityMinDuration = time.Second cfg.ShardingRing.WaitStabilityMaxDuration = 5 * time.Second + cfg.ShardingRing.WaitActiveInstanceTimeout = 30 * time.Second // Give the ring ACTIVE wait headroom on slow CI (#7503). cfg.ShardingRing.KVStore.Mock = kvstore c, _, tsdbPlanner, l, _ := prepare(t, cfg, bucketClient, nil)