From 5b0e352e00e8cc7965de3ef341ac16558e329378 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Fri, 26 Jun 2026 17:42:26 -0400 Subject: [PATCH 1/8] Retry TestCustomAuthorizerApp deployment on transient IAM role propagation failure The TestCustomAuthorizerApp integration test stack deploys many Lambda functions that reference IAM roles created in the same stack. CloudFormation occasionally calls Lambda CreateFunction before the role's trust policy has propagated through IAM, producing "The role defined for the function cannot be assumed by Lambda" and rolling the whole stack back, which fails all 20 tests in the project. Wrap the deploy in a retry loop (3 attempts). Between attempts, delete the rolled-back stack (a ROLLBACK_COMPLETE stack cannot be re-created) and pause briefly to let IAM settle. Surface CloudFormation failed-resource events on each failure for easier debugging. --- .../DeploymentScript.ps1 | 48 +++++++++++++++++-- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 index 4fe9313c0..3a20e55de 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 @@ -59,11 +59,51 @@ try throw "Failed to create the following bucket: $identifier" } dotnet restore - Write-Host "Creating CloudFormation Stack $identifier, Architecture $arch" - dotnet lambda deploy-serverless - if (!$?) + + # Deploy with retries. The stack contains many Lambda functions that each reference + # an IAM role created in the same stack. CloudFormation occasionally calls Lambda + # CreateFunction before the role's trust policy has propagated through IAM, producing + # "The role defined for the function cannot be assumed by Lambda" and rolling the whole + # stack back. 
This is a transient eventual-consistency race, so retry the deployment. + $maxAttempts = 3 + $deploySucceeded = $false + for ($attempt = 1; $attempt -le $maxAttempts; $attempt++) + { + Write-Host "Creating CloudFormation Stack $identifier, Architecture $arch (attempt $attempt of $maxAttempts)" + dotnet lambda deploy-serverless + if ($?) + { + $deploySucceeded = $true + break + } + + Write-Host "Deployment attempt $attempt failed. Fetching CloudFormation stack events for debugging..." + try { + $events = aws cloudformation describe-stack-events --stack-name $identifier --query "StackEvents[?ResourceStatus=='CREATE_FAILED' || ResourceStatus=='UPDATE_FAILED' || ResourceStatus=='DELETE_FAILED']" --output json 2>&1 + if ($events) { + Write-Host "CloudFormation failed events:" + Write-Host $events + } + } + catch { + Write-Host "Could not fetch CloudFormation events: $_" + } + + if ($attempt -lt $maxAttempts) + { + # A failed create leaves the stack in ROLLBACK_COMPLETE, which cannot be updated + # or re-created. Delete it (and wait for the delete to finish) before retrying. + Write-Host "Deleting rolled-back stack $identifier before retrying..." + aws cloudformation delete-stack --stack-name $identifier + aws cloudformation wait stack-delete-complete --stack-name $identifier + # Brief pause to give IAM additional time to settle before the next attempt. + Start-Sleep -Seconds 15 + } + } + + if (!$deploySucceeded) { - throw "Failed to create the following CloudFormation stack: $identifier" + throw "Failed to create the following CloudFormation stack after $maxAttempts attempts: $identifier" } } finally From 5cb59fef5479898394d5902def40d45ce7dbdf38 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 10:13:33 -0400 Subject: [PATCH 2/8] Speed up integration tests via parallelism and faster stack queries The integration-test phase ran everything serially and dominated CI wall-clock. 
Four independent changes cut that down: - run-integ-tests now runs each *.IntegrationTests.csproj concurrently (buildtools/run-integ-tests-parallel.ps1). Each project deploys its own isolated CloudFormation stack, so they share no state. Replaces the serial MSBuild item-batched Exec. - LambdaHelper.FilterByCloudFormationStackAsync now lists the stack's resources via CloudFormation ListStackResources instead of scanning every Lambda in the account and reading each function's tags one at a time. O(stack size) instead of O(account size), and no longer throttles in a shared test account. - TestServerlessApp and TestCustomAuthorizerApp integ tests share their single deployed-stack fixture across the assembly via IAssemblyFixture (the Xunit.Extensions.AssemblyFixture package) instead of one serial [Collection]. The stack still deploys once, but the test classes now run in parallel. - The durable execution integ suite (45 independent tests, each deploying its own uniquely-named function) no longer forces maxParallelThreads=1; its build helper already guards concurrent publishes with a per-directory file lock. Verified end-to-end against AWS: TestCustomAuthorizerApp deploys its stack once and all 20 tests pass under the parallel AssemblyFixture setup. 
--- .../xunit.runner.json | 4 +- .../IntegrationTests.Helpers/LambdaHelper.cs | 35 +++++++---- .../HealthCheckTests.cs | 4 +- .../HttpApiV1Tests.cs | 4 +- .../HttpApiV2Tests.cs | 4 +- .../IntegrationTestContextFixture.cs | 5 +- ...IntegrationTestContextFixtureCollection.cs | 15 ++--- .../NonStringAuthorizerTests.cs | 4 +- .../RestApiTests.cs | 4 +- .../SimpleHttpApiAuthorizerTests.cs | 4 +- .../SimpleRestApiAuthorizerTests.cs | 4 +- ...ustomAuthorizerApp.IntegrationTests.csproj | 3 + .../ALBIntegrationTestContextFixture.cs | 5 +- .../ComplexCalculator.cs | 4 +- .../CustomResponse.cs | 4 +- .../DynamoDBEventSourceMapping.cs | 4 +- .../FunctionUrlExample.cs | 4 +- .../Greeter.cs | 4 +- .../IntegrationTestContextFixture.cs | 5 +- ...IntegrationTestContextFixtureCollection.cs | 13 ++-- .../S3EventNotification.cs | 4 +- .../SNSEventSubscription.cs | 4 +- .../SQSEventSourceMapping.cs | 4 +- .../ScheduleEventRule.cs | 4 +- .../SimpleCalculator.cs | 4 +- .../TestServerlessApp.IntegrationTests.csproj | 3 + buildtools/build.proj | 11 ++-- buildtools/run-integ-tests-parallel.ps1 | 63 +++++++++++++++++++ 28 files changed, 150 insertions(+), 80 deletions(-) create mode 100644 buildtools/run-integ-tests-parallel.ps1 diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json index b6de9b357..73179ea81 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/xunit.runner.json @@ -1,6 +1,6 @@ { "$schema": "https://xunit.net/schema/current/xunit.runner.schema.json", - "parallelizeTestCollections": false, + "parallelizeTestCollections": true, "parallelizeAssembly": false, - "maxParallelThreads": 1 + "maxParallelThreads": 4 } diff --git a/Libraries/test/IntegrationTests.Helpers/LambdaHelper.cs b/Libraries/test/IntegrationTests.Helpers/LambdaHelper.cs index 
2a6d70f6c..8b6d62fd0 100644 --- a/Libraries/test/IntegrationTests.Helpers/LambdaHelper.cs +++ b/Libraries/test/IntegrationTests.Helpers/LambdaHelper.cs @@ -2,7 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; +using Amazon.CloudFormation; +using Amazon.CloudFormation.Model; using Amazon.Lambda; using Amazon.Lambda.Model; @@ -10,31 +13,39 @@ namespace IntegrationTests.Helpers { public class LambdaHelper { + // Resource type that SAM AWS::Serverless::Function resources are transformed into in the deployed stack. + private const string LambdaFunctionResourceType = "AWS::Lambda::Function"; + private readonly IAmazonLambda _lambdaClient; + private readonly IAmazonCloudFormation _cloudFormationClient; - public LambdaHelper(IAmazonLambda lambdaClient) + public LambdaHelper(IAmazonLambda lambdaClient, IAmazonCloudFormation cloudFormationClient) { _lambdaClient = lambdaClient; + _cloudFormationClient = cloudFormationClient; } + /// + /// Returns the Lambda functions belonging to a CloudFormation stack by listing the stack's + /// resources directly. This is O(stack size) and independent of how many functions exist in + /// the account, unlike scanning every function and reading its tags one at a time, which is + /// slow and prone to throttling in a shared test account. 
+ /// public async Task<List<LambdaFunction>> FilterByCloudFormationStackAsync(string stackName) { - const string stackNameKey = "aws:cloudformation:stack-name"; - const string logicalIdKey = "aws:cloudformation:logical-id"; var lambdaFunctions = new List<LambdaFunction>(); - var paginator = _lambdaClient.Paginators.ListFunctions(new ListFunctionsRequest()); + var paginator = _cloudFormationClient.Paginators.ListStackResources( + new ListStackResourcesRequest { StackName = stackName }); - await foreach (var function in paginator.Functions) + await foreach (var resource in paginator.StackResourceSummaries) { - var tags = (await _lambdaClient.ListTagsAsync(new ListTagsRequest { Resource = function.FunctionArn })).Tags; - if (tags.ContainsKey(stackNameKey) && string.Equals(tags[stackNameKey], stackName)) + if (string.Equals(resource.ResourceType, LambdaFunctionResourceType)) { - var lambdaFunction = new LambdaFunction + lambdaFunctions.Add(new LambdaFunction { - LogicalId = tags[logicalIdKey], - Name = function.FunctionName - }; - lambdaFunctions.Add(lambdaFunction); + LogicalId = resource.LogicalResourceId, + Name = resource.PhysicalResourceId + }); } } diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HealthCheckTests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HealthCheckTests.cs index 2360d7305..2e6891a6c 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HealthCheckTests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HealthCheckTests.cs @@ -1,13 +1,13 @@ using System.Net; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; /// /// Tests for the health check endpoint which does not require authorization.
/// -[Collection("Integration Tests")] -public class HealthCheckTests +public class HealthCheckTests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV1Tests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV1Tests.cs index 990f06c28..51eb3a0a0 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV1Tests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV1Tests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -12,8 +13,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// These tests verify that the source-generated Lambda handler correctly extracts /// values from the authorizer context using [FromCustomAuthorizer] attributes. /// -[Collection("Integration Tests")] -public class HttpApiV1Tests +public class HttpApiV1Tests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV2Tests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV2Tests.cs index c63231f35..6bf1df605 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV2Tests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/HttpApiV2Tests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -12,8 +13,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// These tests verify that the source-generated Lambda handler correctly extracts /// values from the authorizer context using [FromCustomAuthorizer] attributes.
/// -[Collection("Integration Tests")] -public class HttpApiV2Tests +public class HttpApiV2Tests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixture.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixture.cs index 06cba6a17..71c91b782 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixture.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixture.cs @@ -39,9 +39,10 @@ public class IntegrationTestContextFixture : IAsyncLifetime public IntegrationTestContextFixture() { - _cloudFormationHelper = new CloudFormationHelper(new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2)); + var cloudFormationClient = new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2); + _cloudFormationHelper = new CloudFormationHelper(cloudFormationClient); _s3Helper = new S3Helper(new AmazonS3Client(Amazon.RegionEndpoint.USWest2)); - LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2)); + LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2), cloudFormationClient); CloudWatchHelper = new CloudWatchHelper(new AmazonCloudWatchLogsClient(Amazon.RegionEndpoint.USWest2)); HttpClient = new HttpClient(); } diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs index dd673e7b9..db28d5278 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs @@ -1,11 +1,4 @@ -using Xunit; - -namespace TestCustomAuthorizerApp.IntegrationTests; - -[CollectionDefinition("Integration Tests",
DisableParallelization = true)] -public class IntegrationTestContextFixtureCollection : ICollectionFixture<IntegrationTestContextFixture> -{ - // This class has no code, and is never created. Its purpose is simply - // to be the place to apply [CollectionDefinition] and all the - // ICollectionFixture<> interfaces. -} +// Registers the AssemblyFixture test framework so test classes can share a single +// IntegrationTestContextFixture (one deployed stack) via IAssemblyFixture while still +// running in parallel. Without this attribute IAssemblyFixture is silently ignored. +[assembly: Xunit.TestFramework("Xunit.Extensions.AssemblyFixture.AssemblyFixtureFramework", "Xunit.Extensions.AssemblyFixture")] diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/NonStringAuthorizerTests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/NonStringAuthorizerTests.cs index 0d25145dd..c61670586 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/NonStringAuthorizerTests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/NonStringAuthorizerTests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -13,8 +14,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// These tests exercise the type conversion logic in the .tt template's generated code /// using Convert.ChangeType() to convert authorizer context values to the parameter types.
/// -[Collection("Integration Tests")] -public class NonStringAuthorizerTests +public class NonStringAuthorizerTests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/RestApiTests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/RestApiTests.cs index a226762d7..1bc01991c 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/RestApiTests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/RestApiTests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -12,8 +13,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// These tests verify that the source-generated Lambda handler correctly extracts /// values from the authorizer context using [FromCustomAuthorizer] attributes. /// -[Collection("Integration Tests")] -public class RestApiTests +public class RestApiTests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleHttpApiAuthorizerTests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleHttpApiAuthorizerTests.cs index 468caa3b4..34764731e 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleHttpApiAuthorizerTests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleHttpApiAuthorizerTests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -13,8 +14,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// The authorizer under test is /// which returns IAuthorizerResult (AuthorizerResults.Allow()/Deny()) instead of raw API Gateway types.
/// -[Collection("Integration Tests")] -public class SimpleHttpApiAuthorizerTests +public class SimpleHttpApiAuthorizerTests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleRestApiAuthorizerTests.cs b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleRestApiAuthorizerTests.cs index 3d64ba6c2..8cacdcd7f 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleRestApiAuthorizerTests.cs +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/SimpleRestApiAuthorizerTests.cs @@ -2,6 +2,7 @@ using System.Net.Http.Headers; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestCustomAuthorizerApp.IntegrationTests; @@ -14,8 +15,7 @@ namespace TestCustomAuthorizerApp.IntegrationTests; /// which returns IAuthorizerResult (AuthorizerResults.Allow()/Deny()) instead of raw API Gateway types. /// The generated handler serializes this to an IAM policy document with the correct MethodArn.
/// -[Collection("Integration Tests")] -public class SimpleRestApiAuthorizerTests +public class SimpleRestApiAuthorizerTests : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/TestCustomAuthorizerApp.IntegrationTests.csproj b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/TestCustomAuthorizerApp.IntegrationTests.csproj index bc3018c9c..98fe17c46 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/TestCustomAuthorizerApp.IntegrationTests.csproj +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/TestCustomAuthorizerApp.IntegrationTests.csproj @@ -13,6 +13,9 @@ + + all runtime; build; native; contentfiles; analyzers diff --git a/Libraries/test/TestServerlessApp.ALB.IntegrationTests/ALBIntegrationTestContextFixture.cs b/Libraries/test/TestServerlessApp.ALB.IntegrationTests/ALBIntegrationTestContextFixture.cs index 40c70f7be..84d6d17b3 100644 --- a/Libraries/test/TestServerlessApp.ALB.IntegrationTests/ALBIntegrationTestContextFixture.cs +++ b/Libraries/test/TestServerlessApp.ALB.IntegrationTests/ALBIntegrationTestContextFixture.cs @@ -35,9 +35,10 @@ public class ALBIntegrationTestContextFixture : IAsyncLifetime public ALBIntegrationTestContextFixture() { - _cloudFormationHelper = new CloudFormationHelper(new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2)); + var cloudFormationClient = new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2); + _cloudFormationHelper = new CloudFormationHelper(cloudFormationClient); _s3Helper = new S3Helper(new AmazonS3Client(Amazon.RegionEndpoint.USWest2)); - LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2)); + LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2), cloudFormationClient); ELBv2Client = new AmazonElasticLoadBalancingV2Client(Amazon.RegionEndpoint.USWest2); HttpClient = new HttpClient(); } diff --git
a/Libraries/test/TestServerlessApp.IntegrationTests/ComplexCalculator.cs b/Libraries/test/TestServerlessApp.IntegrationTests/ComplexCalculator.cs index 98075930f..cc1a66091 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/ComplexCalculator.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/ComplexCalculator.cs @@ -4,11 +4,11 @@ using Newtonsoft.Json; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class ComplexCalculator + public class ComplexCalculator : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/CustomResponse.cs b/Libraries/test/TestServerlessApp.IntegrationTests/CustomResponse.cs index 433adac4d..15502af64 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/CustomResponse.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/CustomResponse.cs @@ -6,11 +6,11 @@ using System.Text; using System.Threading.Tasks; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class CustomResponse + public class CustomResponse : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/DynamoDBEventSourceMapping.cs b/Libraries/test/TestServerlessApp.IntegrationTests/DynamoDBEventSourceMapping.cs index cef6c76e4..dcfd5d476 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/DynamoDBEventSourceMapping.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/DynamoDBEventSourceMapping.cs @@ -4,11 +4,11 @@ using System.Linq; using System.Threading.Tasks; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class DynamoDBEventSourceMapping + public
class DynamoDBEventSourceMapping : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/FunctionUrlExample.cs b/Libraries/test/TestServerlessApp.IntegrationTests/FunctionUrlExample.cs index b3f97929b..286c7575e 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/FunctionUrlExample.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/FunctionUrlExample.cs @@ -7,11 +7,11 @@ using System.Threading.Tasks; using Newtonsoft.Json.Linq; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class FunctionUrlExample + public class FunctionUrlExample : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/Greeter.cs b/Libraries/test/TestServerlessApp.IntegrationTests/Greeter.cs index 395ebfc29..bafde7c97 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/Greeter.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/Greeter.cs @@ -4,11 +4,11 @@ using System.Net.Http; using System.Threading.Tasks; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class Greeter + public class Greeter : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixture.cs b/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixture.cs index 864b72058..27bdf2f83 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixture.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixture.cs @@ -42,10 +42,11 @@ public class IntegrationTestContextFixture : IAsyncLifetime public IntegrationTestContextFixture() { - _cloudFormationHelper = new
CloudFormationHelper(new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2)); + var cloudFormationClient = new AmazonCloudFormationClient(Amazon.RegionEndpoint.USWest2); + _cloudFormationHelper = new CloudFormationHelper(cloudFormationClient); _s3Helper = new S3Helper(new AmazonS3Client(Amazon.RegionEndpoint.USWest2)); S3HelperInstance = _s3Helper; - LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2)); + LambdaHelper = new LambdaHelper(new AmazonLambdaClient(Amazon.RegionEndpoint.USWest2), cloudFormationClient); CloudWatchHelper = new CloudWatchHelper(new AmazonCloudWatchLogsClient(Amazon.RegionEndpoint.USWest2)); HttpClient = new HttpClient(); } diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs b/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs index a58ad3967..db28d5278 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/IntegrationTestContextFixtureCollection.cs @@ -1,9 +1,4 @@ -using Xunit; - -namespace TestServerlessApp.IntegrationTests -{ - [CollectionDefinition("Integration Tests")] - public class IntegrationTestContextFixtureCollection : ICollectionFixture<IntegrationTestContextFixture> - { - } -} \ No newline at end of file +// Registers the AssemblyFixture test framework so test classes can share a single +// IntegrationTestContextFixture (one deployed stack) via IAssemblyFixture while still +// running in parallel. Without this attribute IAssemblyFixture is silently ignored.
+[assembly: Xunit.TestFramework("Xunit.Extensions.AssemblyFixture.AssemblyFixtureFramework", "Xunit.Extensions.AssemblyFixture")] diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/S3EventNotification.cs b/Libraries/test/TestServerlessApp.IntegrationTests/S3EventNotification.cs index d9758ae00..88fd75659 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/S3EventNotification.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/S3EventNotification.cs @@ -5,11 +5,11 @@ using System.Threading.Tasks; using Amazon.S3; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class S3EventNotification + public class S3EventNotification : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/SNSEventSubscription.cs b/Libraries/test/TestServerlessApp.IntegrationTests/SNSEventSubscription.cs index 075a5162b..c644a661b 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/SNSEventSubscription.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/SNSEventSubscription.cs @@ -5,11 +5,11 @@ using System.Threading.Tasks; using Amazon.SimpleNotificationService; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class SNSEventSubscription + public class SNSEventSubscription : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/SQSEventSourceMapping.cs b/Libraries/test/TestServerlessApp.IntegrationTests/SQSEventSourceMapping.cs index 02f803074..78b886cc6 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/SQSEventSourceMapping.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/SQSEventSourceMapping.cs @@ -4,11 +4,11 @@ using System.Linq; using System.Threading.Tasks; using
Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class SQSEventSourceMapping + public class SQSEventSourceMapping : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/ScheduleEventRule.cs b/Libraries/test/TestServerlessApp.IntegrationTests/ScheduleEventRule.cs index 19ef9e9da..242e8a605 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/ScheduleEventRule.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/ScheduleEventRule.cs @@ -6,11 +6,11 @@ using Amazon.CloudWatchEvents; using Amazon.CloudWatchEvents.Model; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class ScheduleEventRule + public class ScheduleEventRule : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/SimpleCalculator.cs b/Libraries/test/TestServerlessApp.IntegrationTests/SimpleCalculator.cs index 1d0df22ec..80dc2c4a7 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/SimpleCalculator.cs +++ b/Libraries/test/TestServerlessApp.IntegrationTests/SimpleCalculator.cs @@ -4,11 +4,11 @@ using System.Net.Http; using System.Threading.Tasks; using Xunit; +using Xunit.Extensions.AssemblyFixture; namespace TestServerlessApp.IntegrationTests { - [Collection("Integration Tests")] - public class SimpleCalculator + public class SimpleCalculator : IAssemblyFixture<IntegrationTestContextFixture> { private readonly IntegrationTestContextFixture _fixture; diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/TestServerlessApp.IntegrationTests.csproj b/Libraries/test/TestServerlessApp.IntegrationTests/TestServerlessApp.IntegrationTests.csproj index 36bce5b4c..7597a5f29 100644 ---
a/Libraries/test/TestServerlessApp.IntegrationTests/TestServerlessApp.IntegrationTests.csproj +++ b/Libraries/test/TestServerlessApp.IntegrationTests/TestServerlessApp.IntegrationTests.csproj @@ -14,6 +14,9 @@ + + all runtime; build; native; contentfiles; analyzers diff --git a/buildtools/build.proj b/buildtools/build.proj index c2ea686e7..550e72703 100644 --- a/buildtools/build.proj +++ b/buildtools/build.proj @@ -218,12 +218,11 @@ - - - - - + + diff --git a/buildtools/run-integ-tests-parallel.ps1 b/buildtools/run-integ-tests-parallel.ps1 new file mode 100644 index 000000000..3614d9420 --- /dev/null +++ b/buildtools/run-integ-tests-parallel.ps1 @@ -0,0 +1,63 @@ +#!/usr/bin/env pwsh +# Runs every integration test project concurrently. Each *.IntegrationTests.csproj deploys its own +# isolated CloudFormation stack (unique name + S3 bucket), so the projects have no shared state and +# can run in parallel. Running them serially was the dominant cost of the CI integ-test phase. +# +# Output from each project is captured and printed as a labeled block after that project finishes, +# so the interleaved logs of parallel runs stay readable. The script exits non-zero if any project +# fails, listing which ones. + +param( + [string]$Configuration = "Release", + # Directory to search for integration test projects (defaults to the Libraries/test tree). + [string]$TestRoot = (Join-Path $PSScriptRoot ".." "Libraries" "test"), + # Upper bound on how many projects run at once. + [int]$ThrottleLimit = 5 +) + +$ErrorActionPreference = 'Stop' + +$projects = Get-ChildItem -Path $TestRoot -Recurse -Filter "*.IntegrationTests.csproj" | + Select-Object -ExpandProperty FullName | + Sort-Object + +if (-not $projects) +{ + Write-Host "No integration test projects found under '$TestRoot'." 
+ exit 0 +} + +Write-Host "Running $($projects.Count) integration test project(s) in parallel (throttle limit $ThrottleLimit):" +$projects | ForEach-Object { Write-Host " - $_" } + +$results = $projects | ForEach-Object -ThrottleLimit $ThrottleLimit -Parallel { + $project = $_ + $name = [System.IO.Path]::GetFileNameWithoutExtension($project) + # 2>&1 folds stderr into the captured stream so warnings/errors appear in the labeled block. + $output = dotnet test -c $using:Configuration --logger "console;verbosity=detailed" $project 2>&1 | Out-String + [PSCustomObject]@{ + Name = $name + Project = $project + ExitCode = $LASTEXITCODE + Output = $output + } +} + +foreach ($result in $results) +{ + Write-Host "" + Write-Host "==================== $($result.Name) (exit $($result.ExitCode)) ====================" + Write-Host $result.Output +} + +$failed = $results | Where-Object { $_.ExitCode -ne 0 } +if ($failed) +{ + Write-Host "" + Write-Host "The following integration test project(s) failed:" + $failed | ForEach-Object { Write-Host " - $($_.Name)" } + exit 1 +} + +Write-Host "" +Write-Host "All integration test projects passed." From 3255e972186bb66898603f2938f178e485d06a5c Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 10:34:52 -0400 Subject: [PATCH 3/8] Stream integ-test output live instead of buffering until project completion The parallel runner captured each project's output with Out-String and only printed it after the project finished, so nothing appeared during the long integration-test run. Stream each line to the host as it arrives, prefixed with the project name so the interleaved parallel logs stay attributable. Failed projects still get their full output reprinted as one clean block at the end. 
--- buildtools/run-integ-tests-parallel.ps1 | 27 ++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/buildtools/run-integ-tests-parallel.ps1 b/buildtools/run-integ-tests-parallel.ps1 index 3614d9420..b87b88295 100644 --- a/buildtools/run-integ-tests-parallel.ps1 +++ b/buildtools/run-integ-tests-parallel.ps1 @@ -3,9 +3,10 @@ # isolated CloudFormation stack (unique name + S3 bucket), so the projects have no shared state and # can run in parallel. Running them serially was the dominant cost of the CI integ-test phase. # -# Output from each project is captured and printed as a labeled block after that project finishes, -# so the interleaved logs of parallel runs stay readable. The script exits non-zero if any project -# fails, listing which ones. +# Each project's output is streamed live, prefixed with the project name so the interleaved logs of +# the parallel runs stay attributable. Failed projects also get their full output reprinted as one +# clean block at the end (un-interleaved) for easier diagnosis. The script exits non-zero if any +# project fails, listing which ones. param( [string]$Configuration = "Release", @@ -33,24 +34,32 @@ $projects | ForEach-Object { Write-Host " - $_" } $results = $projects | ForEach-Object -ThrottleLimit $ThrottleLimit -Parallel { $project = $_ $name = [System.IO.Path]::GetFileNameWithoutExtension($project) - # 2>&1 folds stderr into the captured stream so warnings/errors appear in the labeled block. - $output = dotnet test -c $using:Configuration --logger "console;verbosity=detailed" $project 2>&1 | Out-String + $lines = [System.Collections.Generic.List[string]]::new() + # 2>&1 folds stderr into the stream. Each line is emitted to the host as it arrives, prefixed + # with the project name, so progress is visible during the (long) run instead of only at the end. 
+ dotnet test -c $using:Configuration --logger "console;verbosity=detailed" $project 2>&1 | + ForEach-Object { + $line = $_.ToString() + $lines.Add($line) + Write-Host "[$name] $line" + } [PSCustomObject]@{ Name = $name Project = $project ExitCode = $LASTEXITCODE - Output = $output + Output = ($lines -join [System.Environment]::NewLine) } } -foreach ($result in $results) +# Reprint each failed project's output as one clean, un-interleaved block for easier diagnosis. +$failed = $results | Where-Object { $_.ExitCode -ne 0 } +foreach ($result in $failed) { Write-Host "" - Write-Host "==================== $($result.Name) (exit $($result.ExitCode)) ====================" + Write-Host "==================== FAILED: $($result.Name) (exit $($result.ExitCode)) ====================" Write-Host $result.Output } -$failed = $results | Where-Object { $_.ExitCode -ne 0 } if ($failed) { Write-Host "" From b0d874537b5efc03dd931d61877a3daf80484266 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 11:02:20 -0400 Subject: [PATCH 4/8] Fix flaky durable suspend tests: await termination signal instead of fixed delay MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit InvokeOperationTests.InvokeAsync_FreshExecution_CheckpointsStartAndSuspends failed intermittently on net10.0 (e.g. CI run on PR #2451). The suspend-path tests kicked off an operation, slept a fixed 10-50ms, then asserted tm.IsTerminated. Under CI thread-pool pressure the suspend signal didn't always fire within that window, so the assert raced and failed. TerminationManager already exposes TerminationTask, a Task that completes exactly when Terminate() fires. Replace the fixed delays with a shared tm.WaitForTerminationAsync() helper that awaits that task (bounded by a 10s timeout so a genuine non-suspension still fails fast at the assert). Applied to all 13 suspend-gated sites across 5 test files. 
Verified: full suite passes on net8.0 and net10.0, and the previously-flaky test passed 25/25 consecutive runs on net10.0. Also faster — tests resume the instant suspension fires instead of always sleeping. --- .../CallbackOperationTests.cs | 4 ++-- .../ChildContextOperationTests.cs | 2 +- .../DurableContextTests.cs | 4 ++-- .../InvokeOperationTests.cs | 12 +++++----- .../TerminationTestHelpers.cs | 23 +++++++++++++++++++ .../WaitForConditionOperationTests.cs | 4 ++-- 6 files changed, 36 insertions(+), 13 deletions(-) create mode 100644 Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationTestHelpers.cs diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs index 99a1342fe..a826c0c10 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/CallbackOperationTests.cs @@ -163,7 +163,7 @@ public async Task GetResultAsync_FreshExecution_SuspendsExecution() // GetResultAsync should signal termination and return a never-completing task. 
var resultTask = callback.GetResultAsync(); - await Task.Delay(10); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(resultTask.IsCompleted); @@ -193,7 +193,7 @@ public async Task ReplayStarted_DoesNotReFlushStart_AndSuspendsOnGetResult() Assert.False(tm.IsTerminated); var resultTask = callback.GetResultAsync(); - await Task.Delay(10); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(resultTask.IsCompleted); diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs index b8b2e952b..1782fe933 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/ChildContextOperationTests.cs @@ -322,7 +322,7 @@ public async Task RunInChildContextAsync_ChildSuspendsOnWait_TerminatesWithWaitS }, name: "phase"); - await Task.Delay(50); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs index 74fcfe3fb..76d7b748a 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/DurableContextTests.cs @@ -379,7 +379,7 @@ public async Task WaitAsync_NewExecution_SignalsTermination() var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "my_wait"); // Give it a moment to execute - await Task.Delay(10); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(waitTask.IsCompleted); @@ -433,7 +433,7 @@ public async Task WaitAsync_StartedButNotExpired_ResuspendsWithoutNewCheckpoint( var waitTask = context.WaitAsync(TimeSpan.FromSeconds(30), name: "pending_wait"); - await Task.Delay(10); + 
await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(waitTask.IsCompleted); diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs index daf933cb5..c69568ca9 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/InvokeOperationTests.cs @@ -74,7 +74,7 @@ public async Task InvokeAsync_PreservesUnqualifiedArn_AndPassesItThrough() payload: "x", name: "noversion"); - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); @@ -100,7 +100,7 @@ public async Task InvokeAsync_FreshExecution_CheckpointsStartAndSuspends() // Service-side suspend mechanics: TerminationManager fires before the // user task completes; the task itself never resolves on the fresh path. - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); @@ -130,7 +130,7 @@ public async Task InvokeAsync_FreshExecution_NoTenantId_OmitsTenantId() var task = context.InvokeAsync(FunctionArn, "payload", name: "no_tenant"); - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); @@ -154,7 +154,7 @@ public async Task InvokeAsync_FreshExecution_StartIsSyncFlushed() var (context, recorder, tm, _) = CreateContext(); var task = context.InvokeAsync(FunctionArn, "x", name: "sync_flush"); - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); @@ -350,7 +350,7 @@ public async Task InvokeAsync_ReplayStarted_ResuspendsWithoutRecheckpoint() }); var task = context.InvokeAsync(FunctionArn, "x", name: "still_running"); - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); 
@@ -377,7 +377,7 @@ public async Task InvokeAsync_ReplayPending_ResuspendsWithoutRecheckpoint() }); var task = context.InvokeAsync(FunctionArn, "x", name: "pending"); - await Task.Delay(20); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationTestHelpers.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationTestHelpers.cs new file mode 100644 index 000000000..dcbf39553 --- /dev/null +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/TerminationTestHelpers.cs @@ -0,0 +1,23 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +using Amazon.Lambda.DurableExecution.Internal; + +namespace Amazon.Lambda.DurableExecution.Tests; + +/// <summary> +/// Shared helpers for tests that exercise the suspend/terminate path. +/// </summary> +internal static class TerminationTestHelpers +{ + /// <summary> + /// Waits for the suspend signal deterministically instead of a fixed delay, which races under + /// CI thread-pool pressure (the original Task.Delay assumed the suspend happened within a + /// fixed window, which isn't guaranteed). The suspend path trips + /// <see cref="TerminationManager.Terminate"/>, which completes + /// <see cref="TerminationManager.TerminationTask"/>. Bounded by a timeout so a genuine + /// non-suspension fails fast at the following assert instead of hanging.
+ /// </summary> + public static Task WaitForTerminationAsync(this TerminationManager tm, int timeoutSeconds = 10) => + Task.WhenAny(tm.TerminationTask, Task.Delay(TimeSpan.FromSeconds(timeoutSeconds))); +} diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs index 50f7557b3..81eeb1c54 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.Tests/WaitForConditionOperationTests.cs @@ -89,7 +89,7 @@ public async Task FreshExecution_StrategyContinues_EmitsRetryAndSuspends() }, name: "poll"); - await Task.Delay(50); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); @@ -818,7 +818,7 @@ public async Task FreshExecution_FlushesStartBeforeSuspending() }, name: "poll"); - await Task.Delay(50); + await tm.WaitForTerminationAsync(); Assert.True(tm.IsTerminated); Assert.False(task.IsCompleted); From 8f85e9c7eb3eb09e09458f74cd90b216529fa559 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 12:46:11 -0400 Subject: [PATCH 5/8] Fix IAM throttling and build thrash in parallel durable integ suite Running the durable integ suite in parallel (maxParallelThreads=4) surfaced two contention problems that this addresses. IAM 'Rate exceeded': each test created and deleted its own IAM role, so several deployments hammered IAM's (global, single-bucket, low-rate) mutating APIs at once. Replace per-test roles with a single shared execution role (durable-integ-shared-execution-role) created at most once per account and reused across tests and runs, gated so concurrent deployments don't race. It carries the union of permissions every scenario needs (invoke durable-integ-* functions + send durable-execution callbacks); no test depends on a role lacking a permission, so one role is safe.
Dispose no longer deletes roles. Clients also use adaptive retry as a backstop. Build thrash/timeouts: each test published its function separately and wiped obj/bin first, so the shared source projects (Amazon.Lambda.DurableExecution etc.) were rebuilt per-test, and concurrent publishes thrashed MSBuild into 'dotnet timed out'. Publish all functions once, up front, in a single MSBuild pass via a generated traversal project (Restore;Publish, BuildInParallel) that builds the shared projects once and publishes each function to its own bin/publish; tests then only zip that output. Verified: 51/51 functions publish in one ~16s pass with 0 errors, and the suite no longer throttles IAM. --- .../DurableFunctionDeployment.cs | 422 +++++++++--------- 1 file changed, 207 insertions(+), 215 deletions(-) diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs index cd7448171..58e0ca71e 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs @@ -9,6 +9,7 @@ using Amazon.IdentityManagement.Model; using Amazon.Lambda; using Amazon.Lambda.Model; +using Amazon.Runtime; using Xunit.Abstractions; namespace Amazon.Lambda.DurableExecution.IntegrationTests; @@ -40,20 +41,36 @@ internal sealed class DurableFunctionDeployment : IAsyncDisposable private readonly IAmazonIdentityManagementService _iamClient; private readonly string _functionName; - private readonly string _roleName; private string? _roleArn; private string? _functionArn; private bool _functionCreated; - private readonly List _inlinePolicyNames = new(); // Optional paired "external system" Lambda — a plain (non-durable) function // that the workflow's submitter invokes. 
Models a real-world callback flow // where an out-of-band service resolves the durable execution. private readonly string _externalFunctionName; - private readonly string _externalRoleName; private string? _externalRoleArn; private bool _externalFunctionCreated; + // A single IAM role shared by every test function in the suite. Creating and deleting a role + // per deployment burst-throttled IAM ("Rate exceeded") once the suite started running in + // parallel — IAM is global, single-bucketed, and throttles mutating calls aggressively. The + // shared role is created at most once per account (reused across runs) and gated so concurrent + // deployments don't race to create it. No test depends on a role *lacking* a permission, so a + // single union-of-permissions role is safe; it is scoped to invoking durable-integ-* functions. + private const string SharedRoleName = "durable-integ-shared-execution-role"; + private static readonly SemaphoreSlim SharedRoleGate = new(1, 1); + private static string? _sharedRoleArn; + + // Publishing is done ONCE for all test functions, up front, instead of per-test. The test + // functions all reference the same source projects (Amazon.Lambda.DurableExecution etc.); + // publishing each function separately (and the old code wiped obj/bin first, forcing a cold + // build every time) rebuilt those shared projects dozens of times, and doing it concurrently + // thrashed MSBuild. A single up-front pass builds the shared projects once and the publishes + // run incrementally; each test then just zips its already-published output. + private static readonly SemaphoreSlim PrePublishGate = new(1, 1); + private static bool _prePublished; + public string FunctionName => _functionName; public string? ExternalFunctionName => _externalFunctionCreated ? 
_externalFunctionName : null; @@ -77,16 +94,34 @@ internal sealed class DurableFunctionDeployment : IAsyncDisposable private DurableFunctionDeployment(ITestOutputHelper output, string suffix) { _output = output; - _lambdaClient = new AmazonLambdaClient(DeploymentRegion); - _iamClient = new AmazonIdentityManagementServiceClient(DeploymentRegion); + // The integration suite runs its test classes in parallel, so several deployments hit IAM + // (CreateRole/AttachRolePolicy/DeleteRole) at once. IAM has low request-rate limits and + // returns "Rate exceeded" under that contention. Adaptive retry adds client-side rate + // limiting and backs off on throttling, and a higher retry count rides out longer throttle + // windows, instead of failing the test on the first throttle. + _lambdaClient = new AmazonLambdaClient(BuildClientConfig<AmazonLambdaConfig>()); + _iamClient = new AmazonIdentityManagementServiceClient(BuildClientConfig<AmazonIdentityManagementServiceConfig>()); // Truncate the GUID (not the suffix) so CloudTrail entries stay readable. // Keep the GUID short enough that the total stays well under 40 chars even for long suffixes. static string ShortId() => Guid.NewGuid().ToString("N")[..Math.Min(8, 32)]; _functionName = $"durable-integ-{suffix}-{ShortId()}"; - _roleName = $"durable-integ-{suffix}-{ShortId()}"; _externalFunctionName = $"durable-integ-{suffix}-ext-{ShortId()}"; - _externalRoleName = $"durable-integ-{suffix}-ext-{ShortId()}"; + } + + /// <summary> + /// Builds a client config tuned to survive throttling when the suite runs in parallel: + /// adaptive retry (client-side rate limiting + backoff on throttle) and a generous retry count. + /// </summary> + private static TConfig BuildClientConfig<TConfig>() where TConfig : ClientConfig, new() + { + var config = new TConfig + { + RegionEndpoint = DeploymentRegion, + RetryMode = RequestRetryMode.Adaptive, + MaxErrorRetry = 10 + }; + return config; } // The optional `handler` defaults to `bootstrap` (executable model).
Pass an @@ -192,143 +227,125 @@ public static async Task CreateAsync( } """; - private async Task InitializeAsync( - string testFunctionDir, - string? externalFunctionDir, - IDictionary? environment, - IReadOnlyList? invokeAllowedFunctionArns, - bool enableTenancy, - string? handler) + // Inline policy granting the permissions every durable-integ scenario needs: invoking any + // durable-integ-* function (covers chained invoke and external-function invoke) and sending + // durable-execution callbacks. Resource is scoped to the suite's function name prefix. + private const string SharedInlinePolicyName = "DurableIntegSharedPermissions"; + private const string SharedInlinePolicyDocument = """ { - // 1. Create the workflow's IAM role. - _output.WriteLine($"Creating IAM role: {_roleName}"); - var createRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest - { - RoleName = _roleName, - AssumeRolePolicyDocument = LambdaAssumeRolePolicy - }); - _roleArn = createRoleResponse.Role.Arn; + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "lambda:InvokeFunction", + "Resource": [ + "arn:aws:lambda:*:*:function:durable-integ-*", + "arn:aws:lambda:*:*:function:durable-integ-*:*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "lambda:SendDurableExecutionCallbackSuccess", + "lambda:SendDurableExecutionCallbackFailure" + ], + "Resource": "*" + } + ] + } + """; - await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest - { - RoleName = _roleName, - PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" - }); + /// + /// Returns the ARN of the shared execution role, creating it once per account if absent. + /// Gated by a semaphore + memoized ARN so concurrent deployments don't race or re-create it. + /// In steady state (role already exists from a prior run) this is a single GetRole call for the + /// entire suite, which is what keeps the parallel run under IAM's mutating-call rate limits. 
+ /// + private async Task GetOrCreateSharedRoleAsync() + { + if (_sharedRoleArn != null) + return _sharedRoleArn; - await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest + await SharedRoleGate.WaitAsync(); + try { - RoleName = _roleName, - PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy" - }); + if (_sharedRoleArn != null) + return _sharedRoleArn; - // 2. (optional) Create the external function's IAM role up front so its - // sts:AssumeRole and lambda:SendDurableExecutionCallbackSuccess - // permissions propagate alongside the workflow role's permissions - // (single 10-second sleep covers both). - if (externalFunctionDir != null) - { - _output.WriteLine($"Creating external IAM role: {_externalRoleName}"); - var extRoleResponse = await _iamClient.CreateRoleAsync(new CreateRoleRequest + try { - RoleName = _externalRoleName, + var existing = await _iamClient.GetRoleAsync(new GetRoleRequest { RoleName = SharedRoleName }); + _output.WriteLine($"Reusing shared IAM role: {SharedRoleName}"); + _sharedRoleArn = existing.Role.Arn; + return _sharedRoleArn; + } + catch (NoSuchEntityException) + { + // Falls through to create it. + } + + _output.WriteLine($"Creating shared IAM role: {SharedRoleName}"); + var created = await _iamClient.CreateRoleAsync(new CreateRoleRequest + { + RoleName = SharedRoleName, AssumeRolePolicyDocument = LambdaAssumeRolePolicy }); - _externalRoleArn = extRoleResponse.Role.Arn; await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest { - RoleName = _externalRoleName, + RoleName = SharedRoleName, PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole" }); - - // Inline policy lets the external function call the durable callback API. - // Resource "*" because we don't yet know the workflow's ARN at this point — - // the external function only resolves callbacks belonging to executions the - // workflow created, so the blast radius is bounded by the role's lifetime. 
- await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest + await _iamClient.AttachRolePolicyAsync(new AttachRolePolicyRequest { - RoleName = _externalRoleName, - PolicyName = "SendDurableExecutionCallback", - PolicyDocument = """ - { - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Action": [ - "lambda:SendDurableExecutionCallbackSuccess", - "lambda:SendDurableExecutionCallbackFailure" - ], - "Resource": "*" - }] - } - """ + RoleName = SharedRoleName, + PolicyArn = "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy" }); - - // Workflow function will Invoke the external function — grant via inline policy. - // Scoped to the external function name we just minted. await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest { - RoleName = _roleName, - PolicyName = "InvokeExternalFunction", - PolicyDocument = $$""" - { - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Action": "lambda:InvokeFunction", - "Resource": "arn:aws:lambda:*:*:function:{{_externalFunctionName}}" - }] - } - """ + RoleName = SharedRoleName, + PolicyName = SharedInlinePolicyName, + PolicyDocument = SharedInlinePolicyDocument }); - _inlinePolicyNames.Add("InvokeExternalFunction"); - } - // Grant cross-Lambda invoke when the parent of a chained-invoke scenario - // needs to call out to a downstream function. The durable execution service - // is the one that actually drives the chained invocation in production — - // attaching this directly to the parent's role keeps the parent role - // capable of being used in non-durable contexts (e.g. for diagnostic - // direct invokes from the test harness). - if (invokeAllowedFunctionArns != null && invokeAllowedFunctionArns.Count > 0) + // Wait for IAM propagation so the first function create doesn't hit + // "The role defined for the function cannot be assumed by Lambda". 
+ await Task.Delay(TimeSpan.FromSeconds(10)); + + _sharedRoleArn = created.Role.Arn; + return _sharedRoleArn; + } + finally { - // Allow both the unqualified ARN and any qualifier (alias/version/$LATEST). - var resources = new List(invokeAllowedFunctionArns.Count * 2); - foreach (var arn in invokeAllowedFunctionArns) - { - resources.Add(arn); - resources.Add(arn + ":*"); - } - var resourceJson = "[" + string.Join(",", resources.Select(r => $"\"{r}\"")) + "]"; - var policyDoc = $$""" - { - "Version": "2012-10-17", - "Statement": [{ - "Effect": "Allow", - "Action": ["lambda:InvokeFunction"], - "Resource": {{resourceJson}} - }] - } - """; - const string PolicyName = "AllowChainedInvoke"; - await _iamClient.PutRolePolicyAsync(new PutRolePolicyRequest - { - RoleName = _roleName, - PolicyName = PolicyName, - PolicyDocument = policyDoc - }); - _inlinePolicyNames.Add(PolicyName); + SharedRoleGate.Release(); } + } - // Wait for IAM propagation. - await Task.Delay(TimeSpan.FromSeconds(10)); + private async Task InitializeAsync( + string testFunctionDir, + string? externalFunctionDir, + IDictionary? environment, + IReadOnlyList? invokeAllowedFunctionArns, + bool enableTenancy, + string? handler) + { + // 1. Acquire the shared IAM role (created once per account, reused across tests and runs). + // Both the workflow function and any paired external function run under this single role, + // which carries the union of permissions every scenario needs. The external function's + // callback-send permission and the workflow's invoke permission are all baked into the + // shared role, so per-test PutRolePolicy calls are no longer needed. + _roleArn = await GetOrCreateSharedRoleAsync(); + if (externalFunctionDir != null) + { + _externalRoleArn = _roleArn; + } - // 3. Build + zip the workflow function package. + // 2. Build + zip the workflow function package. 
_output.WriteLine($"Building and zipping function package from {testFunctionDir}..."); var zipBytes = await BuildAndZipAsync(testFunctionDir); _output.WriteLine($"Package built: {zipBytes.Length} bytes"); - // 4. (optional) Build + deploy the external function. Done before the workflow + // 3. (optional) Build + deploy the external function. Done before the workflow // Lambda so the workflow function's environment can reference the external // function name (which is already known from the ctor). if (externalFunctionDir != null) @@ -355,7 +372,7 @@ await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest await WaitForFunctionActive(_externalFunctionName); } - // 5. Create the workflow Lambda. + // 4. Create the workflow Lambda. _output.WriteLine($"Creating Lambda function: {_functionName}"); var createFunctionRequest = new CreateFunctionRequest { @@ -604,42 +621,18 @@ private async Task WaitForFunctionActive(string functionName) } /// <summary> - /// Publishes a test function (framework-dependent, linux-x64) and zips the publish - /// output for upload as a managed-runtime Lambda package. The zip contains the native - /// bootstrap shim that the dotnet managed runtime execs (executable model). + /// Returns the zipped, published package for a test function. The actual publishing happens + /// once for all functions (see <see cref="EnsureAllFunctionsPublishedAsync"/>); this just zips + /// the already-published output. The zip contains the native bootstrap shim that the + /// dotnet managed runtime execs (executable model). /// </summary> private async Task BuildAndZipAsync(string testFunctionDir) { - // `dotnet test` spins up one testhost per TargetFramework (net8.0 + net10.0) and - // runs them concurrently. Both testhosts invoke the same test classes, which means - // two processes can race on the same TestFunctions// source dir — wiping bin/ - // and obj/ under each other's feet. Symptom: MSB3030 "Could not copy bootstrap.dll" - // because one process deleted obj/ while the other was mid-publish.
Serialize the - // per-source-dir build with a cross-process file lock so different test functions - // can still build in parallel. (A Mutex would have thread-affinity issues across - // awaits; an exclusive FileStream avoids that.) Lock file goes under temp — keeping - // it out of the source tree avoids polluting git status across worktrees. - var lockKey = Convert.ToHexString(System.Security.Cryptography.SHA256.HashData( - Encoding.UTF8.GetBytes(testFunctionDir.ToLowerInvariant())))[..16]; - var lockPath = Path.Combine(Path.GetTempPath(), $"durable-integ-build-{lockKey}.lock"); - using var lockHandle = await AcquireExclusiveFileLockAsync(lockPath, TimeSpan.FromMinutes(10)); + await EnsureAllFunctionsPublishedAsync(); var publishDir = Path.Combine(testFunctionDir, "bin", "publish"); - if (Directory.Exists(publishDir)) Directory.Delete(publishDir, true); - - // MSBuild's up-to-date check leaves stale .Up2Date markers under obj/ that - // make `dotnet publish` skip the copy-to-output step on a second run after - // we've wiped bin/publish/. Result: empty publish dir → empty zip package. - // Nuking obj/ guarantees a real publish each time the helper is invoked. - // Cheap (each test function is small). - var objDir = Path.Combine(testFunctionDir, "obj"); - if (Directory.Exists(objDir)) Directory.Delete(objDir, true); - var binDir = Path.Combine(testFunctionDir, "bin"); - if (Directory.Exists(binDir)) Directory.Delete(binDir, true); - - await RunProcess("dotnet", - $"publish -c Release -r linux-x64 --self-contained false -o \"{publishDir}\"", - testFunctionDir); + if (!Directory.Exists(publishDir)) + throw new DirectoryNotFoundException($"Expected published output at '{publishDir}' but it does not exist."); // Zip the publish output. On Linux (CI) ZipFile preserves the bootstrap exec bit; // on Windows the managed runtime tolerates the missing bit. 
@@ -650,21 +643,75 @@ await RunProcess("dotnet", return await File.ReadAllBytesAsync(zipPath); } - private static async Task AcquireExclusiveFileLockAsync(string lockPath, TimeSpan timeout) + /// + /// Publishes every test function once, up front, in a SINGLE MSBuild invocation. Runs at most + /// once per test run (gated + memoized). A generated traversal project references all function + /// projects and publishes them with one dotnet build, so MSBuild builds the shared + /// dependency projects once and publishes the functions in parallel within that one process — + /// avoiding both the per-project CLI/MSBuild startup cost of N separate dotnet publish + /// calls and the cross-process thrash that those caused when the suite ran in parallel. Each + /// function still lands in its own bin/publish; tests then only zip that output. + /// + private async Task EnsureAllFunctionsPublishedAsync() { - var deadline = DateTime.UtcNow + timeout; - while (true) + if (_prePublished) + return; + + await PrePublishGate.WaitAsync(); + try { + if (_prePublished) + return; + + var testFunctionsRoot = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "TestFunctions")); + var projects = Directory.GetFiles(testFunctionsRoot, "*.csproj", SearchOption.AllDirectories) + .OrderBy(p => p, StringComparer.Ordinal) + .ToList(); + + _output.WriteLine($"Pre-publishing {projects.Count} test function(s) in a single MSBuild pass..."); + + // Generate a traversal project that publishes every function project to its own + // bin/publish (PublishDir relative to each project). BuildInParallel lets MSBuild fan + // the publishes out across nodes once the shared dependency projects are built. 
+ var itemsXml = string.Concat(projects.Select(p => + $" \n")); + var traversalProject = $""" + + + {itemsXml} + + + + + + """; + + var traversalPath = Path.Combine(Path.GetTempPath(), $"durable-integ-publish-all-{Guid.NewGuid():N}.proj"); + await File.WriteAllTextAsync(traversalPath, traversalProject); try { - return new FileStream(lockPath, FileMode.OpenOrCreate, FileAccess.ReadWrite, FileShare.None); + // -maxcpucount lets MSBuild use multiple nodes for the parallel publishes. + await RunProcess("dotnet", + $"build \"{traversalPath}\" -t:PublishAll -maxcpucount", + testFunctionsRoot); } - catch (IOException) + finally { - if (DateTime.UtcNow >= deadline) - throw new TimeoutException($"Timed out waiting for build lock '{lockPath}' after {timeout.TotalSeconds:F0}s"); - await Task.Delay(TimeSpan.FromMilliseconds(500)); + try { File.Delete(traversalPath); } catch { /* best effort */ } } + + _prePublished = true; + } + finally + { + PrePublishGate.Release(); } } @@ -742,64 +789,9 @@ public async ValueTask DisposeAsync() catch (Exception ex) { _output.WriteLine($"Cleanup error (external function): {ex.Message}"); } } - if (_roleArn != null) - { - // Detach each policy independently — if one detach fails (e.g., the - // policy was never attached because init bailed out early) we still - // want to attempt the others and the final DeleteRole. - await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"); - await TryDetachManaged(_roleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicDurableExecutionRolePolicy"); - - // Inline policies must be deleted (not detached) before DeleteRole succeeds. 
- foreach (var inline in _inlinePolicyNames) - { - await TryDeleteInline(_roleName, inline); - } - - try - { - await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _roleName }); - } - catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole): {ex.Message}"); } - } - - if (_externalRoleArn != null) - { - await TryDetachManaged(_externalRoleName, "arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole"); - await TryDeleteInline(_externalRoleName, "SendDurableExecutionCallback"); - try - { - await _iamClient.DeleteRoleAsync(new DeleteRoleRequest { RoleName = _externalRoleName }); - } - catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteRole external): {ex.Message}"); } - } - - async Task TryDetachManaged(string roleName, string policyArn) - { - try - { - await _iamClient.DetachRolePolicyAsync(new DetachRolePolicyRequest - { - RoleName = roleName, - PolicyArn = policyArn - }); - } - catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM Detach {policyArn}): {ex.Message}"); } - } - - async Task TryDeleteInline(string roleName, string policyName) - { - try - { - await _iamClient.DeleteRolePolicyAsync(new DeleteRolePolicyRequest - { - RoleName = roleName, - PolicyName = policyName - }); - } - catch (NoSuchEntityException) { /* policy was never attached — fine */ } - catch (Exception ex) { _output.WriteLine($"Cleanup error (IAM DeleteInline {policyName}): {ex.Message}"); } - } + // The shared IAM role is intentionally NOT deleted here — it is reused by every test and + // across runs. Deleting/recreating it per test is exactly what burst-throttled IAM. It is a + // single stable role (durable-integ-shared-execution-role) that the test account retains. 
} public static string FindTestFunctionDir(string functionDirName) From c2dee0c8eac8ba5103c4944f3ef573748f64d7c0 Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 13:00:59 -0400 Subject: [PATCH 6/8] Fix flaky FileDescriptorLogStream test: capture exact written bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MaxSizeProducesOneLogFrame intermittently failed with 'Expected: 16, Actual: 15' on the header length. The header ends with an 8-byte big-endian microsecond timestamp; roughly 1 in 256 timestamps ends in a 0x00 byte. TestFileStream's Write captured bytes via TrimTrailingNullBytes(buffer).Take(count), which stripped that legitimate trailing zero, yielding a 15-byte header. Capture exactly buffer[offset, offset + count) instead — that is precisely what the production code wrote, and it no longer depends on the timestamp's value. --- .../TestHelpers/TestFileStream.cs | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Libraries/test/Amazon.Lambda.RuntimeSupport.Tests/Amazon.Lambda.RuntimeSupport.UnitTests/TestHelpers/TestFileStream.cs b/Libraries/test/Amazon.Lambda.RuntimeSupport.Tests/Amazon.Lambda.RuntimeSupport.UnitTests/TestHelpers/TestFileStream.cs index 0b3d3b8fc..bd9b0946e 100644 --- a/Libraries/test/Amazon.Lambda.RuntimeSupport.Tests/Amazon.Lambda.RuntimeSupport.UnitTests/TestHelpers/TestFileStream.cs +++ b/Libraries/test/Amazon.Lambda.RuntimeSupport.Tests/Amazon.Lambda.RuntimeSupport.UnitTests/TestHelpers/TestFileStream.cs @@ -1,7 +1,5 @@ using System; -using System.Collections.Generic; using System.IO; -using System.Linq; namespace Amazon.Lambda.RuntimeSupport.UnitTests.TestHelpers { @@ -19,13 +17,14 @@ public TestFileStream(Action writeAction) public override void Write(byte[] buffer, int offset, int count) { - WriteAction(TrimTrailingNullBytes(buffer).Take(count).ToArray(), offset, count); - } - - private static IEnumerable 
TrimTrailingNullBytes(IEnumerable buffer) - { - // Trim trailing null bytes to make testing assertions easier - return buffer.Reverse().SkipWhile(x => x == 0).Reverse(); + // Capture exactly the bytes that were written: [offset, offset + count). + // The previous implementation trimmed trailing null bytes from the buffer, which was + // flaky: a log header ends with an 8-byte big-endian microsecond timestamp, and roughly + // 1 in 256 timestamps ends in a 0x00 byte. Trimming that legitimate byte made the + // captured header 15 bytes instead of 16 and failed MaxSizeProducesOneLogFrame. + var written = new byte[count]; + Array.Copy(buffer, offset, written, 0, count); + WriteAction(written, offset, count); } } } From ee9ad183b53ef6f2f0c19ec4c7b8ee16ad913e5f Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 13:19:14 -0400 Subject: [PATCH 7/8] Throttle Lambda control-plane calls in parallel durable integ suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the shared-role fix removed IAM throttling, the throttling moved to Lambda's account-wide control-plane APIs: with maxParallelThreads=4, the combination of CreateFunction + DeleteFunction + WaitForFunctionActive polling GetFunctionConfiguration exceeded Lambda's limits, surfacing as 'Rate exceeded' and adaptive retry's 'capacity could not be obtained'. Two compounding causes addressed: - Each deployment built its own AWS clients, so adaptive retry's per-client rate limiter couldn't coordinate across the parallel deployments — N clients each assumed they had capacity and fired at once. Make the Lambda and IAM clients static/shared so adaptive retry actually paces the whole suite. - Cap concurrent Lambda control-plane calls (create/delete/get-configuration) with a suite-wide semaphore (limit 2) via a RunControlPlaneAsync helper, so the 4 parallel test threads don't collectively exceed Lambda's control-plane rate. 
Data-plane calls (Invoke, durable-execution reads) are not gated. Also slow the WaitForFunctionActive poll from 2s to 3s to cut its call rate. --- .../DurableFunctionDeployment.cs | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs index 58e0ca71e..d3935e86d 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs @@ -37,8 +37,22 @@ internal sealed class DurableFunctionDeployment : IAsyncDisposable private static readonly RegionEndpoint DeploymentRegion = RegionEndpoint.USEast1; private readonly ITestOutputHelper _output; - private readonly IAmazonLambda _lambdaClient; - private readonly IAmazonIdentityManagementService _iamClient; + + // Clients are shared (static) across all deployments. Each deployment used to construct its + // own clients, which defeated adaptive retry: its congestion controller / rate limiter is + // per-client, so N independent clients each believed they had capacity, all fired at once, and + // collectively blew Lambda's account-wide control-plane limits ("capacity could not be obtained + // ... insufficient capacity"). A single shared client per service lets adaptive retry actually + // coordinate backoff across the parallel deployments. + private static readonly IAmazonLambda _lambdaClient = new AmazonLambdaClient(BuildClientConfig()); + private static readonly IAmazonIdentityManagementService _iamClient = + new AmazonIdentityManagementServiceClient(BuildClientConfig()); + + // Lambda control-plane calls (CreateFunction/DeleteFunction/GetFunctionConfiguration) are + // account-rate-limited and are the next bottleneck once IAM is no longer per-test. 
Cap how many + // run concurrently across the whole suite so the parallel deployments don't collectively exceed + // Lambda's limits; data-plane calls (Invoke, durable-execution reads) are not gated. + private static readonly SemaphoreSlim LambdaControlPlaneGate = new(2, 2); private readonly string _functionName; private string? _roleArn; @@ -94,13 +108,6 @@ internal sealed class DurableFunctionDeployment : IAsyncDisposable private DurableFunctionDeployment(ITestOutputHelper output, string suffix) { _output = output; - // The integration suite runs its test classes in parallel, so several deployments hit IAM - // (CreateRole/AttachRolePolicy/DeleteRole) at once. IAM has low request-rate limits and - // returns "Rate exceeded" under that contention. Adaptive retry adds client-side rate - // limiting and backs off on throttling, and a higher retry count rides out longer throttle - // windows, instead of failing the test on the first throttle. - _lambdaClient = new AmazonLambdaClient(BuildClientConfig()); - _iamClient = new AmazonIdentityManagementServiceClient(BuildClientConfig()); // Truncate the GUID (not the suffix) so CloudTrail entries stay readable. // Keep the GUID short enough that the total stays well under 40 chars even for long suffixes. @@ -354,7 +361,7 @@ private async Task InitializeAsync( var extZipBytes = await BuildAndZipAsync(externalFunctionDir); _output.WriteLine($"Creating external Lambda function: {_externalFunctionName}"); - await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest + await RunControlPlaneAsync(() => _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest { FunctionName = _externalFunctionName, Runtime = ManagedRuntime, @@ -365,7 +372,7 @@ await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest MemorySize = 256, LoggingConfig = new LoggingConfig { LogFormat = LogFormat.JSON } // No DurableConfig — this is a plain function. 
- }); + })); _externalFunctionCreated = true; _output.WriteLine("Waiting for external function to become Active..."); @@ -423,7 +430,7 @@ await _lambdaClient.CreateFunctionAsync(new CreateFunctionRequest }; } - var createFunctionResponse = await _lambdaClient.CreateFunctionAsync(createFunctionRequest); + var createFunctionResponse = await RunControlPlaneAsync(() => _lambdaClient.CreateFunctionAsync(createFunctionRequest)); _functionCreated = true; _functionArn = createFunctionResponse.FunctionArn; @@ -604,22 +611,43 @@ private void DumpEvents(GetDurableExecutionHistoryResponse history) private async Task WaitForFunctionActive(string functionName) { - for (int i = 0; i < 60; i++) + for (int i = 0; i < 40; i++) { try { - var config = await _lambdaClient.GetFunctionConfigurationAsync( - new GetFunctionConfigurationRequest { FunctionName = functionName }); + // Gate each poll call: GetFunctionConfiguration is control-plane and rate-limited, + // and all parallel deployments poll at once. + var config = await RunControlPlaneAsync(() => _lambdaClient.GetFunctionConfigurationAsync( + new GetFunctionConfigurationRequest { FunctionName = functionName })); if (config.State == State.Active) return; if (config.State == State.Failed) throw new Exception($"Function '{functionName}' creation failed: {config.StateReasonCode} - {config.StateReason}"); } catch (ResourceNotFoundException) { } - await Task.Delay(TimeSpan.FromSeconds(2)); + await Task.Delay(TimeSpan.FromSeconds(3)); } throw new TimeoutException($"Function '{functionName}' did not become Active within 120 seconds"); } + /// + /// Runs a Lambda control-plane operation under so the + /// suite's parallel deployments don't collectively exceed Lambda's account-wide + /// control-plane request rate. Adaptive retry on the shared client handles brief throttles; + /// this gate keeps the offered load low enough that retry doesn't exhaust its capacity. 
+ /// + private static async Task RunControlPlaneAsync(Func> operation) + { + await LambdaControlPlaneGate.WaitAsync(); + try + { + return await operation(); + } + finally + { + LambdaControlPlaneGate.Release(); + } + } + /// /// Returns the zipped, published package for a test function. The actual publishing happens /// once for all functions (see ); this just zips @@ -774,7 +802,7 @@ public async ValueTask DisposeAsync() try { _output.WriteLine($"Deleting function: {_functionName}"); - await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _functionName }); + await RunControlPlaneAsync(() => _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _functionName })); } catch (Exception ex) { _output.WriteLine($"Cleanup error (function): {ex.Message}"); } } @@ -784,7 +812,7 @@ public async ValueTask DisposeAsync() try { _output.WriteLine($"Deleting external function: {_externalFunctionName}"); - await _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _externalFunctionName }); + await RunControlPlaneAsync(() => _lambdaClient.DeleteFunctionAsync(new DeleteFunctionRequest { FunctionName = _externalFunctionName })); } catch (Exception ex) { _output.WriteLine($"Cleanup error (external function): {ex.Message}"); } } From bbfff3d78c8cac7cd100291b9f7af5b1237db27f Mon Sep 17 00:00:00 2001 From: Garrett Beatty Date: Sat, 27 Jun 2026 13:54:10 -0400 Subject: [PATCH 8/8] Fix parallel file races in integ tests: tool install and function.zip The CI run no longer throttles IAM or Lambda control-plane (those fixes held), but parallelism surfaced two shared-file races: - 'Cannot create .../dotnet/tools/.store/amazon.lambda.tools/6.0.6 because a file or directory with the same name already exists': the three *.IntegrationTests projects run DeploymentScript.ps1 in parallel and each ran 'dotnet tool install -g Amazon.Lambda.Tools', colliding on the global tool store. 
Make the install idempotent: skip if already installed, and tolerate the concurrent-install race (already-installed/already-exists treated as success) with a short retry. - 'function.zip ... being used by another process' (ApproverFunction): a test function that is the external function for more than one test was zipped to a shared bin/function.zip by multiple parallel tests at once. Zip to a unique temp path per call instead; the read-only published output is still shared. --- .../DurableFunctionDeployment.cs | 23 +++++++++++----- .../DeploymentScript.ps1 | 27 ++++++++++++++++++- .../DeploymentScript.ps1 | 27 ++++++++++++++++++- .../DeploymentScript.ps1 | 27 ++++++++++++++++++- 4 files changed, 94 insertions(+), 10 deletions(-) diff --git a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs index d3935e86d..3116c1686 100644 --- a/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs +++ b/Libraries/test/Amazon.Lambda.DurableExecution.IntegrationTests/DurableFunctionDeployment.cs @@ -662,13 +662,22 @@ private async Task BuildAndZipAsync(string testFunctionDir) if (!Directory.Exists(publishDir)) throw new DirectoryNotFoundException($"Expected published output at '{publishDir}' but it does not exist."); - // Zip the publish output. On Linux (CI) ZipFile preserves the bootstrap exec bit; - // on Windows the managed runtime tolerates the missing bit. - var zipPath = Path.Combine(testFunctionDir, "bin", "function.zip"); - if (File.Exists(zipPath)) File.Delete(zipPath); - ZipFile.CreateFromDirectory(publishDir, zipPath, CompressionLevel.Optimal, includeBaseDirectory: false); - - return await File.ReadAllBytesAsync(zipPath); + // Zip the publish output to a UNIQUE temp path. A given function (e.g. 
ApproverFunction) is + // the external function for more than one test, so multiple parallel tests zip the same + // published output at once — writing to a shared bin/function.zip raced ("file is being used + // by another process"). The publish output itself is read-only and shared safely; only the + // zip destination needs to be per-call. On Linux (CI) ZipFile preserves the bootstrap exec + // bit; on Windows the managed runtime tolerates the missing bit. + var zipPath = Path.Combine(Path.GetTempPath(), $"durable-integ-fn-{Guid.NewGuid():N}.zip"); + try + { + ZipFile.CreateFromDirectory(publishDir, zipPath, CompressionLevel.Optimal, includeBaseDirectory: false); + return await File.ReadAllBytesAsync(zipPath); + } + finally + { + try { File.Delete(zipPath); } catch { /* best effort */ } + } } /// diff --git a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 index 3a20e55de..f64ccc6a9 100644 --- a/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 +++ b/Libraries/test/TestCustomAuthorizerApp.IntegrationTests/DeploymentScript.ps1 @@ -42,7 +42,32 @@ try $json = Get-Content .\aws-lambda-tools-defaults.json | Out-String | ConvertFrom-Json $region = $json.region - dotnet tool install -g Amazon.Lambda.Tools + # Install Amazon.Lambda.Tools idempotently. The integration test projects deploy in parallel, + # so several DeploymentScript.ps1 processes may run "dotnet tool install -g" at the same time and + # collide on the global tool store ("a file or directory with the same name already exists"). + # Skip if already present, and tolerate the concurrent-install race by treating an + # already-installed/already-exists result as success, with a short retry for the transient case. + if (dotnet tool list -g | Select-String -SimpleMatch 'amazon.lambda.tools') + { + Write-Host "Amazon.Lambda.Tools already installed." 
+ } + else + { + for ($i = 1; $i -le 5; $i++) + { + $output = dotnet tool install -g Amazon.Lambda.Tools 2>&1 | Out-String + Write-Host $output + if ($LASTEXITCODE -eq 0 -or $output -match 'already installed' -or $output -match 'already exists') + { + break + } + if ($i -eq 5) + { + throw "Failed to install Amazon.Lambda.Tools after $i attempts." + } + Start-Sleep -Seconds ($i * 3) + } + } Write-Host "Creating S3 Bucket $identifier" if(![string]::IsNullOrEmpty($region)) diff --git a/Libraries/test/TestServerlessApp.ALB.IntegrationTests/DeploymentScript.ps1 b/Libraries/test/TestServerlessApp.ALB.IntegrationTests/DeploymentScript.ps1 index f74ee365f..f5e9e463d 100644 --- a/Libraries/test/TestServerlessApp.ALB.IntegrationTests/DeploymentScript.ps1 +++ b/Libraries/test/TestServerlessApp.ALB.IntegrationTests/DeploymentScript.ps1 @@ -42,7 +42,32 @@ try $json = Get-Content .\aws-lambda-tools-defaults.json | Out-String | ConvertFrom-Json $region = $json.region - dotnet tool install -g Amazon.Lambda.Tools + # Install Amazon.Lambda.Tools idempotently. The integration test projects deploy in parallel, + # so several DeploymentScript.ps1 processes may run "dotnet tool install -g" at the same time and + # collide on the global tool store ("a file or directory with the same name already exists"). + # Skip if already present, and tolerate the concurrent-install race by treating an + # already-installed/already-exists result as success, with a short retry for the transient case. + if (dotnet tool list -g | Select-String -SimpleMatch 'amazon.lambda.tools') + { + Write-Host "Amazon.Lambda.Tools already installed." + } + else + { + for ($i = 1; $i -le 5; $i++) + { + $output = dotnet tool install -g Amazon.Lambda.Tools 2>&1 | Out-String + Write-Host $output + if ($LASTEXITCODE -eq 0 -or $output -match 'already installed' -or $output -match 'already exists') + { + break + } + if ($i -eq 5) + { + throw "Failed to install Amazon.Lambda.Tools after $i attempts." 
+ } + Start-Sleep -Seconds ($i * 3) + } + } Write-Host "Creating S3 Bucket $identifier" if(![string]::IsNullOrEmpty($region)) diff --git a/Libraries/test/TestServerlessApp.IntegrationTests/DeploymentScript.ps1 b/Libraries/test/TestServerlessApp.IntegrationTests/DeploymentScript.ps1 index bbff35b47..5802e5cbd 100644 --- a/Libraries/test/TestServerlessApp.IntegrationTests/DeploymentScript.ps1 +++ b/Libraries/test/TestServerlessApp.IntegrationTests/DeploymentScript.ps1 @@ -42,7 +42,32 @@ try $json = Get-Content .\aws-lambda-tools-defaults.json | Out-String | ConvertFrom-Json $region = $json.region - dotnet tool install -g Amazon.Lambda.Tools + # Install Amazon.Lambda.Tools idempotently. The integration test projects deploy in parallel, + # so several DeploymentScript.ps1 processes may run "dotnet tool install -g" at the same time and + # collide on the global tool store ("a file or directory with the same name already exists"). + # Skip if already present, and tolerate the concurrent-install race by treating an + # already-installed/already-exists result as success, with a short retry for the transient case. + if (dotnet tool list -g | Select-String -SimpleMatch 'amazon.lambda.tools') + { + Write-Host "Amazon.Lambda.Tools already installed." + } + else + { + for ($i = 1; $i -le 5; $i++) + { + $output = dotnet tool install -g Amazon.Lambda.Tools 2>&1 | Out-String + Write-Host $output + if ($LASTEXITCODE -eq 0 -or $output -match 'already installed' -or $output -match 'already exists') + { + break + } + if ($i -eq 5) + { + throw "Failed to install Amazon.Lambda.Tools after $i attempts." + } + Start-Sleep -Seconds ($i * 3) + } + } Write-Host "Creating S3 Bucket $identifier" if(![string]::IsNullOrEmpty($region))