diff --git a/packages/aws-durable-execution-sdk-python-examples/src/otel/otel_logger_example.py b/packages/aws-durable-execution-sdk-python-examples/src/otel/otel_logger_example.py index 92cf8db7..50041e9f 100644 --- a/packages/aws-durable-execution-sdk-python-examples/src/otel/otel_logger_example.py +++ b/packages/aws-durable-execution-sdk-python-examples/src/otel/otel_logger_example.py @@ -17,9 +17,6 @@ from typing import Any from aws_durable_execution_sdk_python_otel import DurableExecutionOtelPlugin -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from aws_durable_execution_sdk_python import StepContext from aws_durable_execution_sdk_python.context import ( @@ -30,10 +27,6 @@ from aws_durable_execution_sdk_python.execution import durable_execution -tracer_provider = trace.get_tracer_provider() -otel = DurableExecutionOtelPlugin(tracer_provider) - - @durable_step def greet(step_context: StepContext, name: str) -> str: # Logged inside a step: enriched with this step's span_id. @@ -51,7 +44,7 @@ def greet_in_child(child_context: DurableContext, name: str) -> str: return result -@durable_execution(plugins=[otel]) +@durable_execution(plugins=[DurableExecutionOtelPlugin()]) def handler(_event: Any, context: DurableContext) -> str: # Logged at the top level: enriched with the invocation span_id. context.logger.info("Workflow started") diff --git a/packages/aws-durable-execution-sdk-python-examples/src/plugin/execution_with_otel.py b/packages/aws-durable-execution-sdk-python-examples/src/plugin/execution_with_otel.py index 6bceefff..f55baec1 100644 --- a/packages/aws-durable-execution-sdk-python-examples/src/plugin/execution_with_otel.py +++ b/packages/aws-durable-execution-sdk-python-examples/src/plugin/execution_with_otel.py @@ -2,10 +2,10 @@ from typing import Any -from opentelemetry import trace +from aws_durable_execution_sdk_python_otel import DurableExecutionOtelPlugin from aws_durable_execution_sdk_python import StepContext -from aws_durable_execution_sdk_python.config import Duration, StepConfig, StepSemantics +from aws_durable_execution_sdk_python.config import Duration from aws_durable_execution_sdk_python.context import ( DurableContext, durable_step, @@ -13,13 +13,6 @@ ) from aws_durable_execution_sdk_python.execution import durable_execution -from aws_durable_execution_sdk_python_otel import DurableExecutionOtelPlugin - - -# use default provider -tracer_provider = trace.get_tracer_provider() -otel = DurableExecutionOtelPlugin(tracer_provider) - @durable_step def add_numbers(_step_context: StepContext, a: int, b: int) -> int: @@ -39,7 +32,7 @@ def add_numbers_in_child(child_context: DurableContext, a: int, b: int): return result -@durable_execution(plugins=[otel]) +@durable_execution(plugins=[DurableExecutionOtelPlugin()]) def handler(_event: Any, context: DurableContext) -> int: result = 0 for i in range(3): diff --git a/packages/aws-durable-execution-sdk-python-otel/README.md b/packages/aws-durable-execution-sdk-python-otel/README.md index 99f7f697..81354f32 100644 --- a/packages/aws-durable-execution-sdk-python-otel/README.md +++ b/packages/aws-durable-execution-sdk-python-otel/README.md @@ -1,12 +1,15 @@ -# aws-durable-execution-sdk-python-otel +# AWS Durable Execution SDK - OpenTelemetry Plugin -OpenTelemetry instrumentation for the [AWS Durable Execution SDK for Python](https://github.com/aws/aws-durable-execution-sdk-python). +OpenTelemetry instrumentation plugin for the [AWS Durable Execution SDK for Python](https://github.com/aws/aws-durable-execution-sdk-python). Emits distributed traces that correlate across multiple Lambda invocations of a single durable execution, producing deterministic span and trace IDs so that spans from different invocations are stitched into a single coherent trace. -> **Note:** v0.1.0 reserves the package name. Instrumentation lands in v0.2.0. +## Features -## Overview - -This package will provide automatic OpenTelemetry tracing for durable execution workflows, giving you visibility into step execution, waits, retries, and overall workflow performance. +- **Deterministic Trace IDs**: All invocations of the same durable execution share a single trace, derived from the X-Ray trace header or execution ARN +- **Span-per-Operation**: Each durable operation (step, wait, invoke) gets its own span with accurate timing +- **Continuation Spans**: Operations completing in a different invocation are linked back to the original span +- **Log Correlation**: Enrich application logs with trace ID and span ID for end-to-end observability +- **Configurable Sampling**: Control trace volume via plugin options +- **Self-Contained Setup**: No manual TracerProvider configuration required ## Installation @@ -14,20 +17,271 @@ This package will provide automatic OpenTelemetry tracing for durable execution pip install aws-durable-execution-sdk-python-otel ``` -## Quick Start +## Quick Start using X-Ray/CloudWatch Tracing + +1. Add the [ADOT Lambda Layer](#1-adot-lambda-layer) to your function and set `AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-instrument` +2. Enable [X-Ray Active Tracing](#2-aws-x-ray-active-tracing) on the function +3. Pass `DurableExecutionOtelPlugin` to your handler's `plugins` list +4. Add X-Ray write permissions + +### 1. ADOT Lambda Layer + +This plugin requires the [AWS Distro for OpenTelemetry (ADOT) Lambda layer](https://aws-otel.github.io/docs/getting-started/lambda) to export traces from your Lambda function. + +The layer ARN follows the format: + +``` +arn:aws:lambda::901920570463:layer:aws-otel-python--ver- +``` + +Refer to the [ADOT Lambda Layer ARNs](https://aws-otel.github.io/docs/getting-started/lambda/lambda-python) page for the latest version number, architecture, and supported regions. + +**AWS CLI:** + +```bash +aws lambda update-function-configuration \ + --function-name your-function-name \ + --layers "arn:aws:lambda:us-east-1:901920570463:layer:aws-otel-python-amd64-ver-" +``` + +You must also set the `AWS_LAMBDA_EXEC_WRAPPER` environment variable: + +```bash +aws lambda update-function-configuration \ + --function-name your-function-name \ + --environment "Variables={AWS_LAMBDA_EXEC_WRAPPER=/opt/otel-instrument}" +``` + +> **Note:** Replace `us-east-1` with your function's region and ``/`` with the latest layer version and architecture from the ADOT docs. + +**CloudFormation / SAM:** + +```yaml +MyFunction: + Type: AWS::Serverless::Function + Properties: + Layers: + - !Sub arn:aws:lambda:${AWS::Region}:901920570463:layer:aws-otel-python-amd64-ver- + Environment: + Variables: + AWS_LAMBDA_EXEC_WRAPPER: /opt/otel-instrument +``` + +**CDK:** ```python -from aws_durable_execution_sdk_python_otel import __version__ +from aws_cdk import aws_lambda as lambda_ + +adot_layer = lambda_.LayerVersion.from_layer_version_arn( + self, + "AdotLayer", + f"arn:aws:lambda:{self.region}:901920570463:layer:aws-otel-python-amd64-ver-", +) + +fn = lambda_.Function( + self, + "MyFunction", + runtime=lambda_.Runtime.PYTHON_3_12, + handler="index.handler", + code=lambda_.Code.from_asset("lambda"), + layers=[adot_layer], + environment={"AWS_LAMBDA_EXEC_WRAPPER": "/opt/otel-instrument"}, +) +``` + +> **Tip:** Pin the layer version to a specific number in production deployments to avoid unexpected behavior from automatic version changes. + +### 2. AWS X-Ray Active Tracing + +Enable active tracing on your Lambda function so the `_X_AMZN_TRACE_ID` environment variable is populated at invocation time. The plugin uses this header to derive deterministic trace IDs that remain consistent across all invocations of the same durable execution. + +**AWS Console:** Lambda → Configuration → Monitoring and operations tools → Active tracing → Enable + +**AWS CLI:** + +```bash +aws lambda update-function-configuration \ + --function-name your-function-name \ + --tracing-config Mode=Active +``` + +**CloudFormation / SAM:** + +```yaml +MyFunction: + Type: AWS::Lambda::Function + Properties: + TracingConfig: + Mode: Active +``` + +**CDK:** + +```python +lambda_.Function( + self, + "MyFunction", + tracing=lambda_.Tracing.ACTIVE, +) +``` + +### 3. In your Lambda handler (index.py) + +```python +from aws_durable_execution_sdk_python import DurableContext +from aws_durable_execution_sdk_python.execution import durable_execution +from aws_durable_execution_sdk_python_otel import DurableExecutionOtelPlugin + + +@durable_execution(plugins=[DurableExecutionOtelPlugin()]) +def handler(event: dict, context: DurableContext) -> dict: + result = context.step(lambda _: fetch_data(event["id"]), name="fetch-data") + + context.wait(duration=Duration.from_seconds(5)) + + context.step(lambda _: process(result), name="process") -print(__version__) + return result ``` -## Planned Features (v0.2.0) +That's it. The plugin handles TracerProvider setup, deterministic ID generation, and span lifecycle internally. + +### 4. Grant Permissions + +The function's execution role needs the `AWSXRayDaemonWriteAccess` managed policy (or equivalent permissions) if using X-Ray as the tracing backend. + +### Environment Variables for ADOT layer + +| Variable | Description | Default | +| ----------------------------- | --------------------------------------------------------------------------------------------- | ----------------- | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | Endpoint for the OTLP exporter (e.g., `http://localhost:4318` for the ADOT collector sidecar) | Set by ADOT layer | +| `AWS_LAMBDA_EXEC_WRAPPER` | Set to `/opt/otel-instrument` for the ADOT layer to instrument your function | — | +| `OTEL_TRACES_SAMPLER` | Sampler to use (e.g., `traceidratio` for ratio-based sampling) | `always_on` | +| `OTEL_TRACES_SAMPLER_ARG` | Argument for the sampler (e.g., `0.3` to sample 30% of traces) | — | + +See the [ADOT sampling configuration](https://aws-otel.github.io/docs/getting-started/lambda#sampling-configuration) for more details. + +## Configuration + +### Plugin Options + +```python +from aws_durable_execution_sdk_python_otel import ( + DurableExecutionOtelPlugin, + xray_context_extractor, +) + +plugin = DurableExecutionOtelPlugin( + # Provide your own TracerProvider if you already have one configured. + # Defaults to the globally configured tracer provider. + trace_provider=None, + # Use a custom context extractor (default: xray_context_extractor). + context_extractor=xray_context_extractor, + # Ratio used by TraceIdRatioBased sampling (default: 1.0). + sampling_rate=1.0, + # Custom instrumentation scope name + # (default: "aws-durable-execution-sdk-python"). + instrument_name="my-service", + # Install a root-logger filter that stamps trace context onto every + # log record (default: True). + enrich_logger=True, +) +``` + +### Context Extractors + +The plugin supports multiple strategies for extracting upstream trace context: + +```python +from aws_durable_execution_sdk_python_otel import ( + DurableExecutionOtelPlugin, + w3c_client_context_extractor, + xray_context_extractor, +) + +# Default: X-Ray trace header (recommended for most Lambda deployments) +DurableExecutionOtelPlugin(context_extractor=xray_context_extractor) + +# W3C Trace Context via clientContext (requires backend propagation support) +DurableExecutionOtelPlugin(context_extractor=w3c_client_context_extractor) +``` + +### Log Correlation + +When `enrich_logger=True` (the default), the plugin installs a logging filter on +the root logger at invocation start. The filter stamps the active OTel trace +context onto every emitted log record using these attributes: + +- `otel_trace_id`: 32-char hex trace identifier +- `otel_span_id`: 16-char hex span identifier +- `otel_trace_sampled`: boolean indicating if the trace is sampled + +These attributes are only set when a valid span context is active, so any log +formatter or schema must treat the fields as optional. + +## Verification + +After deploying your function with the plugin configured: + +1. **Invoke your durable function** — trigger at least one execution that includes multiple steps or a wait/resume cycle. + +2. **Check the CloudWatch console** — Navigate to CloudWatch → Traces in the AWS Console. You should see a trace with: + - An "invocation" span per invocation + - Child spans for each durable operation (named after your step names) + - All invocations of the same execution grouped under one trace ID + +3. **Check log correlation** — verify that your logs include `otel_trace_id` and `otel_span_id` fields matching the spans in X-Ray. + +4. **Confirm sampling** — If you set `OTEL_TRACES_SAMPLER=traceidratio` and `OTEL_TRACES_SAMPLER_ARG` to a value less than 1.0, verify that only the expected proportion of traces appear. + +5. **Span links** — For operations that span multiple invocations (e.g., after a wait resumes), though span links are set, they are not visualized within the CloudWatch console. + +### Troubleshooting + +| Symptom | Likely Cause | +| --------------------------------- | --------------------------------------------------------------- | +| No traces appear | ADOT layer not configured, or `AWS_LAMBDA_EXEC_WRAPPER` not set | +| Traces appear but are fragmented | X-Ray active tracing not enabled on the Lambda function | +| Missing spans for some operations | `OTEL_TRACES_SAMPLER_ARG` set below 1.0 | +| `_X_AMZN_TRACE_ID` not populated | X-Ray active tracing not enabled | + +## API Reference + +### `DurableExecutionOtelPlugin` + +The main plugin class. Implements `DurableInstrumentationPlugin` from `aws_durable_execution_sdk_python`. + +```python +DurableExecutionOtelPlugin( + trace_provider=None, + context_extractor=None, + sampling_rate=1.0, + instrument_name="aws-durable-execution-sdk-python", + enrich_logger=True, +) +``` + +### `DeterministicIdGenerator` + +A custom OpenTelemetry `IdGenerator` that produces reproducible trace and span IDs from execution metadata. Exported for advanced use cases. + +### `xray_context_extractor` + +Default context extractor. Reads the `_X_AMZN_TRACE_ID` environment variable to derive trace context. + +### `w3c_client_context_extractor` + +Alternative context extractor. Reads W3C `traceparent` from `context.clientContext.custom.traceparent`. Requires backend `clientContext` propagation to be enabled. + +### `ContextExtractor` + +Type alias for custom context extractor functions. + +### `OtelContextLogFilter` / `install_log_filter` -- Automatic span creation for steps, waits, invokes, and child contexts -- Replay-aware tracing (distinguishes fresh executions from replays) -- Error recording with proper OTel status codes -- Configurable span attributes and naming +The logging filter (and its installer) used to stamp trace context onto log +records. Installed automatically when `enrich_logger=True`; exported for manual +setups. ## Requirements diff --git a/packages/aws-durable-execution-sdk-python-otel/src/aws_durable_execution_sdk_python_otel/plugin.py b/packages/aws-durable-execution-sdk-python-otel/src/aws_durable_execution_sdk_python_otel/plugin.py index 2368bbae..7476ca40 100644 --- a/packages/aws-durable-execution-sdk-python-otel/src/aws_durable_execution_sdk_python_otel/plugin.py +++ b/packages/aws-durable-execution-sdk-python-otel/src/aws_durable_execution_sdk_python_otel/plugin.py @@ -68,6 +68,8 @@ class DurableExecutionOtelPlugin(DurableInstrumentationPlugin): Args: trace_provider: OpenTelemetry tracer provider used to create spans. + Optional; when omitted, the globally configured tracer provider + (``opentelemetry.trace.get_tracer_provider()``) is used. context_extractor: Optional extractor for upstream context. Defaults to AWS X-Ray header extraction. sampling_rate: Ratio used by ``TraceIdRatioBased`` sampling. @@ -78,7 +80,7 @@ class DurableExecutionOtelPlugin(DurableInstrumentationPlugin): def __init__( self, - trace_provider: SdkTracerProvider, + trace_provider: SdkTracerProvider | None = None, context_extractor: ContextExtractor | None = None, sampling_rate: float = 1.0, instrument_name: str = DEFAULT_INSTRUMENT_NAME, @@ -86,9 +88,10 @@ def __init__( ) -> None: """Initialize the plugin with an OpenTelemetry tracer provider. - The provided tracer provider is configured with this plugin's - deterministic ID generator and sampling strategy so spans for a durable - execution share stable trace and logical operation identifiers. + The tracer provider is configured with this plugin's deterministic ID + generator and sampling strategy so spans for a durable execution share + stable trace and logical operation identifiers. When no provider is + supplied, the globally configured tracer provider is used. When enrich_logger is enabled (default), the plugin installs a logging filter on the root logger at invocation start that stamps the active @@ -99,15 +102,29 @@ def __init__( context_extractor or xray_context_extractor ) - self._provider = trace_provider - # A ProxyTracerProvider (the API default from trace.get_tracer_provider() - # before an SDK provider is configured) has no id_generator; fall back to - # None so DeterministicIdGenerator uses its own default generator. + self._provider = trace_provider or trace.get_tracer_provider() self._id_generator: DeterministicIdGenerator = DeterministicIdGenerator( fallback_id_generator=getattr(self._provider, "id_generator", None) ) - self._provider.id_generator = self._id_generator - self._provider.sampler = TraceIdRatioBased(sampling_rate) + # Deterministic trace stitching requires the SDK TracerProvider, which + # exposes id_generator/sampler. The API's default ProxyTracerProvider + # (returned before an SDK provider is configured) does not. Rather than + # fail the invocation over an observability concern, warn and continue: + # the proxy's tracer is effectively a no-op (and auto-delegates if an SDK + # provider is configured later). In a Lambda OTel/ADOT deployment the + # layer configures a real SDK provider before the handler imports. + if isinstance(self._provider, SdkTracerProvider): + self._provider.id_generator = self._id_generator + self._provider.sampler = TraceIdRatioBased(sampling_rate) + else: + logger.warning( + "DurableExecutionOtelPlugin expected an SDK TracerProvider " + "(opentelemetry.sdk.trace.TracerProvider) but got %s. Spans will " + "not use deterministic IDs or the configured sampling rate. " + "Ensure the OpenTelemetry SDK is configured (e.g. via the ADOT " + "Lambda layer) or pass an explicit trace_provider.", + type(self._provider).__name__, + ) self._tracer: Tracer = self._provider.get_tracer(instrument_name) # per invocation status: