AzureCosmosDB · jcodella · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.env.template b/.env.template
@@ -57,6 +57,11 @@ AI_FOUNDRY_EMBEDDING_DATA_TYPE=float32
 AI_FOUNDRY_EMBEDDING_DISTANCE_FUNCTION=cosine
 COSMOS_DB_FULL_TEXT_LANGUAGE=en-US
 
+# Embed raw conversation turns on write so they can be vector-searched via
+# search_cosmos(target="turns"). The turns container is always provisioned
+# with a vector index, so toggling this never requires recreating the container.
+ENABLE_TURN_EMBEDDINGS=false
+
 AI_FOUNDRY_CHAT_DEPLOYMENT_NAME=<your-model-deployment>
 # Optional. Pin the Azure OpenAI REST API version used by chat and embeddings
 # clients. Leave blank to use the toolkit default ("2024-12-01-preview").

diff --git a/Docs/concepts.md b/Docs/concepts.md
@@ -94,6 +94,8 @@ Memories stored in Cosmos DB include embeddings generated by Microsoft AI Foundr
 
 Facts work especially well for vector search because each fact is stored as a small, self-contained document.
 
+By default, raw conversation turns are *not* embedded — only derived memories (facts, episodic, procedural, summaries) carry vectors. Set `enable_turn_embeddings=True` (env `ENABLE_TURN_EMBEDDINGS`) to also embed turns on write, then call `search_cosmos(target="turns")` to vector-search the raw conversation log. The turns container is always provisioned with a `quantizedFlat` vector index, so this flag only toggles embedding generation and can be turned on or off at any time without recreating the container.
+
 ---
 
 ## Processing Pipeline

diff --git a/Docs/public_api.md b/Docs/public_api.md
@@ -14,7 +14,7 @@
 
 ### Connection
 
-- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, processor=None) -> None` — configure local state, model clients, optional Cosmos auto-connect, and optional processing backend. The SDK uses a hard 3-container topology: turns in `memories_turns`, facts/episodic/procedural in `memories`, and summaries in `memories_summaries` (or the names you pass).
+- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, enable_turn_embeddings=None, processor=None) -> None` — configure local state, model clients, optional Cosmos auto-connect, and optional processing backend. The SDK uses a hard 3-container topology: turns in `memories_turns`, facts/episodic/procedural in `memories`, and summaries in `memories_summaries` (or the names you pass). `enable_turn_embeddings` (default `None`, which falls back to env `ENABLE_TURN_EMBEDDINGS`; turn embedding stays off unless one of them enables it) embeds raw turns on write so they can be vector-searched via `search_cosmos(target="turns")`; the turns container is always provisioned with a vector index, so toggling this never requires recreating it.
 - `close() -> None` — close Cosmos/model clients and owned credentials.
 - `connect_cosmos(endpoint=None, credential=None, key=None, database=None, container=None, turns_container=None, summaries_container=None) -> None` — connect to existing memory, turns, and summaries containers.
 - `create_memory_store(database=None, container=None, turns_container=None, summaries_container=None, counter_container=None, lease_container=None, endpoint=None, credential=None, key=None, embedding_dimensions=None, embedding_data_type=None, distance_function=None, full_text_language=None, throughput_mode=None, autoscale_max_ru=None) -> None` — create/connect the memory, turns, summaries, counter, and lease containers.
@@ -37,7 +37,7 @@
 
 ### Retrieval
 
-- `search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None) -> list[dict]` — vector or hybrid search memories.
+- `search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None, target='memories') -> list[dict]` — vector or hybrid search memories. Set `target="turns"` to search the raw conversation log instead of facts/episodic/procedural (requires turn embeddings; see `enable_turn_embeddings`).
 - `get_procedural_prompt(user_id) -> Optional[str]` — read the active procedural prompt.
 - `get_procedural_history(user_id, limit=10) -> list[dict]` — read procedural prompt history.
 - `get_procedural_memories(user_id, priority=None, category=None, min_salience=None, include_superseded=False) -> list[dict]` — retrieve procedural memory documents.
@@ -67,7 +67,7 @@ Local-buffer methods remain synchronous in-memory operations; Cosmos, retrieval,
 
 ### Connection
 
-- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, processor=None) -> None` — configure async local state, model clients, and optional processing backend. The async SDK uses the same hard 3-container topology as the sync client.
+- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, enable_turn_embeddings=None, processor=None) -> None` — configure async local state, model clients, and optional processing backend. The async SDK uses the same hard 3-container topology as the sync client. `enable_turn_embeddings` (default `None`, which falls back to env `ENABLE_TURN_EMBEDDINGS`; turn embedding stays off unless one of them enables it) embeds raw turns on write so they can be vector-searched via `search_cosmos(target="turns")`.
 - `async close() -> None` — close async/sync resources and owned credentials.
 - `async connect_cosmos(endpoint=None, credential=None, key=None, database=None, container=None, turns_container=None, summaries_container=None) -> None` — connect to existing memory, turns, and summaries containers.
 - `async create_memory_store(database=None, container=None, turns_container=None, summaries_container=None, counter_container=None, lease_container=None, endpoint=None, credential=None, key=None, embedding_dimensions=None, embedding_data_type=None, distance_function=None, full_text_language=None, throughput_mode=None, autoscale_max_ru=None) -> None` — create/connect memory, turns, summaries, counter, and lease containers.
@@ -90,7 +90,7 @@ Local-buffer methods remain synchronous in-memory operations; Cosmos, retrieval,
 
 ### Retrieval
 
-- `async search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None) -> list[dict]` — vector or hybrid search memories.
+- `async search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None, target='memories') -> list[dict]` — vector or hybrid search memories. Set `target="turns"` to search the raw conversation log instead of facts/episodic/procedural (requires turn embeddings; see `enable_turn_embeddings`).
 - `async get_procedural_prompt(user_id) -> Optional[str]` — read the active procedural prompt.
 - `async get_procedural_history(user_id, limit=10) -> list[dict]` — read procedural prompt history.
 - `async get_procedural_memories(user_id, priority=None, category=None, min_salience=None, include_superseded=False) -> list[dict]` — retrieve procedural memory documents.

diff --git a/azure/cosmos/agent_memory/_base/base_client.py b/azure/cosmos/agent_memory/_base/base_client.py
@@ -17,6 +17,7 @@
 )
 from azure.cosmos.agent_memory.exceptions import CosmosNotConnectedError, MemoryNotFoundError, ValidationError
 from azure.cosmos.agent_memory.logging import configure_logging, get_logger
+from azure.cosmos.agent_memory.thresholds import get_enable_turn_embeddings
 
 logger = get_logger(__name__)
 
@@ -45,6 +46,7 @@ def _init_base_config(
         embedding_dimensions: Optional[int],
         chat_deployment_name: str,
         use_default_credential: bool,
+        enable_turn_embeddings: Optional[bool] = None,
         default_credential_module: str = "azure.identity",
     ) -> None:
         """Initialize shared local state, config values, and default credentials."""
@@ -75,6 +77,9 @@ def _init_base_config(
         self._embedding_deployment_name = embedding_deployment_name
         self._embedding_dimensions = _resolve_embedding_dimensions(embedding_dimensions)
         self._chat_deployment_name = chat_deployment_name
+        self._enable_turn_embeddings = (
+            enable_turn_embeddings if enable_turn_embeddings is not None else get_enable_turn_embeddings()
+        )
 
         self._owns_cosmos_credential = False
         self._owns_ai_foundry_credential = False

diff --git a/azure/cosmos/agent_memory/_container_routing.py b/azure/cosmos/agent_memory/_container_routing.py
@@ -32,6 +32,24 @@ class ContainerKey(str, Enum):
 
 USER_SCOPED_MEMORIES_TYPES: frozenset[str] = frozenset({"episodic", "procedural"})
 
+# Containers that expose a vector index and may be targeted by ``search``.
+_SEARCH_TARGETS: dict[str, ContainerKey] = {
+    "memories": ContainerKey.MEMORIES,
+    "turns": ContainerKey.TURNS,
+}
+
+
+def resolve_search_target(target: str) -> ContainerKey:
+    """Map a public ``search(target=...)`` value to its ``ContainerKey``.
+
+    ``"memories"`` (the default) targets facts/episodic/procedural; ``"turns"``
+    targets the raw conversation log (requires turn embeddings to be enabled).
+    """
+    try:
+        return _SEARCH_TARGETS[target]
+    except KeyError as exc:
+        raise ValueError(f"Unknown search target {target!r}; valid targets: {sorted(_SEARCH_TARGETS)}") from exc
+
 
 def container_key_for_type(memory_type: str) -> ContainerKey:
     """Return the ``ContainerKey`` that owns documents of ``memory_type``."""

diff --git a/azure/cosmos/agent_memory/_utils.py b/azure/cosmos/agent_memory/_utils.py
@@ -145,7 +145,8 @@ def _resolve_embedding_dimensions(val: Optional[int]) -> int:
     """Resolve embedding dimensions from explicit value or ``AI_FOUNDRY_EMBEDDING_DIMENSIONS`` env var.
 
     Defaults to 1536 (the dimension we ship with for ``text-embedding-3-large``
-    truncated to 1536, which is the size DiskANN is tuned for in our containers).
+    truncated to 1536, which is the size our quantizedFlat vector indexes are
+    tuned for in our containers).
 
     Raises :class:`ConfigurationError` if the env var is set but cannot be
     parsed as a positive integer.
@@ -368,8 +369,14 @@ def _container_policies(
     embedding_data_type: str,
     distance_function: str,
     full_text_language: str,
+    include_salience_composite: bool = True,
 ) -> tuple[dict, dict, dict]:
-    """Build the vector, indexing, and full-text policies for container creation."""
+    """Build the vector, indexing, and full-text policies for container creation.
+
+    ``include_salience_composite`` adds the ``(salience, created_at, id)``
+    composite index required by procedural synthesis on the MEMORIES container.
+    Turns reuse this builder with it disabled (turns are never synthesized).
+    """
     vector_embedding_policy = {
         "vectorEmbeddings": [
             {
@@ -384,25 +391,27 @@ def _container_policies(
     indexing_policy = {
         "includedPaths": [{"path": "/*"}],
         "excludedPaths": [
-            {"path": "/embedding/*"},
             {"path": "/source_memory_ids/*"},
             {"path": "/supersedes_ids/*"},
+            {"path": '/"_etag"/?'},
         ],
-        "vectorIndexes": [{"path": "/embedding", "type": "diskANN"}],
+        "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
         "fullTextIndexes": [{"path": "/content"}],
+    }
+
+    if include_salience_composite:
         # Procedural synthesis selects TOP N by (salience DESC, created_at ASC, id ASC).
         # Cosmos requires a composite index for multi-property ORDER BY; without it the
         # query returns a non-deterministic 50 of N when many docs share the default
         # salience (0.5), which makes the source-id short-circuit in synthesize_procedural
         # thrash and burn LLM calls on every reconcile.
-        "compositeIndexes": [
+        indexing_policy["compositeIndexes"] = [
             [
                 {"path": "/salience", "order": "descending"},
                 {"path": "/created_at", "order": "ascending"},
                 {"path": "/id", "order": "ascending"},
             ]
-        ],
-    }
+        ]
 
     full_text_policy = {
         "defaultLanguage": full_text_language,

diff --git a/azure/cosmos/agent_memory/aio/cosmos_memory_client.py b/azure/cosmos/agent_memory/aio/cosmos_memory_client.py
@@ -36,18 +36,6 @@
 
 logger = get_logger(__name__)
 
-_TURNS_INDEXING_POLICY = {
-    "indexingMode": "consistent",
-    "automatic": True,
-    "includedPaths": [{"path": "/*"}],
-    "excludedPaths": [
-        {"path": "/embedding/?"},
-        {"path": "/source_memory_ids/*"},
-        {"path": "/supersedes_ids/*"},
-        {"path": '/"_etag"/?'},
-    ],
-}
-
 _SUMMARIES_INDEXING_POLICY = {
     "indexingMode": "consistent",
     "automatic": True,
@@ -96,6 +84,7 @@ def __init__(
         embedding_dimensions: Optional[int] = None,
         chat_deployment_name: str = "gpt-4o-mini",
         use_default_credential: bool = True,
+        enable_turn_embeddings: Optional[bool] = None,
         processor: Optional[AsyncMemoryProcessor] = None,
         transcript_metadata_keys: Optional[Iterable[str]] = None,
     ) -> None:
@@ -118,6 +107,7 @@ def __init__(
             embedding_dimensions=embedding_dimensions,
             chat_deployment_name=chat_deployment_name,
             use_default_credential=use_default_credential,
+            enable_turn_embeddings=enable_turn_embeddings,
             default_credential_module="azure.identity.aio",
         )
         self._background_tasks: set[asyncio.Task[Any]] = set()
@@ -305,12 +295,18 @@ async def create_memory_store(
                 autoscale_max_ru=self._cosmos_autoscale_max_ru,
                 throughput_properties_cls=ThroughputProperties,
             )
-            vec_policy, idx_policy, ft_policy = _container_policies(
+            _policy_kwargs = dict(
                 embedding_dimensions=embedding_dimensions or self._embedding_dimensions or 1536,
                 embedding_data_type=_resolve_embedding_data_type(embedding_data_type),
                 distance_function=_resolve_distance_function(distance_function),
                 full_text_language=_resolve_full_text_language(full_text_language),
             )
+            vec_policy, idx_policy, ft_policy = _container_policies(**_policy_kwargs)
+            # Turns always carry the vector index (primed for search) but skip the
+            # salience composite index, which only procedural synthesis needs.
+            turns_vec_policy, turns_idx_policy, turns_ft_policy = _container_policies(
+                **_policy_kwargs, include_salience_composite=False
+            )
             self._memories_container_client = await db.create_container_if_not_exists(
                 **_build_container_kwargs(
                     container_id=self._cosmos_container,
@@ -328,7 +324,9 @@ async def create_memory_store(
                     partition_key=partition_key,
                     offer_throughput=offer,
                     default_ttl=DEFAULT_TTL_BY_TYPE["turn"],
-                    indexing_policy=_TURNS_INDEXING_POLICY,
+                    indexing_policy=turns_idx_policy,
+                    vector_embedding_policy=turns_vec_policy,
+                    full_text_policy=turns_ft_policy,
                 )
             )
             logger.info("Created turns container: %s/%s", self._cosmos_database, self._cosmos_turns_container)
@@ -397,7 +395,11 @@ async def validate_topology(self) -> None:
                 ) from exc
 
     def _build_store(self) -> AsyncMemoryStore:
-        return AsyncMemoryStore(containers=self._containers, embeddings_client=self._embeddings_client)
+        return AsyncMemoryStore(
+            containers=self._containers,
+            embeddings_client=self._embeddings_client,
+            enable_turn_embeddings=self._enable_turn_embeddings,
+        )
 
     def _build_pipeline(self, store: AsyncMemoryStore) -> AsyncPipelineService:
         return AsyncPipelineService(
@@ -664,6 +666,7 @@ async def search_cosmos(
         min_confidence: Optional[float] = None,
         created_after: Optional[str | datetime] = None,
         created_before: Optional[str | datetime] = None,
+        target: str = "memories",
     ) -> list[dict[str, Any]]:
         return await self._get_store().search(
             search_terms=search_terms,
@@ -682,6 +685,7 @@ async def search_cosmos(
             min_confidence=min_confidence,
             created_after=created_after,
             created_before=created_before,
+            target=target,
         )
 
     async def get_thread(

diff --git a/azure/cosmos/agent_memory/aio/processors/inprocess.py b/azure/cosmos/agent_memory/aio/processors/inprocess.py
@@ -45,13 +45,18 @@ def __init__(
             from azure.cosmos.agent_memory._container_routing import ContainerKey
             from azure.cosmos.agent_memory.aio.services.pipeline import AsyncPipelineService
             from azure.cosmos.agent_memory.aio.store import AsyncMemoryStore
+            from azure.cosmos.agent_memory.thresholds import get_enable_turn_embeddings
 
             containers = {
                 ContainerKey.TURNS: turns_container,
                 ContainerKey.MEMORIES: cosmos_container,
                 ContainerKey.SUMMARIES: summaries_container,
             }
-            store = AsyncMemoryStore(containers=containers, embeddings_client=embeddings_client)
+            store = AsyncMemoryStore(
+                containers=containers,
+                embeddings_client=embeddings_client,
+                enable_turn_embeddings=get_enable_turn_embeddings(),
+            )
             pipeline = AsyncPipelineService(store, chat_client, embeddings_client, containers=containers)
 
         self._pipeline = pipeline
-Original file line number
+Diff line change
@@ Expand Up @@
     Facts work especially well for vector search because each fact is stored as a small, self-contained document.
+    By default, raw conversation turns are *not* embedded — only derived memories (facts, episodic, procedural, summaries) carry vectors. Set `enable_turn_embeddings=True` (env `ENABLE_TURN_EMBEDDINGS`) to also embed turns on write, then call `search_cosmos(target="turns")` to vector-search the raw conversation log. The turns container is always provisioned with a `quantizedFlat` vector index, so this flag only toggles embedding generation and can be turned on or off at any time without recreating the container.
     ---
     ## Processing Pipeline
@@ Expand Down @@