Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ AI_FOUNDRY_EMBEDDING_DATA_TYPE=float32
AI_FOUNDRY_EMBEDDING_DISTANCE_FUNCTION=cosine
COSMOS_DB_FULL_TEXT_LANGUAGE=en-US

# Embed raw conversation turns on write so they can be vector-searched via
# search_cosmos(target="turns"). The turns container is always provisioned with
# a vector index, so toggling this never requires recreating the container.
ENABLE_TURN_EMBEDDINGS=false

AI_FOUNDRY_CHAT_DEPLOYMENT_NAME=<your-model-deployment>
# Optional. Pin the Azure OpenAI REST API version used by chat and embeddings
# clients. Leave blank to use the toolkit default ("2024-12-01-preview").
Expand Down
2 changes: 2 additions & 0 deletions Docs/concepts.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ Memories stored in Cosmos DB include embeddings generated by Microsoft AI Foundr

Facts work especially well for vector search because each fact is stored as a small, self-contained document.

By default, raw conversation turns are *not* embedded — only derived memories (facts, episodic, procedural, summaries) carry vectors. Set `enable_turn_embeddings=True` (env `ENABLE_TURN_EMBEDDINGS`) to also embed turns on write, then call `search_cosmos(target="turns")` to vector-search the raw conversation log. The turns container is always provisioned with a `quantizedFlat` vector index, so this flag only toggles embedding generation and can be turned on or off at any time without recreating the container.

---

## Processing Pipeline
Expand Down
8 changes: 4 additions & 4 deletions Docs/public_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

### Connection

- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, processor=None) -> None` — configure local state, model clients, optional Cosmos auto-connect, and optional processing backend. The SDK uses a hard 3-container topology: turns in `memories_turns`, facts/episodic/procedural in `memories`, and summaries in `memories_summaries` (or the names you pass).
- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, enable_turn_embeddings=None, processor=None) -> None` — configure local state, model clients, optional Cosmos auto-connect, and optional processing backend. The SDK uses a hard 3-container topology: turns in `memories_turns`, facts/episodic/procedural in `memories`, and summaries in `memories_summaries` (or the names you pass). `enable_turn_embeddings` (default `None`, which defers to env `ENABLE_TURN_EMBEDDINGS`; turn embeddings stay off unless one of the two enables them) embeds raw turns on write so they can be vector-searched via `search_cosmos(target="turns")`; the turns container is always provisioned with a vector index, so toggling this never requires recreating it.
- `close() -> None` — close Cosmos/model clients and owned credentials.
- `connect_cosmos(endpoint=None, credential=None, key=None, database=None, container=None, turns_container=None, summaries_container=None) -> None` — connect to existing memory, turns, and summaries containers.
- `create_memory_store(database=None, container=None, turns_container=None, summaries_container=None, counter_container=None, lease_container=None, endpoint=None, credential=None, key=None, embedding_dimensions=None, embedding_data_type=None, distance_function=None, full_text_language=None, throughput_mode=None, autoscale_max_ru=None) -> None` — create/connect the memory, turns, summaries, counter, and lease containers.
Expand All @@ -37,7 +37,7 @@

### Retrieval

- `search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None) -> list[dict]` — vector or hybrid search memories.
- `search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None, target='memories') -> list[dict]` — vector or hybrid search memories. Set `target="turns"` to search the raw conversation log instead of facts/episodic/procedural (requires turn embeddings; see `enable_turn_embeddings`).
- `get_procedural_prompt(user_id) -> Optional[str]` — read the active procedural prompt.
- `get_procedural_history(user_id, limit=10) -> list[dict]` — read procedural prompt history.
- `get_procedural_memories(user_id, priority=None, category=None, min_salience=None, include_superseded=False) -> list[dict]` — retrieve procedural memory documents.
Expand Down Expand Up @@ -67,7 +67,7 @@ Local-buffer methods remain synchronous in-memory operations; Cosmos, retrieval,

### Connection

- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, processor=None) -> None` — configure async local state, model clients, and optional processing backend. The async SDK uses the same hard 3-container topology as the sync client.
- `__init__(cosmos_endpoint=None, cosmos_credential=None, cosmos_key=None, cosmos_database=None, cosmos_container=None, cosmos_turns_container='memories_turns', cosmos_summaries_container='memories_summaries', cosmos_counter_container=None, cosmos_lease_container=None, cosmos_throughput_mode=None, cosmos_autoscale_max_ru=None, ai_foundry_endpoint=None, ai_foundry_credential=None, ai_foundry_api_key=None, embedding_deployment_name='text-embedding-3-large', embedding_dimensions=None, chat_deployment_name='gpt-4o-mini', use_default_credential=True, enable_turn_embeddings=None, processor=None) -> None` — configure async local state, model clients, and optional processing backend. The async SDK uses the same hard 3-container topology as the sync client. `enable_turn_embeddings` (default `None`, which defers to env `ENABLE_TURN_EMBEDDINGS`; turn embeddings stay off unless one of the two enables them) embeds raw turns on write so they can be vector-searched via `search_cosmos(target="turns")`.
- `async close() -> None` — close async/sync resources and owned credentials.
- `async connect_cosmos(endpoint=None, credential=None, key=None, database=None, container=None, turns_container=None, summaries_container=None) -> None` — connect to existing memory, turns, and summaries containers.
- `async create_memory_store(database=None, container=None, turns_container=None, summaries_container=None, counter_container=None, lease_container=None, endpoint=None, credential=None, key=None, embedding_dimensions=None, embedding_data_type=None, distance_function=None, full_text_language=None, throughput_mode=None, autoscale_max_ru=None) -> None` — create/connect memory, turns, summaries, counter, and lease containers.
Expand All @@ -90,7 +90,7 @@ Local-buffer methods remain synchronous in-memory operations; Cosmos, retrieval,

### Retrieval

- `async search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None) -> list[dict]` — vector or hybrid search memories.
- `async search_cosmos(search_terms, memory_id=None, user_id=None, role=None, memory_types=None, thread_id=None, hybrid_search=False, top_k=5, tags_all=None, tags_any=None, exclude_tags=None, include_superseded=False, min_salience=None, min_confidence=None, created_after=None, created_before=None, target='memories') -> list[dict]` — vector or hybrid search memories. Set `target="turns"` to search the raw conversation log instead of facts/episodic/procedural (requires turn embeddings; see `enable_turn_embeddings`).
- `async get_procedural_prompt(user_id) -> Optional[str]` — read the active procedural prompt.
- `async get_procedural_history(user_id, limit=10) -> list[dict]` — read procedural prompt history.
- `async get_procedural_memories(user_id, priority=None, category=None, min_salience=None, include_superseded=False) -> list[dict]` — retrieve procedural memory documents.
Expand Down
5 changes: 5 additions & 0 deletions azure/cosmos/agent_memory/_base/base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from azure.cosmos.agent_memory.exceptions import CosmosNotConnectedError, MemoryNotFoundError, ValidationError
from azure.cosmos.agent_memory.logging import configure_logging, get_logger
from azure.cosmos.agent_memory.thresholds import get_enable_turn_embeddings

logger = get_logger(__name__)

Expand Down Expand Up @@ -45,6 +46,7 @@ def _init_base_config(
embedding_dimensions: Optional[int],
chat_deployment_name: str,
use_default_credential: bool,
enable_turn_embeddings: Optional[bool] = None,
default_credential_module: str = "azure.identity",
) -> None:
"""Initialize shared local state, config values, and default credentials."""
Expand Down Expand Up @@ -75,6 +77,9 @@ def _init_base_config(
self._embedding_deployment_name = embedding_deployment_name
self._embedding_dimensions = _resolve_embedding_dimensions(embedding_dimensions)
self._chat_deployment_name = chat_deployment_name
self._enable_turn_embeddings = (
enable_turn_embeddings if enable_turn_embeddings is not None else get_enable_turn_embeddings()
)

self._owns_cosmos_credential = False
self._owns_ai_foundry_credential = False
Expand Down
18 changes: 18 additions & 0 deletions azure/cosmos/agent_memory/_container_routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,24 @@ class ContainerKey(str, Enum):

USER_SCOPED_MEMORIES_TYPES: frozenset[str] = frozenset({"episodic", "procedural"})

# Containers that expose a vector index and may be targeted by ``search``.
# Keys are the public ``target`` strings accepted by ``resolve_search_target``;
# values are the ``ContainerKey`` members those strings resolve to. Any string
# not listed here is rejected by ``resolve_search_target`` with ``ValueError``.
_SEARCH_TARGETS: dict[str, ContainerKey] = {
"memories": ContainerKey.MEMORIES,
"turns": ContainerKey.TURNS,
}


def resolve_search_target(target: str) -> ContainerKey:
    """Translate a public ``search(target=...)`` string into a ``ContainerKey``.

    ``"memories"`` (the default) selects the facts/episodic/procedural
    container; ``"turns"`` selects the raw conversation log, which is only
    vector-searchable when turn embeddings are enabled.

    Raises:
        ValueError: if ``target`` is not a key of ``_SEARCH_TARGETS``. The
            original ``KeyError`` is preserved as the exception's cause.
    """
    try:
        container_key = _SEARCH_TARGETS[target]
    except KeyError as missing:
        # Surface the accepted values so the caller can correct the argument.
        message = f"Unknown search target {target!r}; valid targets: {sorted(_SEARCH_TARGETS)}"
        raise ValueError(message) from missing
    return container_key


def container_key_for_type(memory_type: str) -> ContainerKey:
"""Return the ``ContainerKey`` that owns documents of ``memory_type``."""
Expand Down
23 changes: 16 additions & 7 deletions azure/cosmos/agent_memory/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ def _resolve_embedding_dimensions(val: Optional[int]) -> int:
"""Resolve embedding dimensions from explicit value or ``AI_FOUNDRY_EMBEDDING_DIMENSIONS`` env var.

Defaults to 1536 (the dimension we ship with for ``text-embedding-3-large``
truncated to 1536, which is the size DiskANN is tuned for in our containers).
truncated to 1536, which is the size our quantizedFlat vector indexes are
tuned for in our containers).

Raises :class:`ConfigurationError` if the env var is set but cannot be
parsed as a positive integer.
Expand Down Expand Up @@ -368,8 +369,14 @@ def _container_policies(
embedding_data_type: str,
distance_function: str,
full_text_language: str,
include_salience_composite: bool = True,
) -> tuple[dict, dict, dict]:
"""Build the vector, indexing, and full-text policies for container creation."""
"""Build the vector, indexing, and full-text policies for container creation.

``include_salience_composite`` adds the ``(salience, created_at, id)``
composite index required by procedural synthesis on the MEMORIES container.
Turns reuse this builder with it disabled (turns are never synthesized).
"""
vector_embedding_policy = {
"vectorEmbeddings": [
{
Expand All @@ -384,25 +391,27 @@ def _container_policies(
indexing_policy = {
"includedPaths": [{"path": "/*"}],
"excludedPaths": [
{"path": "/embedding/*"},
{"path": "/source_memory_ids/*"},
{"path": "/supersedes_ids/*"},
{"path": '/"_etag"/?'},
],
"vectorIndexes": [{"path": "/embedding", "type": "diskANN"}],
"vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
"fullTextIndexes": [{"path": "/content"}],
}

if include_salience_composite:
# Procedural synthesis selects TOP N by (salience DESC, created_at ASC, id ASC).
# Cosmos requires a composite index for multi-property ORDER BY; without it the
# query returns a non-deterministic 50 of N when many docs share the default
# salience (0.5), which makes the source-id short-circuit in synthesize_procedural
# thrash and burn LLM calls on every reconcile.
"compositeIndexes": [
indexing_policy["compositeIndexes"] = [
[
{"path": "/salience", "order": "descending"},
{"path": "/created_at", "order": "ascending"},
{"path": "/id", "order": "ascending"},
]
],
}
]

full_text_policy = {
"defaultLanguage": full_text_language,
Expand Down
34 changes: 19 additions & 15 deletions azure/cosmos/agent_memory/aio/cosmos_memory_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,6 @@

logger = get_logger(__name__)

_TURNS_INDEXING_POLICY = {
"indexingMode": "consistent",
"automatic": True,
"includedPaths": [{"path": "/*"}],
"excludedPaths": [
{"path": "/embedding/?"},
{"path": "/source_memory_ids/*"},
{"path": "/supersedes_ids/*"},
{"path": '/"_etag"/?'},
],
}

_SUMMARIES_INDEXING_POLICY = {
"indexingMode": "consistent",
"automatic": True,
Expand Down Expand Up @@ -96,6 +84,7 @@ def __init__(
embedding_dimensions: Optional[int] = None,
chat_deployment_name: str = "gpt-4o-mini",
use_default_credential: bool = True,
enable_turn_embeddings: Optional[bool] = None,
processor: Optional[AsyncMemoryProcessor] = None,
transcript_metadata_keys: Optional[Iterable[str]] = None,
) -> None:
Expand All @@ -118,6 +107,7 @@ def __init__(
embedding_dimensions=embedding_dimensions,
chat_deployment_name=chat_deployment_name,
use_default_credential=use_default_credential,
enable_turn_embeddings=enable_turn_embeddings,
default_credential_module="azure.identity.aio",
)
self._background_tasks: set[asyncio.Task[Any]] = set()
Expand Down Expand Up @@ -305,12 +295,18 @@ async def create_memory_store(
autoscale_max_ru=self._cosmos_autoscale_max_ru,
throughput_properties_cls=ThroughputProperties,
)
vec_policy, idx_policy, ft_policy = _container_policies(
_policy_kwargs = dict(
embedding_dimensions=embedding_dimensions or self._embedding_dimensions or 1536,
embedding_data_type=_resolve_embedding_data_type(embedding_data_type),
distance_function=_resolve_distance_function(distance_function),
full_text_language=_resolve_full_text_language(full_text_language),
)
vec_policy, idx_policy, ft_policy = _container_policies(**_policy_kwargs)
# Turns always carry the vector index (primed for search) but skip the
# salience composite index, which only procedural synthesis needs.
turns_vec_policy, turns_idx_policy, turns_ft_policy = _container_policies(
**_policy_kwargs, include_salience_composite=False
)
self._memories_container_client = await db.create_container_if_not_exists(
**_build_container_kwargs(
container_id=self._cosmos_container,
Expand All @@ -328,7 +324,9 @@ async def create_memory_store(
partition_key=partition_key,
offer_throughput=offer,
default_ttl=DEFAULT_TTL_BY_TYPE["turn"],
indexing_policy=_TURNS_INDEXING_POLICY,
indexing_policy=turns_idx_policy,
vector_embedding_policy=turns_vec_policy,
full_text_policy=turns_ft_policy,
)
)
logger.info("Created turns container: %s/%s", self._cosmos_database, self._cosmos_turns_container)
Expand Down Expand Up @@ -397,7 +395,11 @@ async def validate_topology(self) -> None:
) from exc

def _build_store(self) -> AsyncMemoryStore:
return AsyncMemoryStore(containers=self._containers, embeddings_client=self._embeddings_client)
return AsyncMemoryStore(
containers=self._containers,
embeddings_client=self._embeddings_client,
enable_turn_embeddings=self._enable_turn_embeddings,
)

def _build_pipeline(self, store: AsyncMemoryStore) -> AsyncPipelineService:
return AsyncPipelineService(
Expand Down Expand Up @@ -664,6 +666,7 @@ async def search_cosmos(
min_confidence: Optional[float] = None,
created_after: Optional[str | datetime] = None,
created_before: Optional[str | datetime] = None,
target: str = "memories",
) -> list[dict[str, Any]]:
return await self._get_store().search(
search_terms=search_terms,
Expand All @@ -682,6 +685,7 @@ async def search_cosmos(
min_confidence=min_confidence,
created_after=created_after,
created_before=created_before,
target=target,
)

async def get_thread(
Expand Down
7 changes: 6 additions & 1 deletion azure/cosmos/agent_memory/aio/processors/inprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,18 @@ def __init__(
from azure.cosmos.agent_memory._container_routing import ContainerKey
from azure.cosmos.agent_memory.aio.services.pipeline import AsyncPipelineService
from azure.cosmos.agent_memory.aio.store import AsyncMemoryStore
from azure.cosmos.agent_memory.thresholds import get_enable_turn_embeddings

containers = {
ContainerKey.TURNS: turns_container,
ContainerKey.MEMORIES: cosmos_container,
ContainerKey.SUMMARIES: summaries_container,
}
store = AsyncMemoryStore(containers=containers, embeddings_client=embeddings_client)
store = AsyncMemoryStore(
containers=containers,
embeddings_client=embeddings_client,
enable_turn_embeddings=get_enable_turn_embeddings(),
)
pipeline = AsyncPipelineService(store, chat_client, embeddings_client, containers=containers)

self._pipeline = pipeline
Expand Down
Loading
Loading