diff --git a/packages/optimization/Makefile b/packages/optimization/Makefile
index 12ad4476..2c9c3153 100644
--- a/packages/optimization/Makefile
+++ b/packages/optimization/Makefile
@@ -19,9 +19,9 @@ test: install
.PHONY: lint
lint: #! Run type analysis and linting checks
lint: install
- uv run mypy src/ldai_optimization
- uv run isort --check --atomic src/ldai_optimization
- uv run pycodestyle src/ldai_optimization
+ uv run mypy src/ldai_optimizer
+ uv run isort --check --atomic src/ldai_optimizer
+ uv run pycodestyle src/ldai_optimizer
.PHONY: build
build: #! Build distribution files
diff --git a/packages/optimization/README.md b/packages/optimization/README.md
index 4f1f64ed..3dc633ec 100644
--- a/packages/optimization/README.md
+++ b/packages/optimization/README.md
@@ -1,9 +1,6 @@
# LaunchDarkly AI SDK — optimization
-[](https://github.com/launchdarkly/python-server-sdk-ai/actions/workflows/ci.yml)
-
-[](https://pypi.org/project/launchdarkly-server-sdk-ai-optimization/)
-[](https://pypi.org/project/launchdarkly-server-sdk-ai-optimization/)
+[![PyPI](https://img.shields.io/pypi/v/ldai_optimizer)](https://pypi.org/project/ldai_optimizer/)
> [!CAUTION]
> This package is in pre-release and not subject to backwards compatibility
@@ -11,17 +8,122 @@
>
> Pin to a specific minor version and review the [changelog](CHANGELOG.md) before upgrading.
-This package will provide helpers to run selected tools against the [LaunchDarkly API](https://apidocs.launchdarkly.com/) from SDK-based workflows. The public surface is not yet finalized; see [CHANGELOG.md](CHANGELOG.md) for updates.
+This package provides helpers for running iterative AI prompt optimization workflows from within LaunchDarkly SDK-based applications. It drives the optimization loop — generating candidate variations, evaluating them with judges, and optionally committing winners back to LaunchDarkly — while delegating all LLM calls to your own handler functions.
+
+## Requirements
+
+- Python `>=3.9`
+- A configured [LaunchDarkly server-side SDK](https://docs.launchdarkly.com/sdk/server-side/python) client
+- The [LaunchDarkly AI package](https://pypi.org/project/launchdarkly-server-sdk-ai/) (`launchdarkly-server-sdk-ai>=0.16.0`) — pulled in automatically as a dependency
+- **`LAUNCHDARKLY_API_KEY` environment variable** — required only when using `auto_commit=True` or `optimize_from_config`. Not needed for basic `optimize_from_options` runs without auto-commit.
+
+> [!NOTE]
+> **`LAUNCHDARKLY_API_KEY` is used exclusively for discrete LaunchDarkly REST API calls** (fetching configs, publishing results). It is never included in any LLM prompt and is never forwarded to your handler callbacks. All API calls made by this package are isolated; they have no access to your runtime environment beyond the key you explicitly provide via the environment variable.
## Installation
```bash
-pip install launchdarkly-server-sdk-ai-optimization
+pip install ldai_optimizer
```
-## Status
+## Quick Start
+
+### Basic optimization (`optimize_from_options`)
-- 3/24/26: Initial package creation
+No `LAUNCHDARKLY_API_KEY` required unless `auto_commit=True`.
+
+```python
+import ldclient
+from ldclient import Context
+from ldai import LDAIClient
+from ldai_optimizer import (
+ OptimizationClient,
+ OptimizationJudge,
+ OptimizationOptions,
+ OptimizationResponse,
+ LLMCallConfig,
+ LLMCallContext,
+)
+
+ldclient.set_config(ldclient.Config("sdk-your-sdk-key"))
+ld = LDAIClient(ldclient.get())
+client = OptimizationClient(ld)
+
+def handle_llm_call(
+ run_id: str,
+ config: LLMCallConfig,
+ context: LLMCallContext,
+ is_evaluation: bool,
+) -> OptimizationResponse:
+ # config.model, config.instructions, config.key are available
+ # context.user_input, context.current_variables are available
+ response = your_llm_client.chat(
+ model=config.model.name if config.model else "gpt-4o",
+ system=config.instructions,
+ user=context.user_input or "",
+ )
+    return OptimizationResponse(output=response.text)
+
+result = await client.optimize_from_options(
+    "my-agent",
+    OptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+ judge_model="gpt-4o-mini",
+ judges={
+ "quality": OptimizationJudge(
+ threshold=1.0,
+ acceptance_statement="The response is accurate and concise.",
+ )
+ },
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ variable_choices=[{"user_id": "user-123"}],
+ user_input_choices=["What is my account balance?"],
+ )
+)
+```
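+
+### Auto-commit and status callbacks
+
+When `auto_commit=True` and `project_key` are set (and `LAUNCHDARKLY_API_KEY` is available), a passing run commits the winning variation back to LaunchDarkly, and progress can be observed through `on_status_update`. A minimal sketch building on the example above; the option fields and the `(status, context)` callback shape follow this release's `OptimizationOptions`, but the callback body is illustrative:
+
+```python
+def log_status(status: str, context) -> None:
+    # status is one of: "init", "generating", "evaluating", "generating variation",
+    # "validating", "turn completed", "success", "failure"
+    print(f"[iteration {context.iteration}] {status}")
+
+result = await client.optimize_from_options(
+    "my-agent",
+    OptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+        judge_model="gpt-4o-mini",
+        judges={
+            "quality": OptimizationJudge(
+                threshold=1.0,
+                acceptance_statement="The response is accurate and concise.",
+            )
+        },
+        model_choices=["gpt-4o", "gpt-4o-mini"],
+        variable_choices=[{"user_id": "user-123"}],
+        user_input_choices=["What is my account balance?"],
+        on_status_update=log_status,
+        auto_commit=True,
+        project_key="my-project",  # required when auto_commit=True
+    )
+)
+```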
+
+### Ground truth optimization
+
+```python
+from ldai_optimizer import GroundTruthOptimizationOptions, GroundTruthSample
+
+result = await client.optimize_from_ground_truth_options(
+    "my-agent",
+    GroundTruthOptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+ judge_model="gpt-4o-mini",
+ judges={
+ "accuracy": OptimizationJudge(
+ threshold=1.0,
+ acceptance_statement="The response matches the expected answer.",
+ )
+ },
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ ground_truth_responses=[
+ GroundTruthSample(
+ user_input="What is 2+2?",
+                expected_response="4",
+ )
+ ],
+ )
+)
+```
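+
+Every sample is judged individually on each attempt. The optional `on_sample_result` callback (a field on `GroundTruthOptimizationOptions` in this release) receives the per-sample `OptimizationContext` as each sample completes. A small sketch:
+
+```python
+def record_sample(ctx) -> None:
+    # ctx.user_input is the sample's question; ctx.scores maps judge keys to results
+    print(ctx.user_input, {name: judge.score for name, judge in ctx.scores.items()})
+
+# Pass on_sample_result=record_sample in GroundTruthOptimizationOptions to enable it.
+```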
+
+### Config-driven optimization (`optimize_from_config`)
+
+Requires `LAUNCHDARKLY_API_KEY`.
+
+```python
+from ldai_optimizer import OptimizationFromConfigOptions
+
+result = await client.optimize_from_config(
+ OptimizationFromConfigOptions(
+ config_key="my-optimization-config",
+ project_key="my-project",
+ handle_agent_call=handle_llm_call,
+ auto_commit=True,
+ )
+)
+```
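+
+Failures from the LaunchDarkly REST API surface as `LDApiError` (exported from `ldai_optimizer`). A sketch of guarding the call, assuming `LDApiError` is the exception raised for failed API requests:
+
+```python
+from ldai_optimizer import LDApiError
+
+try:
+    result = await client.optimize_from_config(
+        OptimizationFromConfigOptions(
+            config_key="my-optimization-config",
+            project_key="my-project",
+            handle_agent_call=handle_llm_call,
+            auto_commit=True,
+        )
+    )
+except LDApiError as err:
+    # A LaunchDarkly API request failed (e.g. invalid key or project)
+    print(f"Optimization aborted: {err}")
+```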
## License
diff --git a/packages/optimization/pyproject.toml b/packages/optimization/pyproject.toml
index 20a6b241..d6c40ee4 100644
--- a/packages/optimization/pyproject.toml
+++ b/packages/optimization/pyproject.toml
@@ -1,7 +1,7 @@
[project]
-name = "launchdarkly-server-sdk-ai-optimization"
+name = "ldai_optimizer"
version = "0.1.0" # x-release-please-version
-description = "LaunchDarkly AI SDK optimization helpers"
+description = "LaunchDarkly AI tool — optimizer"
authors = [{name = "LaunchDarkly", email = "dev@launchdarkly.com"}]
license = {text = "Apache-2.0"}
readme = "README.md"
@@ -42,7 +42,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
-packages = ["src/ldai_optimization"]
+packages = ["src/ldai_optimizer"]
[tool.mypy]
python_version = "3.10"
diff --git a/packages/optimization/src/ldai_optimization/__init__.py b/packages/optimization/src/ldai_optimization/__init__.py
deleted file mode 100644
index a0b379c6..00000000
--- a/packages/optimization/src/ldai_optimization/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""LaunchDarkly AI SDK — optimization.
-
-This package will provide helpers to run selected tools against the LaunchDarkly API from SDK-based workflows.
-"""
-
-from ldai_optimization.client import ApiAgentOptimizationClient
-
-__version__ = "0.1.0" # x-release-please-version
-
-__all__ = [
- '__version__',
- 'ApiAgentOptimizationClient',
-]
diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py
deleted file mode 100644
index 75c38589..00000000
--- a/packages/optimization/src/ldai_optimization/client.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""Client placeholder for LaunchDarkly API tool execution."""
-
-from typing import Any, Dict
-
-
-class ApiAgentOptimizationClient:
- """Coordinates running supported tools against the LaunchDarkly API.
-
- This type is scaffolding; concrete behavior will be added in a future release.
- """
-
- def optimize(self, tool_name: str, parameters: Dict[str, Any]) -> Any:
- """Execute a supported LaunchDarkly API tool by name.
-
- :param tool_name: Identifier of the tool to invoke.
- :param parameters: Tool-specific request parameters.
- :return: Tool-specific response data.
- :raises NotImplementedError: Until the API integration is implemented.
- """
- raise NotImplementedError
diff --git a/packages/optimization/src/ldai_optimizer/__init__.py b/packages/optimization/src/ldai_optimizer/__init__.py
new file mode 100644
index 00000000..beb11cb7
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/__init__.py
@@ -0,0 +1,44 @@
+"""LaunchDarkly AI SDK — optimization.
+
+This package provides helpers for running iterative AI prompt optimization workflows from within LaunchDarkly SDK-based applications.
+"""
+
+from ldai.tracker import TokenUsage
+
+from ldai_optimizer.client import OptimizationClient
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ LLMCallConfig,
+ LLMCallContext,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.ld_api_client import LDApiError
+
+__version__ = "0.0.0"
+
+__all__ = [
+ '__version__',
+ 'AIJudgeCallConfig',
+ 'GroundTruthOptimizationOptions',
+ 'GroundTruthSample',
+ 'LDApiError',
+ 'LLMCallConfig',
+ 'LLMCallContext',
+ 'OptimizationClient',
+ 'OptimizationContext',
+ 'OptimizationFromConfigOptions',
+ 'OptimizationJudge',
+ 'OptimizationJudgeContext',
+ 'OptimizationOptions',
+ 'OptimizationResponse',
+ 'TokenUsage',
+ 'ToolDefinition',
+]
diff --git a/packages/optimization/src/ldai_optimizer/_slug_words.py b/packages/optimization/src/ldai_optimizer/_slug_words.py
new file mode 100644
index 00000000..564c2efb
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/_slug_words.py
@@ -0,0 +1,74 @@
+"""Word lists for slug generation.
+
+Adjectives and nouns curated from the coolname package's word files
+(Apache-2.0 licensed). Used by generate_slug() to produce
+``adjective-noun`` variation keys (e.g. ``blazing-lobster``).
+
+640 × 454 possible combinations from the full coolname corpus would be
+overkill; ~100 × ~100 = ~10 000 combinations is sufficient given that
+_commit_variation already appends a hex suffix on collisions.
+"""
+
+_ADJECTIVES: tuple = (
+ # appearance / texture
+ "blazing", "bouncy", "brawny", "chubby", "curvy", "elastic", "ethereal",
+ "fluffy", "foamy", "furry", "fuzzy", "glaring", "hairy", "hissing",
+ "icy", "luminous", "lumpy", "misty", "noisy", "quiet", "quirky",
+ "radiant", "roaring", "ruddy", "shaggy", "shiny", "silent", "silky",
+ "singing", "skinny", "smooth", "soft", "spicy", "spiked", "sticky",
+ "tall", "venomous", "warm", "winged", "wooden",
+ # personality / disposition
+ "adorable", "amazing", "amiable", "calm", "charming", "cute",
+ "dainty", "easygoing", "elegant", "famous", "friendly", "funny",
+ "graceful", "gracious", "happy", "hilarious", "jolly", "jovial",
+ "kind", "laughing", "lovely", "mellow", "neat", "nifty", "noble",
+ "popular", "pretty", "refreshing", "spiffy", "stylish", "sweet",
+ "tactful", "whimsical",
+ # character / trait
+ "adventurous", "ambitious", "audacious", "bold", "brave", "cheerful",
+ "curious", "daring", "determined", "eager", "enthusiastic", "faithful",
+ "fearless", "fierce", "generous", "gentle", "gleeful", "grateful",
+ "hopeful", "humble", "intrepid", "lively", "loyal", "merry",
+ "mysterious", "optimistic", "passionate", "polite", "proud", "rebel",
+ "relaxed", "reliable", "resolute", "romantic", "sincere", "spirited",
+ "stalwart", "thankful", "upbeat", "valiant", "vigorous", "vivacious",
+ "zealous", "zippy",
+ # quality / impressiveness
+ "ancient", "awesome", "brilliant", "classic", "dazzling", "fabulous",
+ "fantastic", "glorious", "legendary", "magnificent", "majestic",
+ "marvellous", "miraculous", "phenomenal", "remarkable", "splendid",
+ "wonderful",
+ # size
+ "colossal", "enormous", "gigantic", "huge", "massive", "tiny",
+ "towering",
+)
+
+_NOUNS: tuple = (
+ # common mammals
+ "badger", "bat", "bear", "beaver", "bison", "bobcat", "buffalo",
+ "capybara", "cheetah", "chipmunk", "coyote", "dingo", "dormouse",
+ "elephant", "ermine", "ferret", "fox", "gazelle", "gibbon", "gorilla",
+ "groundhog", "hamster", "hare", "hedgehog", "hippo", "horse",
+ "hyena", "jaguar", "kangaroo", "koala", "leopard", "lion", "lynx",
+ "mammoth", "marmot", "meerkat", "mongoose", "monkey", "moose",
+ "otter", "panda", "panther", "porcupine", "puma", "rabbit",
+ "raccoon", "rhinoceros", "seal", "skunk", "sloth", "squirrel",
+ "tiger", "walrus", "weasel", "whale", "wolf", "wombat",
+ "wolverine", "zebra",
+ # birds
+ "condor", "crane", "crow", "dove", "eagle", "falcon", "flamingo",
+ "hawk", "heron", "hummingbird", "kingfisher", "macaw", "magpie",
+ "ostrich", "owl", "parrot", "peacock", "pelican", "penguin",
+ "phoenix", "puffin", "raven", "robin", "sparrow", "starling",
+ "stork", "swan", "toucan", "vulture",
+ # reptiles / amphibians / fish
+ "cobra", "crocodile", "gecko", "iguana", "jellyfish", "lobster",
+ "narwhal", "octopus", "orca", "python", "rattlesnake", "salmon",
+ "seahorse", "shark", "snake", "squid", "tortoise", "turtle",
+ "viper",
+ # legendary / breed
+ "basilisk", "chimera", "chupacabra", "dragon", "griffin",
+ "kraken", "pegasus", "unicorn", "wyvern",
+ "beagle", "bulldog", "collie", "corgi", "dalmatian", "husky",
+ "labrador", "poodle", "rottweiler",
+)
diff --git a/packages/optimization/src/ldai_optimizer/client.py b/packages/optimization/src/ldai_optimizer/client.py
new file mode 100644
index 00000000..c7927d4c
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/client.py
@@ -0,0 +1,2356 @@
+"""Client for LaunchDarkly AI agent optimization.
+
+Security note — LAUNCHDARKLY_API_KEY scope
+-------------------------------------------
+When set, the ``LAUNCHDARKLY_API_KEY`` environment variable is used solely to
+authenticate discrete LaunchDarkly REST API calls (e.g. fetching optimization
+configs, publishing results via ``auto_commit``). It is:
+
+- Never included in any LLM prompt.
+- Never forwarded to user-supplied ``handle_agent_call`` or ``handle_judge_call``
+ callbacks.
+- Never accessible to any external service other than the LaunchDarkly REST API.
+
+All LaunchDarkly API calls are isolated requests; they carry no information
+about the caller's broader runtime environment beyond the key itself.
+"""
+
+import dataclasses
+import json
+import logging
+import os
+import random
+import time
+import uuid
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient
+from ldai.models import LDMessage, ModelConfig
+from ldclient import Context
+
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ HandleJudgeCall,
+ JudgeResult,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.ld_api_client import (
+ AgentOptimizationConfig,
+ AgentOptimizationResultPatch,
+ AgentOptimizationResultPost,
+ LDApiClient,
+)
+from ldai_optimizer.prompts import (
+ _acceptance_criteria_implies_duration_optimization,
+ build_message_history_text,
+ build_new_variation_prompt,
+ build_reasoning_history,
+)
+from ldai_optimizer.util import (
+ RedactionFilter,
+ await_if_needed,
+ extract_json_from_response,
+ generate_slug,
+ interpolate_variables,
+ restore_variable_placeholders,
+ validate_variation_response,
+)
+
+logger = logging.getLogger(__name__)
+logger.addFilter(RedactionFilter())
+
+
+def _find_model_config(
+ model_name: str, configs: List[Dict[str, Any]]
+) -> Optional[Dict[str, Any]]:
+ """Find the best matching model config for a given model name.
+
+ When multiple configs share the same ``id``, the one marked ``global=True``
+ is preferred over project-specific configs. Falls back to the first
+ non-global match if no global entry exists.
+
+ :param model_name: The model id to look up.
+ :param configs: List of model config dicts from the LD API.
+ :return: Best-matching model config dict, or None if no match.
+ """
+ matching = [mc for mc in configs if mc.get("id") == model_name]
+ if not matching:
+ return None
+ global_match = next((mc for mc in matching if mc.get("global") is True), None)
+ return global_match if global_match is not None else matching[0]
+
+
+def _strip_provider_prefix(model: str) -> str:
+ """Strip the provider prefix from a model identifier returned by the LD API.
+
+ API model keys are formatted as "Provider.model-name" (e.g. "OpenAI.gpt-5",
+ "Anthropic.claude-opus-4.6"). Only the part after the first period is needed
+ by the underlying LLM clients. If no period is present the string is returned
+ unchanged.
+
+ :param model: Raw model string from the API.
+ :return: Model name with provider prefix removed.
+ """
+ return model.split(".", 1)[-1]
+
+
+def _compute_validation_count(pool_size: int) -> int:
+ """Compute how many validation samples to run after a candidate passes in chaos mode.
+
+ Scales with the size of the available input/variable pool so that larger
+ option sets receive proportionally more validation coverage, capped at 5.
+ The floor of 2 ensures at least a minimal cross-check even for small pools.
+
+ :param pool_size: Total number of distinct choices in the sampling pool
+ (user_input_options count when provided, otherwise variable_choices count).
+ :return: Number of validation samples to run (between 2 and 5 inclusive).
+ """
+ return min(5, max(2, pool_size // 4))
+
+
+# Maximum number of attempts for variation generation. Transient empty or
+# unparseable responses from the LLM are retried up to this many times before
+# the variation step is treated as a failure.
+_MAX_VARIATION_RETRIES = 3
+
+# Duration gate: a candidate must be at least this much faster than the baseline
+# (history[0].duration_ms) to pass the duration check when acceptance criteria
+# imply a latency optimization goal. 0.80 means the candidate must clock in at
+# under 80% of the baseline — i.e. at least 20% improvement.
+_DURATION_TOLERANCE = 0.80
+
+# Maps SDK status strings to the API status/activity values expected by
+# agent_optimization_result records. Defined at module level to avoid
+# allocating the dict on every on_status_update invocation.
+_OPTIMIZATION_STATUS_MAP: Dict[str, Dict[str, str]] = {
+ "init": {"status": "RUNNING", "activity": "PENDING"},
+ "generating": {"status": "RUNNING", "activity": "GENERATING"},
+ "evaluating": {"status": "RUNNING", "activity": "EVALUATING"},
+ "generating variation": {"status": "RUNNING", "activity": "GENERATING_VARIATION"},
+ "validating": {"status": "RUNNING", "activity": "EVALUATING"},
+ "turn completed": {"status": "RUNNING", "activity": "COMPLETED"},
+ "success": {"status": "PASSED", "activity": "COMPLETED"},
+ "failure": {"status": "FAILED", "activity": "COMPLETED"},
+}
+
+
+class OptimizationClient:
+ _options: OptimizationOptions
+ _ldClient: LDAIClient
+ _agent_config: AIAgentConfig
+ _has_api_key: bool
+ _api_key: Optional[str]
+ _agent_key: str
+ _initial_instructions: str
+
+ def __init__(self, ldClient: LDAIClient) -> None:
+ self._ldClient = ldClient
+ self._last_run_succeeded: bool = False
+ self._last_succeeded_context: Optional[OptimizationContext] = None
+ self._last_optimization_result_id: Optional[str] = None
+ self._initial_tool_keys: List[str] = []
+ self._total_token_usage: int = 0
+
+        api_key = os.environ.get("LAUNCHDARKLY_API_KEY")
+        if api_key:
+            self._has_api_key = True
+            self._api_key = api_key
+        else:
+            self._has_api_key = False
+            self._api_key = None
+            logger.warning(
+                "LAUNCHDARKLY_API_KEY is not set, functionality will be limited"
+            )
+
+ def _initialize_class_members_from_config(
+ self, agent_config: AIAgentConfig
+ ) -> None:
+ if not agent_config.instructions:
+ raise ValueError(
+ f"Agent '{agent_config.key}' has no instructions configured. "
+ "Ensure the agent flag has instructions set before running an optimization."
+ )
+ self._current_instructions = agent_config.instructions
+ self._current_parameters: Dict[str, Any] = (
+ agent_config.model._parameters if agent_config.model else None
+ ) or {}
+ self._current_model: Optional[str] = (
+ agent_config.model.name if agent_config.model else None
+ )
+ self._history: List[OptimizationContext] = []
+
+ def _build_agent_config_for_context(
+ self, ctx: OptimizationContext, skip_interpolation: bool = False
+ ) -> AIAgentConfig:
+ """
+ Construct an AIAgentConfig that reflects the current optimization iteration.
+
+ Uses the instructions, model, and parameters from the given context so the
+ caller receives the variation being evaluated rather than the original base config.
+ ``{{placeholder}}`` tokens in the instructions are substituted using
+ ctx.current_variables at call time so the stored template is never mutated.
+
+ :param ctx: The OptimizationContext for this iteration
+ :param skip_interpolation: When True, skip variable interpolation on the
+ instructions. Use this when the instructions are a meta-prompt (e.g. a
+ variation-generation prompt) that deliberately contains ``{{key}}`` tokens
+ as text for the LLM to read rather than as runtime substitution targets.
+ :return: A fresh AIAgentConfig populated from the context's current state
+ """
+ instructions = (
+ interpolate_variables(ctx.current_instructions, ctx.current_variables)
+ if ctx.current_variables and not skip_interpolation
+ else ctx.current_instructions
+ )
+ return AIAgentConfig(
+ key=self._agent_key,
+ enabled=True,
+ create_tracker=self._agent_config.create_tracker,
+ model=ModelConfig(
+ name=ctx.current_model or "",
+ parameters=ctx.current_parameters,
+ ),
+ instructions=instructions,
+ provider=self._agent_config.provider,
+ )
+
+ def _create_optimization_context(
+ self,
+ iteration: int,
+ variables: Dict[str, Any],
+ user_input: Optional[str] = None,
+ completion_response: str = "",
+ scores: Optional[Dict[str, JudgeResult]] = None,
+ ) -> OptimizationContext:
+ """
+        Create an OptimizationContext with current state.
+
+ :param iteration: Current iteration number
+ :param variables: Variable set chosen for this iteration
+ :param user_input: Optional user input for this iteration
+ :param completion_response: Completion response string
+ :param scores: Optional dictionary of judge results
+        :return: A new OptimizationContext instance
+ """
+ flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history]
+ return OptimizationContext(
+ scores=scores or {},
+ completion_response=completion_response,
+ current_instructions=self._current_instructions,
+ current_parameters=self._current_parameters.copy(),
+ current_variables=variables,
+ current_model=self._current_model,
+ user_input=user_input,
+ history=tuple(flat_history),
+ iteration=iteration,
+ )
+
+ @property
+ def _judge_call(self) -> HandleJudgeCall:
+ """Return the judge callable, falling back to handle_agent_call when not set."""
+ return self._options.handle_judge_call or self._options.handle_agent_call
+
+ def _safe_status_update(
+ self,
+ status: Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+ ],
+ context: OptimizationContext,
+ iteration: int,
+ ) -> None:
+ """
+ Safely call on_status_update callback, catching and logging errors.
+
+ :param status: The status string to pass to the callback
+ :param context: The optimization context to pass to the callback
+ :param iteration: Current iteration number for logging
+ """
+ if self._options.on_status_update:
+ try:
+ self._options.on_status_update(status, context.copy_without_history())
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_status_update callback failed", iteration
+ )
+
+ def _judge_config(
+ self,
+ judge_key: str,
+ context: Context,
+ default: AIJudgeConfigDefault,
+ variables: Dict[str, Any],
+ ) -> AIJudgeConfig:
+ """
+ Fetch a judge configuration from the LaunchDarkly client.
+
+ Thin wrapper around LDAIClient.judge_config so callers do not need a
+ direct reference to the client.
+
+ :param judge_key: The key for the judge configuration in LaunchDarkly
+ :param context: The evaluation context
+ :param default: Fallback config when the flag is disabled or unreachable
+ :param variables: Template variables for instruction interpolation
+ :return: The resolved AIJudgeConfig
+ """
+ return self._ldClient.judge_config(judge_key, context, default, variables)
+
+ def _serialize_scores(
+ self, judge_results: Dict[str, JudgeResult]
+ ) -> Dict[str, Any]:
+ """
+ Convert judge results to a JSON-serializable dictionary.
+
+ :param judge_results: Dictionary of judge keys to JudgeResult instances
+ :return: Dictionary suitable for json.dumps
+ """
+ return {key: result.to_json() for key, result in judge_results.items()}
+
+ def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[ToolDefinition]:
+ """
+ Extract and normalise the tools list from agent parameters.
+
+ Reads the ``tools`` key from *parameters* (if present) and converts
+ every entry to a ToolDefinition so judges receive typed objects.
+
+ :param parameters: The agent's current_parameters dict
+ :return: List of ToolDefinition instances, empty list if no tools are configured
+ """
+ raw_tools = parameters.get("tools", [])
+ if not raw_tools:
+ return []
+ if not isinstance(raw_tools, list):
+ raw_tools = [raw_tools]
+
+ result = []
+ for tool in raw_tools:
+ if isinstance(tool, ToolDefinition):
+ result.append(tool)
+ elif hasattr(tool, "to_dict"):
+ result.append(ToolDefinition.from_dict(tool.to_dict()))
+ elif isinstance(tool, dict):
+ result.append(ToolDefinition.from_dict(tool))
+ return result
+
+ def _parse_judge_response(
+ self,
+ response_str: str,
+ judge_key: str,
+ judge_identifier: str,
+ iteration: int,
+ clamp_score: bool = True,
+ ) -> JudgeResult:
+ """
+ Parse a structured LLM judge response into a JudgeResult.
+
+ Expects a JSON object with "score" (float) and optionally "rationale"
+ (str). On any parsing failure, logs the exception and returns a zero score.
+
+ :param response_str: Raw string response from the judge LLM
+ :param judge_key: Key used to identify this judge in results dicts
+ :param judge_identifier: Human-readable identifier for log messages
+ :param iteration: Current iteration number for logging
+ :param clamp_score: When True, clamps score to [0.0, 1.0]
+ :return: Parsed JudgeResult, or a zero-score result on failure
+ """
+ try:
+ response_data = extract_json_from_response(response_str)
+ score = float(response_data.get("score", 0.0))
+ if clamp_score:
+ score = max(0.0, min(1.0, score))
+ rationale = response_data.get("rationale")
+ return JudgeResult(score=score, rationale=rationale)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> Failed to parse judge response for %s",
+ iteration,
+ judge_identifier,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ async def _call_judges(
+ self,
+ completion_response: str,
+ iteration: int,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ agent_duration_ms: Optional[float] = None,
+ ) -> Dict[str, JudgeResult]:
+ """
+ Call all judges in parallel (auto-path).
+
+ For judges with judge_key: Fetches judge config on-demand from LaunchDarkly SDK.
+ For judges with acceptance_statement: Uses handle_judge_call callback.
+
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param user_input: The user's question for this turn, forwarded to judges so
+ they know what was actually asked (the current turn is not yet in
+ self._history when judges run)
+ :param variables: The variable set that was used during the agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ judges are instructed to factor it into their scoring alongside acceptance criteria.
+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+ Forwarded to acceptance judges whose statement implies a latency goal so they
+ can mention the duration change in their rationale.
+ :return: Dictionary of judge results (score and rationale)
+ """
+ if not self._options.judges:
+ return {}
+
+ resolved_variables: Dict[str, Any] = variables or {}
+ resolved_agent_tools: List[ToolDefinition] = agent_tools or []
+
+ logger.info("[Iteration %d] -> Executing evaluation...", iteration)
+ reasoning_history = build_reasoning_history(self._history)
+ judge_results: Dict[str, JudgeResult] = {}
+
+ judge_count = len(self._options.judges)
+ for idx, (judge_key, optimization_judge) in enumerate(
+ self._options.judges.items(), 1
+ ):
+ judge_type = (
+ "config" if optimization_judge.judge_key is not None else "acceptance"
+ )
+ logger.info(
+ "[Iteration %d] -> Running judge %d/%d '%s' (%s)...",
+ iteration,
+ idx,
+ judge_count,
+ judge_key,
+ judge_type,
+ )
+ try:
+ if optimization_judge.judge_key is not None:
+ result = await self._evaluate_config_judge(
+ judge_key,
+ optimization_judge,
+ completion_response,
+ iteration,
+ reasoning_history,
+ user_input=user_input,
+ variables=resolved_variables,
+ agent_tools=resolved_agent_tools,
+ expected_response=expected_response,
+ )
+ judge_results[judge_key] = result
+ else:
+ result = await self._evaluate_acceptance_judge(
+ judge_key,
+ optimization_judge,
+ completion_response,
+ iteration,
+ reasoning_history,
+ user_input=user_input,
+ variables=resolved_variables,
+ agent_tools=resolved_agent_tools,
+ expected_response=expected_response,
+ agent_duration_ms=agent_duration_ms,
+ )
+ judge_results[judge_key] = result
+
+ threshold = (
+ optimization_judge.threshold
+ if optimization_judge.threshold is not None
+ else 1.0
+ )
+ passed = result.score >= threshold
+ logger.debug(
+ "[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s",
+ iteration,
+ judge_key,
+ result.score,
+ threshold,
+ "PASSED" if passed else "FAILED",
+ f" | {result.rationale}" if result.rationale else "",
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> Judge %s evaluation failed", iteration, judge_key
+ )
+ judge_results[judge_key] = JudgeResult(score=0.0, rationale=None)
+
+ judge_results_json = self._serialize_scores(judge_results)
+ logger.debug(
+ "[Iteration %d] -> Evaluation result: %s",
+ iteration,
+ json.dumps(judge_results_json, indent=2),
+ )
+ return judge_results
+
+ async def _evaluate_config_judge(
+ self,
+ judge_key: str,
+ optimization_judge: "OptimizationJudge",
+ completion_response: str,
+ iteration: int,
+ reasoning_history: str,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ ) -> JudgeResult:
+ """
+ Evaluate using a config-type judge (with judge_key).
+
+ :param judge_key: The key for this judge in the judges dict
+ :param optimization_judge: The optimization judge configuration
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param reasoning_history: Formatted string of reasoning from previous iterations
+ :param user_input: The user's question for this turn
+ :param variables: The variable set that was used during agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into template variables and judge messages.
+ :return: The judge result with score and rationale
+ """
+ # Config-type judge: fetch judge config on-demand from LaunchDarkly SDK
+ input_text = self._current_instructions or ""
+ # Combine current instructions, history, and current question for message_history
+ message_history_text = build_message_history_text(
+ self._history, input_text, reasoning_history, user_input
+ )
+
+ # Merge agent variables so the judge's LD-managed instructions can reference
+ # {{variable_name}} tokens alongside the standard judge template variables.
+ template_variables: Dict[str, Any] = {
+ **(variables or {}),
+ "message_history": message_history_text,
+ "response_to_evaluate": completion_response,
+ }
+ if expected_response is not None:
+ template_variables["expected_response"] = expected_response
+
+ assert optimization_judge.judge_key is not None
+ judge_config = self._judge_config(
+ optimization_judge.judge_key,
+ self._options.context_choices[0],
+ AIJudgeConfigDefault(enabled=False),
+ template_variables,
+ )
+
+ if not judge_config.enabled:
+ logger.warning(
+ "[Iteration %d] -> Judge %s is disabled",
+ iteration,
+ optimization_judge.judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ if not judge_config.messages:
+ logger.warning(
+ "[Iteration %d] -> Judge %s has no messages",
+ iteration,
+ optimization_judge.judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ # Split messages into system and user turns.
+ # System turns are joined into a single instructions string (agents SDK path).
+ # All messages are forwarded as-is for the completions path.
+ system_parts = []
+ user_parts = []
+ for msg in judge_config.messages:
+ if msg.role == "system":
+ system_parts.append(
+ msg.content
+ + " Return your response as a JSON object with 'score' and 'rationale' fields."
+ )
+ elif msg.role == "user":
+ user_parts.append(msg.content)
+
+ instructions = "\n\n".join(system_parts)
+ judge_user_input = (
+ "\n\n".join(user_parts)
+ if user_parts
+ else f"Here is the response to evaluate: {completion_response}"
+ )
+
+ if expected_response is not None:
+ judge_user_input += (
+ f"\n\nHere is the expected response: {expected_response}"
+ "\n\nEvaluate the actual response against both the acceptance criteria AND "
+ "how closely it matches the expected response. Factor both into your score."
+ )
+
+ # Rebuild the message list with the updated system content so completions users
+ # receive the same scoring instructions that are baked into `instructions`.
+ updated_messages: List[LDMessage] = [
+ LDMessage(role="system", content=instructions),
+ LDMessage(role="user", content=judge_user_input),
+ ]
+
+ # Always use the global judge_model; model parameters (temperature, etc.) from
+ # the judge flag are still forwarded, but the model name is never overridden.
+ model_name = self._options.judge_model
+ model_params: Dict[str, Any] = {}
+ tools: List[ToolDefinition] = []
+ if judge_config.model and judge_config.model._parameters:
+ existing_tools = judge_config.model._parameters.get("tools")
+ if existing_tools:
+ raw = (
+ existing_tools
+ if isinstance(existing_tools, list)
+ else [existing_tools]
+ )
+ for t in raw:
+ if isinstance(t, ToolDefinition):
+ tools.append(t)
+ elif hasattr(t, "to_dict"):
+ tools.append(ToolDefinition.from_dict(t.to_dict()))
+ elif isinstance(t, dict):
+ tools.append(ToolDefinition.from_dict(t))
+ model_params = {
+ k: v for k, v in judge_config.model._parameters.items() if k != "tools"
+ }
+
+ # Prepend agent tools so the judge can call them when verifying the response
+ if agent_tools:
+ tools = list(agent_tools) + tools
+
+ tool_params = {"tools": [t.to_dict() for t in tools]} if tools else {}
+ judge_call_config = AIJudgeCallConfig(
+ key=judge_key,
+ model=ModelConfig(
+ name=model_name,
+ parameters={**model_params, **tool_params},
+ ),
+ instructions=instructions,
+ messages=updated_messages,
+ )
+
+ judge_ctx = OptimizationJudgeContext(
+ user_input=judge_user_input,
+ current_variables=variables or {},
+ )
+
+ _judge_start = time.monotonic()
+ result = self._judge_call(
+ judge_key, judge_call_config, judge_ctx, True
+ )
+ judge_response: OptimizationResponse = await await_if_needed(result)
+ judge_duration_ms = (time.monotonic() - _judge_start) * 1000
+ judge_response_str = judge_response.output
+
+ logger.debug(
+ "[Iteration %d] -> Judge response (%s): %s",
+ iteration,
+ judge_key,
+ judge_response_str,
+ )
+
+ # Parse judge response — expect structured JSON output
+ judge_identifier = optimization_judge.judge_key or judge_key
+ judge_result = self._parse_judge_response(
+ judge_response_str,
+ judge_key,
+ judge_identifier,
+ iteration,
+ clamp_score=False,
+ )
+ return dataclasses.replace(judge_result, duration_ms=judge_duration_ms, usage=judge_response.usage)
+
+ async def _evaluate_acceptance_judge(
+ self,
+ judge_key: str,
+ optimization_judge: "OptimizationJudge",
+ completion_response: str,
+ iteration: int,
+ reasoning_history: str,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ agent_duration_ms: Optional[float] = None,
+ ) -> JudgeResult:
+ """
+ Evaluate using an acceptance statement judge.
+
+ :param judge_key: The key for this judge in the judges dict
+ :param optimization_judge: The optimization judge configuration
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param reasoning_history: Formatted string of reasoning from previous iterations
+ :param user_input: The user's question for this turn
+ :param variables: The variable set that was used during agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into instructions and judge message so the judge can score actual vs. expected.
+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+ When the acceptance statement implies a latency goal, the judge is instructed
+ to mention the duration change in its rationale.
+ :return: The judge result with score and rationale
+ """
+ if not optimization_judge.acceptance_statement:
+ logger.error(
+ "[Iteration %d] -> Judge %s has no acceptance_statement",
+ iteration,
+ judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ resolved_variables = variables or {}
+ resolved_agent_tools = agent_tools or []
+
+ # Build message history including the current user question
+ message_history_text = build_message_history_text(
+ self._history, "", reasoning_history, user_input
+ )
+
+ # Build instructions for the judge
+ instructions = (
+ "You are a judge that evaluates the response to the user's question.\n\n"
+ "Here is the statement that you should evaluate the response against: "
+ f"'{optimization_judge.acceptance_statement}'\n"
+ f"Here is the history of all messages between the user and the assistant: {message_history_text}\n"
+ "You should score the response based on how well it meets the acceptance statement "
+ "using a score between 0.0 and 1.0.\n"
+ "A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly.\n"
+ "A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well.\n"
+ "A score of 0.0-0.3 means that it does not match well at all. "
+ "You can return any value between 0.0 and 1.0.\n"
+ "You should also provide a rationale for your score.\n"
+ "Return your response as a JSON object with 'score' and 'rationale' fields.\n\n"
+ 'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}'
+ )
+
+ if (
+ agent_duration_ms is not None
+ and _acceptance_criteria_implies_duration_optimization(
+ {judge_key: optimization_judge}
+ )
+ ):
+ baseline_ms = (
+ self._history[0].duration_ms
+ if self._history and self._history[0].duration_ms is not None
+ else None
+ )
+ instructions += (
+ f"\n\nThe acceptance criteria for this judge includes a latency/duration goal. "
+ f"The agent's response took {agent_duration_ms:.0f}ms to generate. "
+ )
+ if baseline_ms is not None:
+ delta_ms = agent_duration_ms - baseline_ms
+ direction = "faster" if delta_ms < 0 else "slower"
+ instructions += (
+ f"The baseline duration (first iteration) was {baseline_ms:.0f}ms. "
+ f"This response was {abs(delta_ms):.0f}ms {direction} than the baseline. "
+ )
+ instructions += (
+ "Please mention the duration and any change from baseline in your rationale."
+ )
+
+ if resolved_variables:
+ instructions += f"\n\nThe following variables were available to the agent: {json.dumps(resolved_variables)}"
+
+ if resolved_agent_tools:
+ tool_names = [t.name for t in resolved_agent_tools]
+ instructions += (
+ "\n\nThe following tools were available to the agent and "
+ f"may be called by you to verify the response: {json.dumps(tool_names)}."
+ "\nIf verifying the response requires looking up external information, "
+ "call the appropriate tool before scoring. "
+ "You should only call the tools for the most recent response, "
+ "and should only call the tools if necessary. "
+ "Assume that previous feedback will have addressed bad tool call results from prior iterations."
+ )
+
+ # Agent tools are passed through so the judge can invoke them for verification
+ tools: List[ToolDefinition] = list(resolved_agent_tools)
+
+ judge_user_input = f"Here is the response to evaluate: {completion_response}"
+ if expected_response is not None:
+ judge_user_input += (
+ f"\n\nHere is the expected response: {expected_response}"
+ "\n\nEvaluate the actual response against both the acceptance statement AND "
+ "how closely it matches the expected response. Factor both into your score."
+ )
+
+ tool_params = {"tools": [t.to_dict() for t in tools]} if tools else {}
+ judge_call_config = AIJudgeCallConfig(
+ key=judge_key,
+ model=ModelConfig(
+ name=self._options.judge_model,
+ parameters=tool_params,
+ ),
+ instructions=instructions,
+ messages=[
+ LDMessage(role="system", content=instructions),
+ LDMessage(role="user", content=judge_user_input),
+ ],
+ )
+
+ judge_ctx = OptimizationJudgeContext(
+ user_input=judge_user_input,
+ current_variables=resolved_variables,
+ )
+
+ _judge_start = time.monotonic()
+ result = self._judge_call(
+ judge_key, judge_call_config, judge_ctx, True
+ )
+ judge_response: OptimizationResponse = await await_if_needed(result)
+ judge_duration_ms = (time.monotonic() - _judge_start) * 1000
+ judge_response_str = judge_response.output
+
+ logger.debug(
+ "[Iteration %d] -> Judge response (%s): %s",
+ iteration,
+ judge_key,
+ judge_response_str,
+ )
+
+ # Parse judge response — expect structured JSON output with score and rationale
+ judge_result = self._parse_judge_response(
+ judge_response_str, judge_key, judge_key, iteration, clamp_score=True
+ )
+ return dataclasses.replace(judge_result, duration_ms=judge_duration_ms, usage=judge_response.usage)
+
+ async def _get_agent_config(
+ self, agent_key: str, context: Context
+ ) -> AIAgentConfig:
+ """
+ Fetch the agent configuration, replacing the instructions with the raw variation
+ template so that {{placeholder}} tokens are preserved for client-side interpolation.
+
+ agent_config() is called normally so we get a fully populated AIAgentConfig
+ (including the tracker). We then call variation() separately to retrieve the
+ unrendered instruction template and swap it in, keeping everything else intact.
+
+ :param agent_key: The key for the agent to get the configuration for
+ :param context: The evaluation context
+ :return: AIAgentConfig with raw {{placeholder}} instruction templates intact
+ """
+ try:
+ agent_config = self._ldClient.agent_config(agent_key, context)
+
+ # variation() returns the raw JSON before chevron.render(), so instructions
+ # still contain {{placeholder}} tokens rather than empty strings.
+ raw_variation = self._ldClient._client.variation(agent_key, context, {})
+ raw_instructions = raw_variation.get(
+ "instructions", agent_config.instructions
+ )
+ if not raw_instructions:
+ raise ValueError(
+ f"Agent '{agent_key}' has no instructions configured. "
+ "Ensure the agent flag has instructions set before running an optimization."
+ )
+ self._initial_instructions = raw_instructions
+
+ raw_tools = raw_variation.get("tools", [])
+ self._initial_tool_keys = [
+ t["key"]
+ for t in raw_tools
+ if isinstance(t, dict) and "key" in t
+ ]
+
+ agent_config = dataclasses.replace(
+ agent_config, instructions=raw_instructions
+ )
+ self._initialize_class_members_from_config(agent_config)
+ return agent_config
+ except Exception:
+ logger.exception("[Optimization] -> Failed to get agent configuration")
+ raise
+
+ async def optimize_from_options(
+ self, agent_key: str, options: OptimizationOptions
+ ) -> Any:
+ """Execute an optimization on the given agent with the given options.
+
+ :param agent_key: Identifier of the agent to optimize.
+ :param options: Optimization options.
+ :return: Optimization result.
+ """
+ if options.auto_commit:
+ if not self._has_api_key:
+ raise ValueError(
+ "auto_commit requires LAUNCHDARKLY_API_KEY to be set"
+ )
+ if not options.project_key:
+ raise ValueError(
+ "auto_commit requires project_key to be set on OptimizationOptions"
+ )
+ self._agent_key = agent_key
+ context = random.choice(options.context_choices)
+ agent_config = await self._get_agent_config(agent_key, context)
+ result = await self._run_optimization(agent_config, options)
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key, # type: ignore[arg-type]
+ ai_config_key=agent_key,
+ output_key=options.output_key,
+ base_url=options.base_url,
+ )
+ return result
+
+ async def optimize_from_ground_truth_options(
+ self, agent_key: str, options: GroundTruthOptimizationOptions
+ ) -> List[OptimizationContext]:
+ """Execute a ground truth optimization on the given agent.
+
+ Unlike optimize_from_options (which tests random choices until one passes),
+ this path evaluates all N ground truth samples in each attempt and only
+ succeeds when every sample passes its judges. A new variation is generated
+ whenever any sample fails, and all N samples are re-evaluated from scratch
+ with the updated configuration, up to max_attempts.
+
+ :param agent_key: Identifier of the agent to optimize.
+ :param options: Ground truth optimization options including the ordered sample list.
+ :return: List of OptimizationContexts from the final attempt (one per sample).
+ """
+ if options.auto_commit:
+ if not self._has_api_key:
+ raise ValueError(
+ "auto_commit requires LAUNCHDARKLY_API_KEY to be set"
+ )
+ if not options.project_key:
+ raise ValueError(
+ "auto_commit requires project_key to be set on GroundTruthOptimizationOptions"
+ )
+ self._agent_key = agent_key
+ context = random.choice(options.context_choices)
+ agent_config = await self._get_agent_config(agent_key, context)
+ result = await self._run_ground_truth_optimization(agent_config, options)
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key, # type: ignore[arg-type]
+ ai_config_key=agent_key,
+ output_key=options.output_key,
+ base_url=options.base_url,
+ )
+ return result
+
+ async def _run_ground_truth_optimization(
+ self,
+ agent_config: AIAgentConfig,
+ gt_options: GroundTruthOptimizationOptions,
+ ) -> List[OptimizationContext]:
+ """Run the ground truth optimization loop.
+
+ Uses the "bridge" pattern to reuse existing internal methods (judge evaluation,
+ variation generation, status callbacks) for the ground truth optimization.
+
+ :param agent_config: Agent configuration from LaunchDarkly.
+ :param gt_options: Ground truth options supplied by the caller.
+ :return: List of OptimizationContexts from the final attempt (one per sample).
+ """
+ bridge = OptimizationOptions(
+ context_choices=gt_options.context_choices,
+ max_attempts=gt_options.max_attempts,
+ model_choices=gt_options.model_choices,
+ judge_model=gt_options.judge_model,
+ variable_choices=[s.variables for s in gt_options.ground_truth_responses],
+ handle_agent_call=gt_options.handle_agent_call,
+ handle_judge_call=gt_options.handle_judge_call,
+ judges=gt_options.judges,
+ on_turn=gt_options.on_turn,
+ on_passing_result=gt_options.on_passing_result,
+ on_failing_result=gt_options.on_failing_result,
+ on_status_update=gt_options.on_status_update,
+ token_limit=gt_options.token_limit,
+ )
+ self._options = bridge
+ self._agent_config = agent_config
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._last_optimization_result_id = None
+ self._total_token_usage = 0
+ self._initialize_class_members_from_config(agent_config)
+
+ # Seed from the first model choice on the first iteration
+ # so agent calls never receive an empty model string.
+ if not self._current_model and bridge.model_choices:
+ self._current_model = bridge.model_choices[0]
+ logger.debug(
+ "[GT] -> No model in agent config; defaulting to first model choice: %s",
+ self._current_model,
+ )
+
+ samples = gt_options.ground_truth_responses
+ n = len(samples)
+
+ initial_context = self._create_optimization_context(
+ iteration=0,
+ variables=samples[0].variables,
+ )
+ self._safe_status_update("init", initial_context, 0)
+
+ # Attempt tracks the current "batch" loop that runs
+ # through all N samples. Iteration in this context refers to the
+ # total number of batch runs so far.
+ attempt = 0
+ while True:
+ attempt += 1
+ logger.info(
+ "[GT Attempt %d/%d] -> Starting ground truth run (%d samples, model=%s)",
+ attempt,
+ gt_options.max_attempts,
+ n,
+ self._current_model,
+ )
+
+ attempt_results: List[OptimizationContext] = []
+ all_passed = True
+ failed_count = 0
+
+ # Now iterate through each individual sample in the batch,
+ # creating a new context for each sample + running judges etc.
+ for i, sample in enumerate(samples):
+ linear_iter = (attempt - 1) * n + i + 1
+ truncated = len(sample.user_input) > 100
+ logger.info(
+ "[GT Attempt %d] -> Sample %d/%d (user_input=%.100s%s)",
+ attempt,
+ i + 1,
+ n,
+ sample.user_input,
+ "..." if truncated else "",
+ )
+
+ optimize_context = self._create_optimization_context(
+ iteration=linear_iter,
+ user_input=sample.user_input,
+ variables=sample.variables,
+ )
+
+ self._safe_status_update("generating", optimize_context, linear_iter)
+ optimize_context = await self._execute_agent_turn(
+ optimize_context,
+ linear_iter,
+ expected_response=sample.expected_response,
+ )
+ self._accumulate_tokens(optimize_context)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[GT Attempt %d] -> Token limit exceeded on sample %d (total=%d)",
+ attempt,
+ i + 1,
+ self._total_token_usage,
+ )
+ attempt_results.append(optimize_context)
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", optimize_context, linear_iter)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # Per-sample pass/fail check
+ if self._options.on_turn is not None:
+ try:
+ sample_passed = self._options.on_turn(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> Sample %d on_turn evaluation failed",
+ attempt,
+ i + 1,
+ )
+ sample_passed = False
+ else:
+ sample_passed = self._evaluate_response(optimize_context)
+
+ if sample_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ sample_passed = self._evaluate_duration(optimize_context)
+
+ if not sample_passed:
+ logger.info(
+ "[GT Attempt %d] -> Sample %d/%d FAILED",
+ attempt,
+ i + 1,
+ n,
+ )
+ all_passed = False
+ failed_count += 1
+ else:
+ logger.debug(
+ "[GT Attempt %d] -> Sample %d/%d passed",
+ attempt,
+ i + 1,
+ n,
+ )
+
+ attempt_results.append(optimize_context)
+
+ if gt_options.on_sample_result is not None:
+ try:
+ gt_options.on_sample_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_sample_result callback failed for sample %d",
+ attempt,
+ i + 1,
+ )
+
+ last_ctx = attempt_results[-1]
+
+ if all_passed:
+ logger.info(
+ "[GT Attempt %d] -> All %d samples passed — optimization succeeded",
+ attempt,
+ n,
+ )
+ self._last_run_succeeded = True
+ self._last_succeeded_context = last_ctx
+ self._safe_status_update("success", last_ctx, last_ctx.iteration)
+ if self._options.on_passing_result:
+ try:
+ self._options.on_passing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_passing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # We've hit max attempts for the batches, bail at this point
+ if attempt >= gt_options.max_attempts:
+ logger.warning(
+ "[GT Optimization] -> Failed after %d attempt(s) — not all samples passed",
+ attempt,
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", last_ctx, last_ctx.iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # Append all N results to history so the variation generator has full context
+ # from all of the previous samples
+ self._history.extend(attempt_results)
+
+ logger.info(
+ "[GT Attempt %d] -> %d/%d samples failed — generating new variation",
+ attempt,
+ failed_count,
+ n,
+ )
+ try:
+ await self._generate_new_variation(last_ctx.iteration, last_ctx.current_variables)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> Variation generation failed", attempt
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", last_ctx, last_ctx.iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ self._safe_status_update("turn completed", last_ctx, last_ctx.iteration)
+
+ # Every branch inside the while True loop returns explicitly (success, max-attempts
+ # exhaustion, or variation-generation failure). This line is structurally unreachable,
+ # but without it type checkers infer the return type as List[OptimizationContext] | None
+ # because they don't always treat `while True` as exhaustive. The RuntimeError makes
+ # the intent unambiguous and causes a loud failure if that invariant is ever broken.
+ raise RuntimeError("unreachable: ground truth loop exited without returning")
+
+ def _apply_new_variation_response(
+ self,
+ response_data: Dict[str, Any],
+ variation_ctx: OptimizationContext,
+ response_str: str,
+ iteration: int,
+ ) -> OptimizationContext:
+ """
+ Validate the parsed variation response, mutate instance state, and return
+ an updated OptimizationContext reflecting the new configuration.
+
+ Updates self._current_instructions, self._current_parameters, and
+ self._current_model in place so subsequent turns use the new configuration.
+
+ :param response_data: Parsed JSON dict from the LLM variation response
+ :param variation_ctx: The context that was sent to the LLM (used to carry history/iteration)
+ :param response_str: The raw response string (stored as completion_response)
+ :param iteration: Current iteration number for logging
+ :return: A new OptimizationContext populated with the updated configuration
+ """
+ validation_errors = validate_variation_response(response_data)
+ if validation_errors:
+ logger.debug(
+ "[Iteration %d] -> Variation response failed validation: %s. "
+ "Received fields: %s. Full response_data: %s",
+ iteration,
+ "; ".join(validation_errors),
+ list(response_data.keys()),
+ json.dumps(response_data, indent=2),
+ )
+ raise ValueError(
+ f"Variation response failed validation: {'; '.join(validation_errors)}. "
+ f"Received fields: {list(response_data.keys())}"
+ )
+
+ self._current_instructions = response_data["current_instructions"]
+
+ # Post-process: replace any leaked variable values back to {{key}} form.
+ # This is a deterministic safety net for when the LLM ignores the prompt
+ # instructions and hardcodes a concrete value (e.g. "user-123") instead
+ # of the placeholder ("{{user_id}}").
+ self._current_instructions, placeholder_warnings = restore_variable_placeholders(
+ self._current_instructions,
+ self._options.variable_choices,
+ )
+ for msg in placeholder_warnings:
+ logger.warning("[Iteration %d] -> %s", iteration, msg)
+
+ self._current_parameters = response_data["current_parameters"]
+
+ # Update model — it should always be provided since it's required in the schema
+ model_value = (
+ response_data.get("model", "").strip()
+ if isinstance(response_data.get("model"), str)
+ else response_data.get("model")
+ )
+ if not model_value:
+ logger.warning(
+ "[Iteration %d] -> Model field is empty or None in response, keeping current model %s",
+ iteration,
+ self._current_model,
+ )
+ elif model_value not in self._options.model_choices:
+ logger.warning(
+ "[Iteration %d] -> Model '%s' not in model_choices %s, keeping current model %s",
+ iteration,
+ model_value,
+ self._options.model_choices,
+ self._current_model,
+ )
+ else:
+ old_model = self._current_model
+ self._current_model = model_value
+
+ # Log regardless of whether we change the model so that logs
+ # are consistently structured
+ if old_model != self._current_model:
+ logger.info(
+ "[Iteration %d] -> Model updated from '%s' to '%s'",
+ iteration,
+ old_model,
+ self._current_model,
+ )
+ else:
+ logger.debug(
+ "[Iteration %d] -> Keeping model '%s'",
+ iteration,
+ self._current_model,
+ )
+
+ logger.debug(
+ "[Iteration %d] -> New variation generated: instructions='%s', model=%s, parameters=%s",
+ iteration,
+ self._current_instructions,
+ self._current_model,
+ self._current_parameters,
+ )
+
+ # Create a new context with the updated values for return
+ return OptimizationContext(
+ scores={},
+ completion_response=response_str,
+ current_instructions=self._current_instructions,
+ current_parameters=self._current_parameters.copy(),
+ current_variables=variation_ctx.current_variables,
+ current_model=self._current_model,
+ user_input=None,
+ history=variation_ctx.history,
+ iteration=variation_ctx.iteration,
+ )
+
+ async def _generate_new_variation(
+ self, iteration: int, variables: Dict[str, Any]
+ ) -> OptimizationContext:
+ """
+ Generate new variation for next iteration (auto-path).
+
+ Calls handle_agent_call to generate a new variation and updates current_instructions
+ and current_parameters based on the returned OptimizeContext.
+
+ :param iteration: The current iteration number for logging
+ :param variables: The variable set for this iteration, chosen once by the caller
+ """
+ logger.info("[Iteration %d] -> Generating new variation...", iteration)
+
+ # Create a context for status update before generating the variation
+ status_ctx = self._create_optimization_context(
+ iteration=iteration,
+ variables=variables,
+ )
+ self._safe_status_update("generating variation", status_ctx, iteration)
+
+ optimize_for_duration = _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ )
+ instructions = build_new_variation_prompt(
+ self._history,
+ self._options.judges,
+ self._current_model,
+ self._current_instructions,
+ self._current_parameters,
+ self._options.model_choices,
+ self._options.variable_choices,
+ self._initial_instructions,
+ optimize_for_duration=optimize_for_duration,
+ )
+
+ # Create a flat history list (without nested history) to avoid exponential growth
+ flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history]
+
+ # Create context for variation generation — low temperature for deterministic output.
+ variation_ctx = OptimizationContext(
+ scores={},
+ completion_response="",
+ current_instructions=instructions,
+ current_parameters={
+ "temperature": 0.1,
+ },
+ current_variables=variables,
+ current_model=self._current_model,
+ user_input=None,
+ history=tuple(flat_history),
+ iteration=len(self._history) + 1,
+ )
+
+ # Call handle_agent_call to generate new variation; expects a JSON string
+ # matching the structured output schema (current_instructions, current_parameters, model).
+ # Retry up to _MAX_VARIATION_RETRIES times to handle transient empty or unparseable
+ # responses (e.g. when the agent SDK returns the LLM's post-tool-call empty text
+ # instead of the tool result).
+ agent_config = self._build_agent_config_for_context(variation_ctx, skip_interpolation=True)
+ response_data = None
+ response_str = ""
+ for attempt in range(1, _MAX_VARIATION_RETRIES + 1):
+ result = self._options.handle_agent_call(
+ self._agent_key,
+ agent_config,
+ variation_ctx,
+ False,
+ )
+ variation_response: OptimizationResponse = await await_if_needed(result)
+ response_str = variation_response.output
+ try:
+ response_data = extract_json_from_response(response_str)
+ break
+ except ValueError:
+ if attempt == _MAX_VARIATION_RETRIES:
+ raise
+ logger.warning(
+ "[Iteration %d] -> Variation response empty or unparseable "
+ "(attempt %d/%d), retrying...",
+ iteration,
+ attempt,
+ _MAX_VARIATION_RETRIES,
+ )
+
+ assert response_data is not None # loop always raises or breaks with data
+ return self._apply_new_variation_response(
+ response_data, variation_ctx, response_str, iteration
+ )
+
+ async def optimize_from_config(
+ self, optimization_config_key: str, options: OptimizationFromConfigOptions
+ ) -> Any:
+ """Optimize an agent using a configuration fetched from the LaunchDarkly API.
+
+ The agent key, judge configuration, model choices, and other optimization
+ parameters are all sourced from the remote agent optimization config. The
+ caller only needs to provide the execution callbacks and evaluation contexts.
+
+ Iteration results are automatically persisted to the LaunchDarkly API so
+ the UI can display live run progress.
+
+ :param optimization_config_key: Key of the agent optimization config to fetch.
+ :param options: User-provided callbacks and evaluation contexts.
+ :return: Optimization result (OptimizationContext from the final iteration).
+ """
+ if not self._has_api_key:
+ raise ValueError(
+ "LAUNCHDARKLY_API_KEY is not set, so optimize_from_config is not available"
+ )
+
+ assert self._api_key is not None
+ api_client = LDApiClient(
+ self._api_key,
+ **({"base_url": options.base_url} if options.base_url else {}),
+ )
+ config = api_client.get_agent_optimization(options.project_key, optimization_config_key)
+
+ self._agent_key = config["aiConfigKey"]
+ optimization_key: str = config["key"]
+ run_id = str(uuid.uuid4())
+
+ model_configs: List[Dict[str, Any]] = []
+ try:
+ model_configs = api_client.get_model_configs(options.project_key)
+ except Exception as exc:
+ logger.debug("Could not pre-fetch model configs: %s", exc)
+
+ context = random.choice(options.context_choices)
+ # _get_agent_config calls _initialize_class_members_from_config internally;
+ # _run_optimization calls it again to reset history before the loop starts.
+ agent_config = await self._get_agent_config(self._agent_key, context)
+
+ optimization_options = self._build_options_from_config(
+ config, options, api_client, optimization_key, run_id, model_configs
+ )
+ if isinstance(optimization_options, GroundTruthOptimizationOptions):
+ result = await self._run_ground_truth_optimization(agent_config, optimization_options)
+ else:
+ result = await self._run_optimization(agent_config, optimization_options)
+
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ created_key = self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key,
+ ai_config_key=config["aiConfigKey"],
+ output_key=options.output_key,
+ api_client=api_client,
+ model_configs=model_configs,
+ )
+ if created_key and self._last_optimization_result_id:
+ api_client.patch_agent_optimization_result(
+ options.project_key,
+ optimization_key,
+ self._last_optimization_result_id,
+ {"createdVariationKey": created_key},
+ )
+ return result
+
+ def _build_options_from_config(
+ self,
+ config: AgentOptimizationConfig,
+ options: OptimizationFromConfigOptions,
+ api_client: LDApiClient,
+ optimization_key: str,
+ run_id: str,
+ model_configs: Optional[List[Dict[str, Any]]] = None,
+ ) -> "Union[OptimizationOptions, GroundTruthOptimizationOptions]":
+ """Map a fetched AgentOptimization config + user options into the appropriate options type.
+
+ When the config contains groundTruthResponses, the three lists (groundTruthResponses,
+ userInputOptions, variableChoices) are zipped by index into GroundTruthSample objects
+ and a GroundTruthOptimizationOptions is returned. Otherwise a standard OptimizationOptions
+ is returned.
+
+ Acceptance statements and judge configs from the API are merged into a single
+ judges dict. An on_status_update closure is injected to persist each iteration
+ result to the LaunchDarkly API; any user-supplied on_status_update is chained
+ after the persistence call.
+
+ :param config: Validated AgentOptimizationConfig from the API.
+ :param options: User-provided options from optimize_from_config.
+ :param api_client: Initialised LDApiClient for result persistence.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param run_id: UUID that groups all result records for this run.
+ :param model_configs: Pre-fetched list of model config dicts for resolving modelConfigKey.
+ :return: OptimizationOptions or GroundTruthOptimizationOptions.
+ """
+ judges: Dict[str, OptimizationJudge] = {}
+
+ for i, stmt in enumerate(config["acceptanceStatements"]):
+ key = f"acceptance-statement-{i}"
+ judges[key] = OptimizationJudge(
+ threshold=float(stmt.get("threshold", 0.95)),
+ acceptance_statement=stmt["statement"],
+ )
+
+ for judge in config["judges"]:
+ judges[judge["key"]] = OptimizationJudge(
+ threshold=float(judge.get("threshold", 0.95)),
+ judge_key=judge["key"],
+ )
+
+ raw_ground_truth: List[str] = config.get("groundTruthResponses") or []
+ has_ground_truth = bool(raw_ground_truth)
+ if not judges and options.on_turn is None:
+ raise ValueError(
+ "The optimization config has no acceptance statements or judges, and no on_turn "
+ "callback was provided. At least one is required to evaluate optimization results."
+ )
+
+ project_key = options.project_key
+ config_version: int = config["version"]
+ _cached_model_configs: List[Dict[str, Any]] = list(model_configs or [])
+
+ # Maps logical iteration number → result record id. Each new main-loop
+ # iteration (plus the init iteration 0) POSTs a fresh record; subsequent
+ # status events for that same iteration PATCH the existing record.
+ _iteration_result_ids: Dict[int, str] = {}
+
+ # Validation phase tracking. When a candidate passes initial checks the
+ # SDK fires validation sub-iterations (val_iter = main_iter + 1, +2, …).
+ # These are internal cross-checks and should NOT create separate records;
+ # instead they are folded back into the parent main-loop iteration's record.
+ _in_validation_phase: bool = False
+ _validation_parent_iteration: int = -1
+
+ # Tracks the most recently opened (POSTed) iteration so we can close it
+ # with a RUNNING:COMPLETED patch when the next iteration begins. Without
+ # this, iterations that don't naturally receive a terminal event (e.g. the
+ # init iteration 0, or non-final GT samples) are left in a stale state.
+ _last_open_iteration: int = -1
+
+ def _resolve_model_config_key(model_name: str) -> str:
+ if not model_name:
+ return ""
+ match = _find_model_config(model_name, _cached_model_configs)
+ return match["key"] if match else model_name
+
+ def _persist_and_forward(
+ status: Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+ ],
+ ctx: OptimizationContext,
+ ) -> None:
+ nonlocal _in_validation_phase, _validation_parent_iteration, _last_open_iteration
+ # _safe_status_update (the caller) already wraps this entire function in
+ # a try/except, so errors here are caught and logged without aborting the run.
+ mapped = _OPTIMIZATION_STATUS_MAP.get(
+ status, {"status": "RUNNING", "activity": "PENDING"}
+ )
+ snapshot = ctx.copy_without_history()
+
+ # "validating" fires with the parent main-loop iteration's context, so
+ # we capture that number as the anchor for all subsequent validation events.
+ if status == "validating":
+ _in_validation_phase = True
+ _validation_parent_iteration = snapshot.iteration
+
+ # Any event whose ctx.iteration differs from the validation anchor is a
+ # validation sub-iteration; fold it back to the parent's record.
+ if _in_validation_phase and snapshot.iteration != _validation_parent_iteration:
+ logical_iteration = _validation_parent_iteration
+ else:
+ logical_iteration = snapshot.iteration
+
+ # When a new iteration begins (generating), close out whatever iteration
+ # was last open so it doesn't remain in a non-terminal state. This covers
+ # the init iteration (0 → 1) and GT batches where non-final samples never
+ # receive an explicit terminal event.
+ if (
+ status == "generating"
+ and _last_open_iteration >= 0
+ and logical_iteration != _last_open_iteration
+ ):
+ prev_result_id = _iteration_result_ids.get(_last_open_iteration)
+ if prev_result_id:
+ api_client.patch_agent_optimization_result(
+ project_key,
+ optimization_key,
+ prev_result_id,
+ {"status": "RUNNING", "activity": "COMPLETED"},
+ )
+ _last_open_iteration = -1
+
+ # Phase 1: POST to create the record on first encounter of each logical iteration.
+ if logical_iteration not in _iteration_result_ids:
+ post_payload: AgentOptimizationResultPost = {
+ "runId": run_id,
+ "agentOptimizationVersion": config_version,
+ "iteration": logical_iteration,
+ "instructions": snapshot.current_instructions,
+ }
+ if snapshot.current_parameters:
+ post_payload["parameters"] = snapshot.current_parameters
+ if snapshot.user_input:
+ post_payload["userInput"] = snapshot.user_input
+ result_id = api_client.post_agent_optimization_result(
+ project_key, optimization_key, post_payload
+ )
+ if result_id:
+ _iteration_result_ids[logical_iteration] = result_id
+ self._last_optimization_result_id = result_id
+ _last_open_iteration = logical_iteration
+
+ # Phase 2: PATCH the record with current status and available telemetry.
+ result_id = _iteration_result_ids.get(logical_iteration)
+ if result_id:
+ patch: AgentOptimizationResultPatch = {
+ "status": mapped["status"],
+ "activity": mapped["activity"],
+ }
+ if snapshot.completion_response:
+ patch["completionResponse"] = snapshot.completion_response
+ if snapshot.scores:
+ patch["scores"] = {
+ k: {
+ **v.to_json(),
+ **({"threshold": judges[k].threshold} if k in judges else {}),
+ }
+ for k, v in snapshot.scores.items()
+ }
+ if snapshot.duration_ms is not None:
+ patch["generationLatency"] = int(snapshot.duration_ms)
+ if snapshot.usage is not None:
+ patch["generationTokens"] = {
+ "total": snapshot.usage.total,
+ "input": snapshot.usage.input,
+ "output": snapshot.usage.output,
+ }
+ eval_latencies = {
+ k: v.duration_ms
+ for k, v in snapshot.scores.items()
+ if v.duration_ms is not None
+ }
+ if eval_latencies:
+ patch["evaluationLatencies"] = eval_latencies
+ eval_tokens = {
+ k: {"total": v.usage.total, "input": v.usage.input, "output": v.usage.output}
+ for k, v in snapshot.scores.items()
+ if v.usage is not None
+ }
+ if eval_tokens:
+ patch["evaluationTokens"] = eval_tokens
+ patch["variation"] = {
+ "instructions": snapshot.current_instructions,
+ "parameters": snapshot.current_parameters,
+ "modelConfigKey": _resolve_model_config_key(snapshot.current_model or ""),
+ }
+ api_client.patch_agent_optimization_result(
+ project_key, optimization_key, result_id, patch
+ )
+
+ # Reset tracking state after terminal events so the next main-loop
+ # attempt starts fresh.
+ if status in ("turn completed", "success", "failure"):
+ _in_validation_phase = False
+ _validation_parent_iteration = -1
+ _last_open_iteration = -1
+
+ if options.on_status_update:
+ try:
+ options.on_status_update(status, ctx)
+ except Exception:
+ logger.exception("User on_status_update callback failed for status=%s", status)
+
+ # If we have ground truth responses, we provide a different
+ # configuration options type that contains the bundled GroundTruthSamples
+ # so that the ultimate output is correctly formatted.
+ if has_ground_truth:
+ user_inputs: List[str] = config["userInputOptions"] or []
+ variable_choices_raw: List[Dict[str, Any]] = config["variableChoices"] or []
+
+ if len(raw_ground_truth) != len(user_inputs) or len(raw_ground_truth) != len(variable_choices_raw):
+ raise ValueError(
+ f"groundTruthResponses ({len(raw_ground_truth)}), userInputOptions "
+ f"({len(user_inputs)}), and variableChoices ({len(variable_choices_raw)}) "
+ "must all have the same length when groundTruthResponses is provided."
+ )
+
+ gt_samples = [
+ GroundTruthSample(
+ user_input=user_inputs[idx],
+ expected_response=raw_ground_truth[idx],
+ variables=variable_choices_raw[idx],
+ )
+ for idx in range(len(raw_ground_truth))
+ ]
+
+ return GroundTruthOptimizationOptions(
+ context_choices=options.context_choices,
+ ground_truth_responses=gt_samples,
+ max_attempts=config["maxAttempts"],
+ model_choices=[_strip_provider_prefix(m) for m in config["modelChoices"]],
+ judge_model=_strip_provider_prefix(config["judgeModel"]),
+ handle_agent_call=options.handle_agent_call,
+ handle_judge_call=options.handle_judge_call,
+ judges=judges or None,
+ on_turn=options.on_turn,
+ on_sample_result=options.on_sample_result,
+ on_passing_result=options.on_passing_result,
+ on_failing_result=options.on_failing_result,
+ on_status_update=_persist_and_forward,
+ token_limit=config.get("tokenLimit"),
+ )
+
+ variable_choices: List[Dict[str, Any]] = config["variableChoices"] or [{}]
+ user_input_options: Optional[List[str]] = config["userInputOptions"] or None
+
+ return OptimizationOptions(
+ context_choices=options.context_choices,
+ max_attempts=config["maxAttempts"],
+ model_choices=[_strip_provider_prefix(m) for m in config["modelChoices"]],
+ judge_model=_strip_provider_prefix(config["judgeModel"]),
+ variable_choices=variable_choices,
+ handle_agent_call=options.handle_agent_call,
+ handle_judge_call=options.handle_judge_call,
+ judges=judges or None,
+ user_input_options=user_input_options,
+ on_turn=options.on_turn,
+ on_passing_result=options.on_passing_result,
+ on_failing_result=options.on_failing_result,
+ on_status_update=_persist_and_forward,
+ token_limit=config.get("tokenLimit"),
+ )
+
+ async def _execute_agent_turn(
+ self,
+ optimize_context: OptimizationContext,
+ iteration: int,
+ expected_response: Optional[str] = None,
+ ) -> OptimizationContext:
+ """
+ Run the agent call and judge scoring for one optimization turn.
+
+ Returns a new OptimizationContext with completion_response and scores
+ populated, leaving the input context unchanged. Variables are read from
+ optimize_context.current_variables and interpolated into the agent's
+ instructions at call time so the stored template is never mutated.
+
+ :param optimize_context: The context for this turn (instructions, model, history, etc.)
+ :param iteration: Current iteration number for logging and status callbacks
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into judge context so judges can score actual vs. expected.
+ :return: Updated context with completion_response and scores filled in
+ """
+ logger.info(
+ "[Iteration %d] -> Calling agent (model=%s)...",
+ iteration,
+ optimize_context.current_model,
+ )
+ try:
+ _agent_start = time.monotonic()
+ result = self._options.handle_agent_call(
+ self._agent_key,
+ self._build_agent_config_for_context(optimize_context),
+ optimize_context,
+ False,
+ )
+ agent_response: OptimizationResponse = await await_if_needed(result)
+ agent_duration_ms = (time.monotonic() - _agent_start) * 1000
+ completion_response = agent_response.output
+ logger.debug(
+ "[Iteration %d] -> Agent response: %.300s%s",
+ iteration,
+ completion_response,
+ "..." if len(completion_response) > 300 else "",
+ )
+ except Exception:
+ logger.exception("[Iteration %d] -> Agent call failed", iteration)
+ if self._options.on_failing_result:
+ # Guard the user callback so a callback error can't mask the original agent failure.
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_failing_result callback failed", iteration
+ )
+ raise
+
+ scores: Dict[str, JudgeResult] = {}
+ if self._options.judges:
+ agent_tools = self._extract_agent_tools(optimize_context.current_parameters)
+ scores = await self._call_judges(
+ completion_response,
+ iteration,
+ user_input=optimize_context.user_input or "",
+ variables=optimize_context.current_variables,
+ agent_tools=agent_tools,
+ expected_response=expected_response,
+ agent_duration_ms=agent_duration_ms,
+ )
+
+ # Build the fully-populated result context before firing the evaluating event so
+ # the PATCH includes scores, generationLatency, and completionResponse. This is
+ # particularly important for non-final GT samples which receive no further status
+ # events — without this, those fields would never be written to their API records.
+ result_ctx = dataclasses.replace(
+ optimize_context,
+ completion_response=completion_response,
+ scores=scores,
+ duration_ms=agent_duration_ms,
+ usage=agent_response.usage,
+ )
+
+ if self._options.judges:
+ self._safe_status_update("evaluating", result_ctx, iteration)
+
+ return result_ctx
+
+ def _accumulate_tokens(self, optimize_context: OptimizationContext) -> None:
+ """Add token usage from a completed turn to the running total.
+
+ Sums the agent's token usage and each judge's token usage from the given
+ context and adds them to ``_total_token_usage``.
+
+ :param optimize_context: The completed turn context containing usage data.
+ """
+ if optimize_context.usage is not None:
+ self._total_token_usage += optimize_context.usage.total or 0
+ for judge_result in optimize_context.scores.values():
+ if judge_result.usage is not None:
+ self._total_token_usage += judge_result.usage.total or 0
+
+ def _is_token_limit_exceeded(self) -> bool:
+ """Return True if the accumulated token usage has met or exceeded the configured limit.
+
+ Returns False when no token limit is set so callers can use this as a
+ simple guard without needing to check for ``None`` themselves.
+
+ :return: True if token limit is set and ``_total_token_usage >= token_limit``.
+ """
+ limit: Optional[int] = getattr(self._options, "token_limit", None)
+ return limit is not None and self._total_token_usage >= limit
+
+ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:
+ """
+ Determine whether the current iteration's scores meet all judge thresholds.
+
+ A judge without an explicit threshold is treated as requiring a perfect
+ score of 1.0. Returns True immediately when no judges are configured.
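+
+ For example, a judge configured with threshold ``0.9`` fails the turn when its
+ reported score is ``0.85``.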
+
+ :param optimize_context: The completed turn context containing scores
+ :return: True if all judges passed, False if any judge failed or is missing
+ """
+ if not self._options.judges:
+ return True
+
+ for judge_key, optimization_judge in self._options.judges.items():
+ result = optimize_context.scores.get(judge_key)
+ if result is None:
+ return False
+ threshold = (
+ optimization_judge.threshold
+ if optimization_judge.threshold is not None
+ else 1.0
+ )
+ if result.score < threshold:
+ return False
+
+ return True
+
+ def _evaluate_duration(self, optimize_context: OptimizationContext) -> bool:
+ """
+ Check whether the candidate's duration meets the improvement target vs. the baseline.
+
+ The baseline is history[0].duration_ms — the very first completed iteration,
+ representing the original unoptimized configuration's latency. The candidate
+ must be at least _DURATION_TOLERANCE faster (default: 20% improvement).
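+ For example, with the default 20% improvement target, a 1,000 ms baseline means
+ the candidate must complete in under 800 ms to pass.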
+
+ Returns True without blocking when no baseline is available (empty history or
+ history[0].duration_ms is None), or when the candidate's duration_ms was not
+ captured. This avoids penalising configurations when timing data is missing.
+
+ :param optimize_context: The completed turn context containing duration_ms
+ :return: True if the duration requirement is met or cannot be checked
+ """
+ if not self._history or self._history[0].duration_ms is None:
+ return True
+ if optimize_context.duration_ms is None:
+ return True
+ baseline = self._history[0].duration_ms
+ passed = optimize_context.duration_ms < baseline * _DURATION_TOLERANCE
+ if not passed:
+ logger.warning(
+ "[Iteration %d] -> Duration check failed: %.0fms >= baseline %.0fms * %.0f%% (%.0fms)",
+ optimize_context.iteration,
+ optimize_context.duration_ms,
+ baseline,
+ _DURATION_TOLERANCE * 100,
+ baseline * _DURATION_TOLERANCE,
+ )
+ return passed
+
+ def _handle_success(
+ self, optimize_context: OptimizationContext, iteration: int
+ ) -> Any:
+ """
+ Handle a successful optimization result.
+
+ Fires the "success" status update, invokes on_passing_result if set,
+ and returns the winning OptimizationContext.
+
+ :param optimize_context: The context from the passing iteration
+ :param iteration: Current iteration number for logging
+ :return: The passing OptimizationContext
+ """
+ logger.info("[Iteration %d] -> Optimization succeeded", iteration)
+ self._last_run_succeeded = True
+ self._last_succeeded_context = optimize_context
+ self._safe_status_update("success", optimize_context, iteration)
+ if self._options.on_passing_result:
+ try:
+ self._options.on_passing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_passing_result callback failed", iteration
+ )
+ return optimize_context
+
+ def _handle_failure(
+ self, optimize_context: OptimizationContext, iteration: int
+ ) -> Any:
+ """
+ Handle a failed optimization result (max attempts reached).
+
+ Fires the "failure" status update, invokes on_failing_result if set,
+ and returns the last OptimizationContext.
+
+ :param optimize_context: The context from the final iteration
+ :param iteration: Current iteration number for logging
+ :return: The last OptimizationContext
+ """
+ logger.warning(
+ "[Optimization] -> Optimization failed after %d attempt(s)", iteration
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", optimize_context, iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_failing_result callback failed", iteration
+ )
+ return optimize_context
+
+ def _commit_variation(
+ self,
+ optimize_context: OptimizationContext,
+ project_key: str,
+ ai_config_key: str,
+ output_key: Optional[str],
+ api_client: Optional[LDApiClient] = None,
+ base_url: Optional[str] = None,
+ model_configs: Optional[List[Dict[str, Any]]] = None,
+ ) -> str:
+ """Commit the winning optimization context as a new AI Config variation.
+
+ Determines a unique variation key (from output_key or an auto-generated
+ adjective-noun slug), checks for collisions against existing variation keys,
+ appends a random hex suffix if the key is taken, then POSTs the new variation
+ with up to 2 retries before raising on persistent failure.
+
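+ For example (hypothetical key), an ``output_key`` of ``"support-bot"`` that already
+ exists on the AI Config would instead be committed as ``"support-bot-3fa2"``.
+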
+ :param optimize_context: The winning OptimizationContext.
+ :param project_key: LaunchDarkly project key.
+ :param ai_config_key: The AI Config key to add the variation to.
+ :param output_key: Desired variation key/name; auto-generated if None.
+ :param api_client: Optional pre-built LDApiClient to reuse (e.g. from optimize_from_config).
+ :param base_url: Optional base URL override forwarded to a newly created LDApiClient.
+ :param model_configs: Optional pre-fetched model config dicts used to resolve the modelConfigKey.
+ :return: The created variation key.
+ :raises LDApiError: If the variation cannot be created after retries.
+ """
+ if api_client is None:
+ assert self._api_key is not None
+ api_client = LDApiClient(
+ self._api_key,
+ **({"base_url": base_url} if base_url else {}),
+ )
+
+ candidate = output_key if output_key else generate_slug()
+
+ try:
+ ai_config = api_client.get_ai_config(project_key, ai_config_key)
+ existing_keys = {v["key"] for v in ai_config.get("variations", [])}
+ except Exception:
+ logger.warning(
+ "Could not fetch AI Config to check variation key collisions; proceeding with candidate key."
+ )
+ existing_keys = set()
+
+ if candidate in existing_keys:
+ suffix = "%04x" % random.randint(0, 0xFFFF)
+ candidate = f"{candidate}-{suffix}"
+ logger.info("Variation key collision detected; using '%s' instead.", candidate)
+
+ model_name = optimize_context.current_model or ""
+ model_config_key = model_name # fallback if lookup fails
+ try:
+ configs_to_search = (
+ model_configs if model_configs is not None else api_client.get_model_configs(project_key)
+ )
+ match = _find_model_config(model_name, configs_to_search)
+ if match:
+ model_config_key = match["key"]
+ else:
+ logger.debug(
+ "No model config found for model id '%s'; using model name as key.", model_name
+ )
+ except Exception as exc:
+ logger.debug("Could not fetch model configs to resolve modelConfigKey: %s", exc)
+
+ payload: Dict[str, Any] = {
+ "key": candidate,
+ "name": candidate,
+ "mode": "agent",
+ "instructions": optimize_context.current_instructions,
+ "modelConfigKey": model_config_key,
+ }
+ if self._initial_tool_keys:
+ payload["toolKeys"] = list(self._initial_tool_keys)
+
+ last_exc: Optional[Exception] = None
+ for attempt in range(1, 4):
+ try:
+ api_client.create_ai_config_variation(project_key, ai_config_key, payload)
+ logger.info(
+ "Auto-committed variation '%s' to AI Config '%s'.", candidate, ai_config_key
+ )
+ return candidate
+ except Exception as exc:
+ last_exc = exc
+ if attempt < 3:
+ logger.warning(
+ "Failed to create variation (attempt %d/3): %s. Retrying...", attempt, exc
+ )
+
+ raise last_exc # type: ignore[misc]
+
+ async def _run_validation_phase(
+ self,
+ passing_context: OptimizationContext,
+ iteration: int,
+ ) -> "tuple[bool, OptimizationContext]":
+ """Run additional evaluations against distinct random samples to confirm a passing candidate.
+
+ Mirrors the sampling logic of _run_optimization: each validation turn selects
+ a user_input from user_input_options (when provided) AND a variables dict from
+ variable_choices independently. The validation count and distinctness guarantee
+ are driven by whichever pool is larger — user_input_options when present,
+ otherwise variable_choices — ensuring validation turns use inputs the passing
+ turn did not.
+
+ If all samples pass, the caller should proceed to _handle_success. If any
+ sample fails, the caller should treat the result as a normal failed attempt
+ and generate a new variation.
+
+ Validation turns are numbered sequentially in logs (iteration + 1, + 2, …)
+ for readability, but this numbering is internal only — the caller's iteration
+ counter is never advanced by this method so validation samples do not consume
+ the attempt budget.
+
+ :param passing_context: The OptimizationContext from the turn that just passed.
+ :param iteration: The iteration number of the passing turn; used as the
+ base for validation log line numbering only.
+ :return: Tuple of (all_passed, last_context).
+ """
+ options = self._options
+
+ # Determine the primary axis of distinctness and the pool size.
+ # user_input_options drives the count when present; otherwise variable_choices does.
+ # In either case, both user_input and variables are selected per-sample just as
+ # they are in the main optimization loop.
+ if options.user_input_options:
+ primary_pool: List[str] = options.user_input_options
+ passing_input: Optional[str] = passing_context.user_input
+ remaining_inputs: List[str] = [
+ inp for inp in primary_pool if inp != passing_input
+ ]
+ pool_size = len(primary_pool)
+ else:
+ var_pool: List[Dict[str, Any]] = options.variable_choices
+ passing_vars: Dict[str, Any] = passing_context.current_variables
+ remaining_vars: List[Dict[str, Any]] = [
+ v for v in var_pool if v != passing_vars
+ ]
+ pool_size = len(var_pool)
+
+ validation_count = _compute_validation_count(pool_size)
+ # Cap to the number of distinct remaining items, but never below 1.
+ # When no distinct items remain (e.g. only one variable choice), draw a
+ # single sample from the full pool, repeating the passing item, so at
+ # least one validation run always executes.
+ if options.user_input_options:
+ available = len(remaining_inputs)
+ else:
+ available = len(remaining_vars)
+
+ allow_repeats = available == 0
+ if allow_repeats:
+ validation_count = 1
+ else:
+ validation_count = min(validation_count, available)
+
+ logger.info(
+ "[Iteration %d] -> Candidate passed — entering validation phase (%d sample(s)%s)",
+ iteration,
+ validation_count,
+ ", repeated draw" if allow_repeats else "",
+ )
+ self._safe_status_update("validating", passing_context, iteration)
+
+ # Sample primary items, falling back to the full pool when no distinct
+ # items remain so the minimum-1 floor is always satisfied.
+ if options.user_input_options:
+ source_inputs = primary_pool if allow_repeats else remaining_inputs
+ sampled_inputs: List[str] = random.sample(source_inputs, validation_count)
+ else:
+ source_vars = var_pool if allow_repeats else remaining_vars
+ sampled_vars: List[Dict[str, Any]] = random.sample(source_vars, validation_count)
+
+ last_ctx = passing_context
+ for i in range(validation_count):
+ val_iter = iteration + i + 1
+ if options.user_input_options:
+ user_input: Optional[str] = sampled_inputs[i]
+ variables: Dict[str, Any] = random.choice(options.variable_choices)
+ else:
+ user_input = None
+ variables = sampled_vars[i]
+
+ logger.info(
+ "[Validation %d/%d] -> Running sample (iteration=%d)",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+
+ val_ctx = self._create_optimization_context(
+ iteration=val_iter,
+ user_input=user_input,
+ variables=variables,
+ )
+ self._safe_status_update("generating", val_ctx, val_iter)
+ val_ctx = await self._execute_agent_turn(val_ctx, val_iter)
+ self._accumulate_tokens(val_ctx)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[Validation %d/%d] -> Token limit exceeded (total=%d)",
+ i + 1,
+ validation_count,
+ self._total_token_usage,
+ )
+ return False, val_ctx
+
+ if options.on_turn is not None:
+ try:
+ sample_passed = options.on_turn(val_ctx)
+ except Exception:
+ logger.exception(
+ "[Validation %d/%d] -> on_turn evaluation failed", i + 1, validation_count
+ )
+ sample_passed = False
+ else:
+ sample_passed = self._evaluate_response(val_ctx)
+
+ if sample_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ sample_passed = self._evaluate_duration(val_ctx)
+
+ last_ctx = val_ctx
+
+ if not sample_passed:
+ logger.info(
+ "[Validation %d/%d] -> FAILED (iteration=%d) — candidate rejected",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+ return False, last_ctx
+
+ logger.debug(
+ "[Validation %d/%d] -> passed (iteration=%d)",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+
+ logger.info(
+ "[Iteration %d] -> All %d validation sample(s) passed — candidate confirmed",
+ iteration,
+ validation_count,
+ )
+ return True, last_ctx
+
+ async def _run_optimization(
+ self, agent_config: AIAgentConfig, options: OptimizationOptions
+ ) -> Any:
+ """Run an optimization on the given agent with the given options.
+
+ :param agent_config: Agent configuration from LaunchDarkly.
+ :param options: Optimization options.
+ :return: Optimization result.
+ """
+ self._options = options
+ self._agent_config = agent_config
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._last_optimization_result_id = None
+ self._total_token_usage = 0
+ self._initialize_class_members_from_config(agent_config)
+
+ # If the LD flag doesn't carry a model name, seed from the first model choice
+ # so agent calls never receive an empty model string.
+ if not self._current_model and options.model_choices:
+ self._current_model = options.model_choices[0]
+ logger.debug(
+ "[Optimization] -> No model in agent config; defaulting to first model choice: %s",
+ self._current_model,
+ )
+
+ initial_context = self._create_optimization_context(
+ iteration=0,
+ variables=random.choice(options.variable_choices),
+ )
+
+ self._safe_status_update("init", initial_context, 0)
+
+ iteration = 0
+ while True:
+ iteration += 1
+ logger.info(
+ "[Iteration %d] -> Starting (attempt %d/%d, model=%s)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ self._current_model,
+ )
+ user_input = None
+ if self._options.user_input_options:
+ user_input = random.choice(self._options.user_input_options)
+ if user_input:
+ logger.debug("[Iteration %d] -> User input: %s", iteration, user_input)
+
+ optimize_context = self._create_optimization_context(
+ iteration=iteration,
+ user_input=user_input,
+ # Pick a fresh variable set each turn for call-time interpolation
+ variables=random.choice(self._options.variable_choices),
+ )
+
+ self._safe_status_update("generating", optimize_context, iteration)
+ optimize_context = await self._execute_agent_turn(
+ optimize_context, iteration
+ )
+ self._accumulate_tokens(optimize_context)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[Iteration %d] -> Token limit exceeded (total=%d)",
+ iteration,
+ self._total_token_usage,
+ )
+ return self._handle_failure(optimize_context, iteration)
+
+ # Manual path: on_turn callback gives caller full control over pass/fail
+ if self._options.on_turn is not None:
+ try:
+ on_turn_result = self._options.on_turn(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_turn evaluation failed", iteration
+ )
+ on_turn_result = False
+
+ initial_passed = on_turn_result
+ if initial_passed:
+ logger.info(
+ "[Iteration %d] -> on_turn returned True — turn passed",
+ iteration,
+ )
+ else:
+ # Auto-path: judge scores determine pass/fail via _evaluate_response
+ initial_passed = self._evaluate_response(optimize_context)
+ if initial_passed:
+ logger.info(
+ "[Iteration %d] -> All judges passed — turn succeeded",
+ iteration,
+ )
+
+ if initial_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ initial_passed = self._evaluate_duration(optimize_context)
+
+ if initial_passed:
+ all_valid, last_ctx = await self._run_validation_phase(
+ optimize_context, iteration
+ )
+ if all_valid:
+ return self._handle_success(optimize_context, iteration)
+ if self._is_token_limit_exceeded():
+ return self._handle_failure(last_ctx, iteration)
+ # Validation failed — treat as a normal failed attempt.
+ # Use optimize_context (the main iteration) for terminal API events so
+ # the persisted record's completionResponse and userInput stay aligned.
+ # last_ctx (the failing validation run) goes into history so the
+ # variation generator can see what went wrong.
+ logger.info(
+ "[Iteration %d] -> Validation failed — generating new variation (attempt %d/%d)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ if iteration >= self._options.max_attempts:
+ return self._handle_failure(optimize_context, iteration)
+ self._history.append(last_ctx)
+ try:
+ await self._generate_new_variation(
+ iteration, last_ctx.current_variables
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> variation generation failed", iteration
+ )
+ return self._handle_failure(optimize_context, iteration)
+ self._safe_status_update("turn completed", optimize_context, iteration)
+ continue
+
+ # Initial turn failed
+ if self._options.on_turn is not None:
+ logger.info(
+ "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ else:
+ logger.info(
+ "[Iteration %d] -> One or more judges failed (attempt %d/%d) — generating new variation",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ if iteration >= self._options.max_attempts:
+ return self._handle_failure(optimize_context, iteration)
+ self._history.append(optimize_context)
+ try:
+ await self._generate_new_variation(
+ iteration, optimize_context.current_variables
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> variation generation failed", iteration
+ )
+ return self._handle_failure(optimize_context, iteration)
+ self._safe_status_update("turn completed", optimize_context, iteration)
+ continue
diff --git a/packages/optimization/src/ldai_optimizer/dataclasses.py b/packages/optimization/src/ldai_optimizer/dataclasses.py
new file mode 100644
index 00000000..9e52e046
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/dataclasses.py
@@ -0,0 +1,488 @@
+"""Dataclasses for the LaunchDarkly AI optimization package."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import (
+ Any,
+ Awaitable,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Sequence,
+ Union,
+)
+
+from ldai import AIAgentConfig
+from ldai.models import LDMessage, ModelConfig
+from ldai.tracker import TokenUsage
+from ldclient import Context
+from typing_extensions import Protocol
+
+
+@dataclass
+class OptimizationResponse:
+ """The return value for both ``handle_agent_call`` and ``handle_judge_call`` callbacks.
+
+ :param output: The text output produced by the LLM.
+ :param usage: Optional token usage for this call. Set fields to 0 or omit entirely
+ if token tracking is not available for the framework being used.
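+
+ A minimal handler return (``completion_text`` is illustrative)::
+
+ return OptimizationResponse(output=completion_text, usage=None)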
+ """
+
+ output: str
+ usage: Optional[TokenUsage] = None
+
+
+@dataclass
+class JudgeResult:
+ """Result from a judge evaluation."""
+
+ score: float
+ rationale: Optional[str] = None
+ duration_ms: Optional[float] = None
+ usage: Optional[TokenUsage] = None
+
+ def to_json(self) -> Dict[str, Any]:
+ """
+ Convert the judge result to a JSON-serializable dictionary.
+
+ :return: Dictionary representation of the judge result that can be serialized with json.dumps()
+ """
+ result: Dict[str, Any] = {
+ "score": self.score,
+ "rationale": self.rationale,
+ "duration_ms": self.duration_ms,
+ }
+ if self.usage is not None:
+ result["usage"] = {
+ "total": self.usage.total,
+ "input": self.usage.input,
+ "output": self.usage.output,
+ }
+ return result
+
+
+@dataclass
+class ToolDefinition:
+ """
+ Generic tool definition for enforcing structured output from LLM responses.
+
+ This tool can be used with any LLM provider to ensure responses conform to
+ a specific JSON schema. The tool takes the LLM's response and returns
+ parsed and validated data according to the input_schema.
+ """
+
+ name: str
+ description: str
+ input_schema: Dict[str, Any] # JSON schema defining the expected output structure
+ type: Literal["function"] = "function"
+
+ def to_dict(self) -> Dict[str, Any]:
+ """
+ Convert the tool definition to a dictionary format compatible with LLM APIs.
+
+ :return: Dictionary representation of the tool
+ """
+ return {
+ "name": self.name,
+ "description": self.description,
+ "input_schema": self.input_schema,
+ "type": self.type,
+ }
+
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "ToolDefinition":
+ """
+ Construct a ToolDefinition from a plain dictionary.
+
+ :param data: Dictionary with at least a ``name`` key; ``description`` and
+ ``input_schema`` default to empty values when absent.
+ :return: A new ToolDefinition instance
+ """
+ return cls(
+ name=data.get("name", ""),
+ description=data.get("description", ""),
+ input_schema=data.get("input_schema", {}),
+ type=data.get("type", "function"),
+ )
+
+
+class LLMCallConfig(Protocol):
+ """Structural protocol satisfied by both ``AIAgentConfig`` and ``AIJudgeCallConfig``.
+
+ Use this as the config parameter type when you want a single handler function
+ that can be passed to both ``handle_agent_call`` and ``handle_judge_call``::
+
+ async def handle_llm_call(
+ key: str,
+ config: LLMCallConfig,
+ context: LLMCallContext,
+ is_evaluation: bool,
+ ) -> OptimizationResponse:
+ model_name = config.model.name if config.model else "gpt-4o"
+ instructions = config.instructions or ""
+ tools = config.model.get_parameter("tools") if config.model else []
+ ...
+
+ OptimizationOptions(
+ handle_agent_call=handle_llm_call,
+ handle_judge_call=handle_llm_call,
+ ...
+ )
+ """
+
+ @property
+ def key(self) -> str: ...
+ @property
+ def model(self) -> Optional[ModelConfig]: ...
+ @property
+ def instructions(self) -> Optional[str]: ...
+
+
+class LLMCallContext(Protocol):
+ """Structural protocol satisfied by both ``OptimizationContext`` and ``OptimizationJudgeContext``.
+
+ Use alongside ``LLMCallConfig`` when writing a single handler for both
+ ``handle_agent_call`` and ``handle_judge_call``.
+ """
+
+ @property
+ def user_input(self) -> Optional[str]: ...
+ @property
+ def current_variables(self) -> Dict[str, Any]: ...
+
+
+@dataclass
+class AIJudgeCallConfig:
+ """
+ Configuration passed to ``handle_judge_call``.
+
+ Carries everything needed to run a judge in either paradigm:
+
+ * **Completions path** — pass ``messages`` directly to ``chat.completions.create``.
+ The full system + user turn sequence is already assembled and interpolated.
+ * **Agents path** — use ``instructions`` as the system prompt and
+ ``OptimizationJudgeContext.user_input`` as the ``Runner.run`` input.
+
+ Both fields are always populated, regardless of whether the judge comes from a
+ LaunchDarkly flag (config judge) or an inline acceptance statement.
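+
+ A completions-path sketch (assumes an OpenAI-style client; names are illustrative)::
+
+ response = openai_client.chat.completions.create(
+ model=config.model.name,
+ messages=[{"role": m.role, "content": m.content} for m in config.messages],
+ )
+ return OptimizationResponse(output=response.choices[0].message.content or "")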
+ """
+
+ key: str
+ model: ModelConfig
+ instructions: str
+ messages: List[LDMessage]
+
+
+@dataclass
+class Message:
+ """A message in a conversation."""
+
+ role: Literal["system", "user", "assistant"]
+ content: str
+
+ def to_dict(self) -> Dict[str, str]:
+ """Convert message to dictionary format."""
+ return {
+ "role": self.role,
+ "content": self.content,
+ }
+
+
+@dataclass
+class OptimizationJudge:
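+ """A single pass/fail criterion applied to each agent response.
+
+ Set ``judge_key`` to reference a LaunchDarkly config judge, or ``acceptance_statement``
+ to supply an inline natural-language criterion. ``threshold`` is the minimum score a
+ response must reach for this judge to pass.
+ """
+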
+ threshold: float
+ judge_key: Optional[str] = None
+ acceptance_statement: Optional[str] = None
+
+
+@dataclass
+class OptimizationContext:
+ """Context for a single optimization iteration."""
+
+ scores: Dict[str, JudgeResult] # the scores and rationales from the judges, if configured
+ completion_response: str
+ current_instructions: str
+ current_parameters: Dict[str, Any]
+ # variable set chosen for this iteration; interpolated into instructions at call time
+ current_variables: Dict[str, Any]
+ current_model: Optional[str] = None # the current model being used
+ user_input: Optional[str] = None # the user input message for this iteration
+ history: Sequence[OptimizationContext] = field(
+ default_factory=list
+ ) # previous context items
+ iteration: int = 0 # current iteration number
+ duration_ms: Optional[float] = None # wall-clock time for the agent call in milliseconds
+ usage: Optional[TokenUsage] = None # token usage reported by the agent for this iteration
+
+ def copy_without_history(self) -> OptimizationContext:
+ """
+ Create a copy of this context without the history field (for flattening).
+
+ :return: A new OptimizationContext with the same data but empty history
+ """
+ return OptimizationContext(
+ scores=self.scores,
+ completion_response=self.completion_response,
+ current_instructions=self.current_instructions,
+ current_parameters=self.current_parameters,
+ current_variables=self.current_variables,
+ current_model=self.current_model,
+ user_input=self.user_input,
+ history=(), # Empty history to keep it flat
+ iteration=self.iteration,
+ duration_ms=self.duration_ms,
+ usage=self.usage,
+ )
+
+ def to_json(self) -> Dict[str, Any]:
+ """
+ Convert the optimization context to a JSON-serializable dictionary.
+
+ :return: Dictionary representation of the context that can be serialized with json.dumps()
+ """
+ scores_dict = {}
+ for judge_key, judge_result in self.scores.items():
+ scores_dict[judge_key] = judge_result.to_json()
+
+ history_list = [ctx.to_json() for ctx in self.history]
+
+ result: Dict[str, Any] = {
+ "scores": scores_dict,
+ "completion_response": self.completion_response,
+ "current_instructions": self.current_instructions,
+ "current_parameters": self.current_parameters,
+ "current_model": self.current_model,
+ "user_input": self.user_input,
+ "current_variables": self.current_variables,
+ "history": history_list,
+ "iteration": self.iteration,
+ "duration_ms": self.duration_ms,
+ }
+ if self.usage is not None:
+ result["usage"] = {
+ "total": self.usage.total,
+ "input": self.usage.input,
+ "output": self.usage.output,
+ }
+ return result
+
+
+@dataclass
+class OptimizationJudgeContext:
+ """Context for a single judge evaluation turn."""
+
+ user_input: str # the agent response being evaluated
+ current_variables: Dict[str, Any] = field(default_factory=dict) # variable set used during agent generation
+
+
+# Shared callback type aliases used by both OptimizationOptions and
+# OptimizationFromConfigOptions to avoid duplicating the full signatures.
+# Placed here so all referenced types (OptimizationContext, AIJudgeCallConfig,
+# OptimizationJudgeContext) are already defined above.
+#
+# Both aliases use the LLMCallConfig / LLMCallContext Protocols so callers can
+# write a single handler for both agent and judge calls. Handlers typed with
+# the concrete types (AIAgentConfig / AIJudgeCallConfig) continue to work
+# because those types structurally satisfy the Protocols.
+HandleAgentCall = Union[
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], OptimizationResponse],
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], Awaitable[OptimizationResponse]],
+]
+HandleJudgeCall = Union[
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], OptimizationResponse],
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], Awaitable[OptimizationResponse]],
+]
+
+_StatusLiteral = Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+]
+
+
+@dataclass
+class OptimizationOptions:
+ """Options for agent optimization."""
+
+ # Configuration - Required
+ max_attempts: int
+ model_choices: List[str] # model ids the LLM can choose from, 1 min required
+ judge_model: str # which model to use as judge; this should remain consistent
+ variable_choices: List[
+ Dict[str, Any]
+ ] # choices of interpolated variables to be chosen at random per turn, 1 min required
+ # Actual agent/completion (judge) calls - Required
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ # Criteria for pass/fail - Optional
+ user_input_options: Optional[List[str]] = (
+ None # optional list of user input messages to randomly select from
+ )
+ judges: Optional[Dict[str, OptimizationJudge]] = (
+ None # auto-judges for this model that the LLM will use
+ )
+ on_turn: Optional[Callable[[OptimizationContext], bool]] = (
+ None # if you want manual control of pass/fail
+ )
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ # Auto-commit - Optional
+ auto_commit: bool = False
+ project_key: Optional[str] = None # required when auto_commit=True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
+ base_url: Optional[str] = None # override to target a non-default LD instance
+ on_passing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_failing_result: Optional[Callable[[OptimizationContext], None]] = None
+ # called to provide status updates during the optimization flow
+ on_status_update: Optional[Callable[[_StatusLiteral, OptimizationContext], None]] = None
+ token_limit: Optional[int] = None # stop the run when total token usage reaches this value
+
+ def __post_init__(self):
+ """Validate required options."""
+ if len(self.model_choices) < 1:
+ raise ValueError("model_choices must have at least 1 model")
+ if self.judges is None and self.on_turn is None:
+ raise ValueError("Either judges or on_turn must be provided")
+ if self.judge_model is None:
+ raise ValueError("judge_model must be provided")
+
+
+@dataclass
+class GroundTruthSample:
+ """A single ground truth evaluation sample for use with optimize_from_ground_truth_options.
+
+ Each sample ties together the user input, expected response, and variable set for one
+ evaluation. Samples are evaluated in order; the optimization only passes if all samples
+ pass their judges in the same attempt.
+
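+ An illustrative sample (values are examples only)::
+
+ GroundTruthSample(
+ user_input="What is your refund policy?",
+ expected_response="Refunds are available within 30 days of purchase.",
+ variables={"product": "Pro Plan"},
+ )
+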
+ :param user_input: The user message to send to the agent for this evaluation.
+ :param expected_response: The ideal response the agent should produce. Injected into
+ judge context so judges can score actual vs. expected.
+ :param variables: Variable set interpolated into the agent instructions for this sample.
+ Defaults to an empty dict if no placeholders are used.
+ """
+
+ user_input: str
+ expected_response: str
+ variables: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class GroundTruthOptimizationOptions:
+ """Options for optimize_from_ground_truth_options.
+
+ Mirrors OptimizationOptions but replaces variable_choices / user_input_options with
+ ground_truth_responses. Each GroundTruthSample bundles the user input, expected
+ response, and variable set for one evaluation. All N samples must pass their judges
+ in the same attempt for the optimization to succeed.
+
+ :param context_choices: One or more LD evaluation contexts to use.
+ :param ground_truth_responses: Ordered list of ground truth samples to evaluate.
+ At least 1 required. All samples share the same instructions and model being optimized.
+ :param max_attempts: Maximum number of variation attempts before the run is marked failed.
+ :param model_choices: Model IDs the variation generator may select from. At least 1 required.
+ :param judge_model: Model used for judge evaluation. Should remain consistent across attempts.
+ :param handle_agent_call: Callback that invokes the agent and returns its response.
+ :param handle_judge_call: Callback that invokes a judge LLM and returns its response.
+ :param judges: Auto-judges (config judges and/or acceptance statements) to score each response.
+ :param on_turn: Optional manual pass/fail callback applied per sample; skips judge scoring when provided.
+ :param on_sample_result: Called with each sample's OptimizationContext as results arrive,
+ before the overall pass/fail decision is made for the attempt.
+ :param on_passing_result: Called once with the last context when all N samples pass.
+ :param on_failing_result: Called once with the last context when max attempts are exhausted.
+ :param on_status_update: Called on each status transition during the run.
+ """
+
+ ground_truth_responses: List[GroundTruthSample]
+ max_attempts: int
+ model_choices: List[str]
+ judge_model: str
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ judges: Optional[Dict[str, OptimizationJudge]] = None
+ on_turn: Optional[Callable[[OptimizationContext], bool]] = None
+ on_sample_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_passing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_failing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_status_update: Optional[
+ Callable[
+ [
+ _StatusLiteral,
+ OptimizationContext,
+ ],
+ None,
+ ]
+ ] = None
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ # Auto-commit - Optional
+ auto_commit: bool = False
+ project_key: Optional[str] = None # required when auto_commit=True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
+ base_url: Optional[str] = None # override to target a non-default LD instance
+ token_limit: Optional[int] = None # stop the run when total token usage reaches this value
+
+ def __post_init__(self):
+ """Validate required options."""
+ if len(self.model_choices) < 1:
+ raise ValueError("model_choices must have at least 1 model")
+ if len(self.ground_truth_responses) < 1:
+ raise ValueError("ground_truth_responses must have at least 1 sample")
+ if self.judges is None and self.on_turn is None:
+ raise ValueError("Either judges or on_turn must be provided")
+
+
+@dataclass
+class OptimizationFromConfigOptions:
+ """User-provided options for optimize_from_config.
+
+ Fields that come from the LaunchDarkly API (max_attempts, model_choices,
+ judge_model, variable_choices, user_input_options, judges) are omitted here
+ and sourced from the fetched agent optimization config instead.
+
+ :param project_key: LaunchDarkly project key used to build API paths.
+ :param context_choices: One or more LD evaluation contexts to use.
+ :param handle_agent_call: Callback that invokes the agent and returns its response.
+ :param handle_judge_call: Callback that invokes a judge and returns its response.
+ :param on_turn: Optional manual pass/fail callback; when provided, judge scoring is skipped.
+ :param on_sample_result: Ground truth path only. Called with each sample's
+ OptimizationContext as results arrive during a ground truth run.
+ :param on_passing_result: Called with the winning OptimizationContext on success.
+ :param on_failing_result: Called with the final OptimizationContext on failure.
+ :param on_status_update: Called on each status transition; chained after the
+ automatic result-persistence POST so it always runs after the record is saved.
+ :param base_url: Base URL of the LaunchDarkly instance. Defaults to
+ https://app.launchdarkly.com. Override to target a staging instance.
+ """
+
+ project_key: str
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ on_turn: Optional[Callable[["OptimizationContext"], bool]] = None
+ on_sample_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_passing_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_failing_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_status_update: Optional[Callable[[_StatusLiteral, "OptimizationContext"], None]] = None
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ base_url: Optional[str] = None
+ # Auto-commit defaults to True for config-driven runs; set False to disable
+ auto_commit: bool = True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
diff --git a/packages/optimization/src/ldai_optimizer/ld_api_client.py b/packages/optimization/src/ldai_optimizer/ld_api_client.py
new file mode 100644
index 00000000..3efa725d
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/ld_api_client.py
@@ -0,0 +1,380 @@
+"""Internal LaunchDarkly REST API client for the optimization package."""
+
+import json
+import logging
+import time
+import urllib.error
+import urllib.request
+from typing import Any, Dict, List, Optional, TypedDict
+
+from ldai_optimizer.util import RedactionFilter
+
+logger = logging.getLogger(__name__)
+logger.addFilter(RedactionFilter())
+
+_BASE_URL = "https://app.launchdarkly.com"
+
+_MAX_RETRIES = 3
+_INITIAL_BACKOFF = 1.0 # seconds; doubles on each attempt (1s, 2s, 4s)
+
+# Status codes that warrant a retry. Everything else (including 400, 401, 403,
+# 404) is a permanent or auth failure — retrying would not help and could lead
+# to corrupted optimization results if some requests succeed and others fail.
+_RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})
+
+
+class LDApiError(Exception):
+ """Raised when the LaunchDarkly REST API returns an error or is unreachable.
+
+ Attributes:
+ status_code: HTTP status code, or None for network-level failures.
+ path: The API path that was requested.
+ """
+
+ def __init__(self, message: str, status_code: Optional[int] = None, path: str = "") -> None:
+ super().__init__(message)
+ self.status_code = status_code
+ self.path = path
+
+
+_HTTP_ERROR_HINTS: Dict[int, str] = {
+ 401: "Authentication failed — check that LAUNCHDARKLY_API_KEY is set correctly.",
+ 403: "Authorization failed — check that your API key has the required permissions.",
+ 404: "Resource not found — check that the project key and optimization config key are correct.",
+ 429: "Rate limit exceeded — too many requests to the LaunchDarkly API.",
+}
+
+_REQUIRED_STRING_FIELDS = ("id", "key", "aiConfigKey", "judgeModel")
+_REQUIRED_INT_FIELDS = ("maxAttempts", "version", "createdAt")
+_REQUIRED_LIST_FIELDS = (
+ "modelChoices",
+ "variableChoices",
+ "acceptanceStatements",
+ "judges",
+ "userInputOptions",
+)
+
+
+# ---------------------------------------------------------------------------
+# API response shapes
+# ---------------------------------------------------------------------------
+
+class _AcceptanceStatement(TypedDict):
+ statement: str
+ threshold: float
+
+
+class _AgentOptimizationJudge(TypedDict):
+ key: str
+ threshold: float
+
+
+class _AgentOptimizationConfigRequired(TypedDict):
+ id: str
+ key: str
+ aiConfigKey: str
+ maxAttempts: int
+ modelChoices: List[str]
+ judgeModel: str
+ variableChoices: List[Dict[str, Any]]
+ acceptanceStatements: List[_AcceptanceStatement]
+ judges: List[_AgentOptimizationJudge]
+ userInputOptions: List[str]
+ version: int
+ createdAt: int
+
+
+class AgentOptimizationConfig(_AgentOptimizationConfigRequired, total=False):
+ """Typed representation of the AgentOptimization API response."""
+
+ groundTruthResponses: List[str]
+ metricKey: str
+ tokenLimit: int
+
+
+# ---------------------------------------------------------------------------
+# Result payload shapes
+# ---------------------------------------------------------------------------
+
+class _AgentOptimizationResultPostRequired(TypedDict):
+ runId: str
+ agentOptimizationVersion: int
+ iteration: int
+ instructions: str
+
+
+class AgentOptimizationResultPost(_AgentOptimizationResultPostRequired, total=False):
+ """Payload for POST /agent-optimizations/{key}/results — creates a new result record."""
+
+ userInput: str
+ parameters: Dict[str, Any]
+
+
+class AgentOptimizationResultPatch(TypedDict, total=False):
+ """Payload for PATCH /agent-optimizations/{key}/results/{id} — updates a result record."""
+
+ status: str
+ activity: str
+ completionResponse: str
+ scores: Dict[str, Any]
+ generationLatency: int
+ generationTokens: Dict[str, int]
+ evaluationLatencies: Dict[str, float]
+ evaluationTokens: Dict[str, Dict[str, int]]
+ variation: Dict[str, Any]
+ createdVariationKey: str
+
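+
+# Typical lifecycle (illustrative; the variable names are placeholders): each
+# iteration creates one result record via POST and then updates it in place:
+#
+#     result_id = client.post_agent_optimization_result(project_key, opt_key, post_payload)
+#     if result_id is not None:
+#         client.patch_agent_optimization_result(project_key, opt_key, result_id, patch_payload)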
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+def _parse_agent_optimization(data: Any) -> AgentOptimizationConfig:
+ """Validate and cast a raw API response dict to AgentOptimizationConfig.
+
+ :param data: Parsed JSON response from the GET endpoint.
+ :return: The same dict narrowed to AgentOptimizationConfig.
+ :raises ValueError: If required fields are missing or have wrong types.
+ """
+ if not isinstance(data, dict):
+ raise ValueError(
+ f"Expected a JSON object from AgentOptimization API, got {type(data).__name__}"
+ )
+
+ errors: List[str] = []
+
+ for field in _REQUIRED_STRING_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], str):
+ errors.append(
+ f"field '{field}' must be a string, got {type(data[field]).__name__}"
+ )
+
+ for field in _REQUIRED_INT_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], int):
+ errors.append(
+ f"field '{field}' must be an integer, got {type(data[field]).__name__}"
+ )
+
+ for field in _REQUIRED_LIST_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], list):
+ errors.append(
+ f"field '{field}' must be a list, got {type(data[field]).__name__}"
+ )
+
+ if not errors and "modelChoices" in data and isinstance(data["modelChoices"], list):
+ if len(data["modelChoices"]) < 1:
+ errors.append("field 'modelChoices' must have at least 1 entry")
+
+ if errors:
+ raise ValueError(
+ f"Invalid AgentOptimization response: {'; '.join(errors)}"
+ )
+
+ return data # type: ignore[return-value]
+
+
+# ---------------------------------------------------------------------------
+# Client
+# ---------------------------------------------------------------------------
+
+class LDApiClient:
+ """Thin wrapper around the LaunchDarkly REST API for agent-optimization endpoints."""
+
+ def __init__(self, api_key: str, base_url: str = _BASE_URL) -> None:
+ self._api_key = api_key
+ self._base_url = base_url.rstrip("/")
+
+ def __repr__(self) -> str:
+ return f"LDApiClient(base_url={self._base_url!r})"
+
+ def _auth_headers(self) -> Dict[str, str]:
+ return {"Authorization": self._api_key}
+
+ def _request(
+ self,
+ method: str,
+ path: str,
+ body: Any = None,
+ extra_headers: Optional[Dict[str, str]] = None,
+ ) -> Any:
+ """Execute an HTTP request with automatic retry and exponential backoff.
+
+ Retries up to ``_MAX_RETRIES`` times for transient errors (429, 5xx,
+ network failures) with exponential backoff starting at ``_INITIAL_BACKOFF``
+ seconds. Non-retryable status codes (400, 401, 403, 404, …) are raised
+ immediately without retrying.
+
+ :param method: HTTP method (GET, POST, PATCH, …).
+ :param path: API path, appended to ``self._base_url``.
+ :param body: Optional request body; serialised to JSON.
+ :param extra_headers: Additional headers merged with the auth header.
+ :raises LDApiError: After all retry attempts are exhausted, or immediately
+ for non-retryable status codes.
+ """
+ url = f"{self._base_url}{path}"
+ headers = {**self._auth_headers(), **(extra_headers or {})}
+ data = json.dumps(body).encode() if body is not None else None
+ if data is not None:
+ headers["Content-Type"] = "application/json"
+
+ last_exc: Optional[LDApiError] = None
+ for attempt in range(_MAX_RETRIES + 1):
+ req = urllib.request.Request(url, data=data, headers=headers, method=method)
+ try:
+ with urllib.request.urlopen(req) as resp:
+ raw = resp.read()
+ return json.loads(raw) if raw else None
+ except urllib.error.HTTPError as exc:
+ body_excerpt = exc.read(500).decode(errors="replace")
+ hint = _HTTP_ERROR_HINTS.get(exc.code, "")
+ detail = f"{hint} (API response: {body_excerpt})" if hint else f"API response: {body_excerpt}"
+ api_error = LDApiError(
+ f"LaunchDarkly API error {exc.code} {exc.msg} for {method} {path}. {detail}",
+ status_code=exc.code,
+ path=path,
+ )
+ if exc.code not in _RETRYABLE_STATUS_CODES:
+ raise api_error from exc
+ last_exc = api_error
+ except urllib.error.URLError as exc:
+ last_exc = LDApiError(
+ f"Could not reach LaunchDarkly API at {url}: {exc.reason}. "
+ "Check your network connection and the base_url setting.",
+ path=path,
+ )
+
+ if attempt < _MAX_RETRIES:
+ delay = _INITIAL_BACKOFF * (2 ** attempt)
+ logger.warning(
+ "LaunchDarkly API request failed (attempt %d/%d, path=%s), "
+ "retrying in %.1fs: %s",
+ attempt + 1,
+ _MAX_RETRIES + 1,
+ path,
+ delay,
+ last_exc,
+ )
+ time.sleep(delay)
+
+ assert last_exc is not None
+ raise last_exc
+
+ def get_model_configs(self, project_key: str) -> List[Dict[str, Any]]:
+ """Fetch all AI model configs for a project.
+
+ :param project_key: LaunchDarkly project key.
+ :return: List of model config dicts (each has at minimum ``id`` and ``key``).
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/model-configs"
+ result = self._request("GET", path, extra_headers={"LD-API-Version": "beta"})
+ return result if isinstance(result, list) else []
+
+ def get_ai_config(self, project_key: str, config_key: str) -> Any:
+ """Fetch a single AI Config by key, including its variations.
+
+ :param project_key: LaunchDarkly project key.
+ :param config_key: Key of the AI Config (aiConfigKey).
+ :return: Raw AI Config dict with a ``variations`` list.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/{config_key}"
+ return self._request("GET", path, extra_headers={"LD-API-Version": "beta"})
+
+ def create_ai_config_variation(
+ self, project_key: str, config_key: str, payload: Dict[str, Any]
+ ) -> Any:
+ """Create a new variation on an AI Config.
+
+ :param project_key: LaunchDarkly project key.
+ :param config_key: Key of the AI Config.
+ :param payload: Variation payload (key, name, mode, instructions, model).
+ :return: Created AIConfigVariation dict.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/{config_key}/variations"
+ return self._request("POST", path, body=payload, extra_headers={"LD-API-Version": "beta"})
+
+ def get_agent_optimization(
+ self, project_key: str, optimization_key: str
+ ) -> AgentOptimizationConfig:
+ """Fetch and validate a single agent optimization config by key.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: Key of the agent optimization config.
+ :return: Validated AgentOptimizationConfig.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ :raises ValueError: If the response is missing required fields.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}"
+ raw = self._request("GET", path)
+ return _parse_agent_optimization(raw)
+
+ def post_agent_optimization_result(
+ self, project_key: str, optimization_key: str, payload: AgentOptimizationResultPost
+ ) -> Optional[str]:
+ """Create an iteration result record for the given optimization run.
+
+ Errors are caught and logged rather than raised so that persistence
+ failures never abort an in-progress optimization run.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param payload: POST payload for this iteration.
+ :return: The ``id`` of the newly created result record, or None on failure.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}/results"
+ try:
+ result = self._request("POST", path, body=payload)
+ return result.get("id") if isinstance(result, dict) else None
+ except LDApiError as exc:
+ logger.debug(
+ "Failed to persist optimization result (optimization_key=%s, iteration=%s): %s",
+ optimization_key,
+ payload.get("iteration"),
+ exc,
+ )
+ return None
+ except Exception as exc:
+ logger.debug(
+ "Unexpected error persisting optimization result (optimization_key=%s, iteration=%s): %s",
+ optimization_key,
+ payload.get("iteration"),
+ exc,
+ )
+ return None
+
+ def patch_agent_optimization_result(
+ self, project_key: str, optimization_key: str, result_id: str, payload: AgentOptimizationResultPatch
+ ) -> None:
+ """Update an existing iteration result record.
+
+ Errors are caught and logged rather than raised so that persistence
+ failures never abort an in-progress optimization run.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param result_id: ID of the result record to update.
+ :param payload: PATCH payload with fields to update.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}/results/{result_id}"
+ try:
+ self._request("PATCH", path, body=payload)
+ except LDApiError as exc:
+ logger.debug(
+ "Failed to update optimization result (result_id=%s): %s",
+ result_id,
+ exc,
+ )
+ except Exception as exc:
+ logger.debug(
+ "Unexpected error updating optimization result (result_id=%s): %s",
+ result_id,
+ exc,
+ )
diff --git a/packages/optimization/src/ldai_optimizer/prompts.py b/packages/optimization/src/ldai_optimizer/prompts.py
new file mode 100644
index 00000000..c699cb19
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/prompts.py
@@ -0,0 +1,557 @@
+"""Prompt-building functions for LaunchDarkly AI optimization."""
+
+import re
+from typing import Any, Dict, List, Optional
+
+from ldai_optimizer.dataclasses import (
+ OptimizationContext,
+ OptimizationJudge,
+)
+
+_DURATION_KEYWORDS = re.compile(
+ r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|"
+ r"time\s+to\s+respond|milliseconds|performant|snappy|efficient|seconds)\b|"
+ r"(? bool:
+ """Return True if any judge acceptance statement implies a latency optimization goal.
+
+ Scans each judge's acceptance_statement for latency-related keywords. The
+ check is case-insensitive. Returns False when judges is None or no judge
+ carries an acceptance statement.
+
+ :param judges: Judge configuration dict from OptimizationOptions, or None.
+ :return: True if duration optimization should be applied.
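+
+ Example (illustrative)::
+
+ judges = {"latency": OptimizationJudge(threshold=0.8,
+ acceptance_statement="Responses must be fast and accurate.")}
+ _acceptance_criteria_implies_duration_optimization(judges)  # True ("fast")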
+ """
+ if not judges:
+ return False
+ for judge in judges.values():
+ if judge.acceptance_statement and _DURATION_KEYWORDS.search(
+ judge.acceptance_statement
+ ):
+ return True
+ return False
+
+
+def build_message_history_text(
+ history: List[OptimizationContext],
+ input_text: str,
+ reasoning_history: str,
+ current_user_input: str,
+) -> str:
+ """
+ Build a formatted message-history string for use as a judge template variable.
+
+ Combines the current instructions (system text), the conversation turns
+ recorded in history, the current turn's user question, and the accumulated
+ reasoning/score history.
+
+ :param history: All previous OptimizationContexts, oldest first
+ :param input_text: Current system instructions (may be empty string)
+ :param reasoning_history: Pre-formatted string from build_reasoning_history
+ :param current_user_input: The user question for the turn being evaluated.
+ Must be passed explicitly because the current turn is not yet in
+ history when the judge runs.
+ :return: Combined string to substitute into the judge's message_history variable
+ """
+ turn_messages = []
+ for ctx in history:
+ if ctx.user_input:
+ turn_messages.append(f"User: {ctx.user_input}")
+ if ctx.completion_response:
+ turn_messages.append(f"Assistant: {ctx.completion_response}")
+
+ # Include the current turn's question so judges see what was actually asked
+ turn_messages.append(f"User: {current_user_input}")
+
+ parts = []
+ if input_text:
+ parts.append(f"System: {input_text}")
+ if turn_messages:
+ parts.append("\n".join(turn_messages))
+ if reasoning_history:
+ parts.append(f"Evaluation history:\n{reasoning_history}")
+
+ return "\n\n".join(parts)
+
+
+def build_reasoning_history(history: List[OptimizationContext]) -> str:
+ """
+ Build a formatted string of reasoning from previous iterations.
+
+ :param history: All previous OptimizationContexts, oldest first
+ :return: Formatted string containing reasoning history
+ """
+ if not history:
+ return ""
+
+ reasoning_parts = []
+ for i, prev_ctx in enumerate(history, 1):
+ if prev_ctx.scores:
+ reasoning_parts.append(f"## Iteration {i} Judge Evaluations:")
+ for judge_key, result in prev_ctx.scores.items():
+ reasoning_parts.append(f"- {judge_key}: Score {result.score}")
+ if result.rationale:
+ reasoning_parts.append(f" Reasoning: {result.rationale}")
+ reasoning_parts.append("")
+
+ return "\n".join(reasoning_parts)
+
+
+def build_new_variation_prompt(
+ history: List[OptimizationContext],
+ judges: Optional[Dict[str, OptimizationJudge]],
+ current_model: Optional[str],
+ current_instructions: str,
+ current_parameters: Dict[str, Any],
+ model_choices: List[str],
+ variable_choices: List[Dict[str, Any]],
+ initial_instructions: str,
+ optimize_for_duration: bool = False,
+) -> str:
+ """
+ Build the LLM prompt for generating an improved agent configuration.
+
+ Constructs a detailed instruction string based on the full optimization
+ history, including all previous configurations, completion results, and
+ judge scores. When history is empty (first variation attempt), asks the
+ LLM to improve the current config without evaluation feedback.
+
+ :param history: All previous OptimizationContexts, oldest first. Empty on the first attempt.
+ :param judges: Judge configuration dict from OptimizationOptions
+ :param current_model: The model currently in use
+ :param current_instructions: The current agent instructions template
+ :param current_parameters: The current model parameters dict
+ :param model_choices: List of model IDs the LLM may select from
+ :param variable_choices: List of variable dicts (used to derive placeholder names)
+ :param initial_instructions: The original unmodified instructions template
+ :param optimize_for_duration: When True, appends a duration optimization section
+ instructing the LLM to prefer faster models and simpler instructions.
+ :return: The assembled prompt string
+ """
+ sections = [
+ variation_prompt_preamble(),
+ variation_prompt_acceptance_criteria(judges),
+ variation_prompt_configuration(
+ history, current_model, current_instructions, current_parameters
+ ),
+ variation_prompt_feedback(history, judges),
+ variation_prompt_overfit_warning(history),
+ variation_prompt_improvement_instructions(
+ history, model_choices, variable_choices, initial_instructions
+ ),
+ variation_prompt_duration_optimization(model_choices) if optimize_for_duration else "",
+ ]
+
+ return "\n\n".join(s for s in sections if s)
+
+
+def variation_prompt_preamble() -> str:
+ """Static opening section for the variation generation prompt."""
+ return "\n".join(
+ [
+ "You are an assistant that helps improve agent configurations through iterative optimization.",
+ "",
+ "Your task is to generate improved agent instructions and parameters based on the feedback provided.",
+ "The feedback you provide should guide the LLM to improve the agent instructions "
+ "for all possible use cases, not one concrete case.",
+ "For example, if the feedback is that the agent is not returning the correct records, "
+ "you should improve the agent instructions to return the correct records for all possible use cases. "
+ "Not just the one concrete case that was provided in the feedback.",
+ "When changing the instructions, keep the original intent in mind "
+ "when it comes to things like the use of variables and placeholders.",
+ "If the original instructions were to use a placeholder like {{id}}, "
+ "you should keep the placeholder in the new instructions, not replace it with the actual value. "
+ "This is the case for all parameterized values (all parameters should appear in each new variation).",
+ "IMPORTANT: placeholder names are fixed identifiers (e.g. {{user_id}}, {{trip_purpose}}) — "
+ "never substitute the runtime value of a variable in place of its name. "
+ "For example, if the variable key is 'user_id' and its current value is 'user-125', "
+ "the placeholder MUST be written as {{user_id}}, NOT {{user-125}}.",
+ "Pay particular attention to the instructions regarding tools and the rules for variables.",
+ ]
+ )
+
+
+def variation_prompt_acceptance_criteria(
+ judges: Optional[Dict[str, OptimizationJudge]],
+) -> str:
+ """
+ Acceptance criteria section of the variation prompt.
+
+ Collects every acceptance statement defined across all judges and renders
+ them as an emphatic block so the LLM understands exactly what the improved
+ configuration must achieve. Returns an empty string when no judges carry
+ acceptance statements (e.g. all judges are config-key-only judges).
+ """
+ if not judges:
+ return ""
+
+ statements = [
+ (key, judge.acceptance_statement)
+ for key, judge in judges.items()
+ if judge.acceptance_statement
+ ]
+
+ if not statements:
+ return ""
+
+ lines = [
+ "## *** ACCEPTANCE CRITERIA (MUST BE MET) ***",
+ "The improved configuration MUST produce responses that satisfy ALL of the following criteria.",
+ "These criteria are non-negotiable — every generated variation will be evaluated against them.",
+ "All variables must be used in the new instructions.",
+ "",
+ ]
+ for key, statement in statements:
+ lines.append(f"- [{key}] {statement}")
+
+ lines += [
+ "",
+ "When writing new instructions, explicitly address each criterion above.",
+ "Do not sacrifice any criterion in favour of another.",
+ ]
+
+ return "\n".join(lines)
+
+
+def variation_prompt_configuration(
+ history: List[OptimizationContext],
+ current_model: Optional[str],
+ current_instructions: str,
+ current_parameters: Dict[str, Any],
+) -> str:
+ """
+ Configuration section of the variation prompt.
+
+ Shows the most recent iteration's model, instructions, parameters,
+ user input, and completion response when history is available, or the
+ current state on the first attempt.
+ """
+ if history:
+ previous_ctx = history[-1]
+ lines = [
+ "## Most Recent Configuration:",
+ f"Model: {previous_ctx.current_model}",
+ f"Instructions: {previous_ctx.current_instructions}",
+ f"Parameters: {previous_ctx.current_parameters}",
+ "",
+ "## Most Recent Result:",
+ ]
+ if previous_ctx.user_input:
+ lines.append(f"User question: {previous_ctx.user_input}")
+ lines.append(f"Agent response: {previous_ctx.completion_response}")
+ if previous_ctx.duration_ms is not None:
+ lines.append(f"Agent duration: {previous_ctx.duration_ms:.0f}ms")
+ return "\n".join(lines)
+ else:
+ return "\n".join(
+ [
+ "## Current Configuration:",
+ f"Model: {current_model}",
+ f"Instructions: {current_instructions}",
+ f"Parameters: {current_parameters}",
+ ]
+ )
+
+
+def variation_prompt_feedback(
+ history: List[OptimizationContext],
+ judges: Optional[Dict[str, OptimizationJudge]],
+) -> str:
+ """
+ Evaluation feedback section of the variation prompt.
+
+ Renders all previous iterations' scores in chronological order so the
+ LLM can observe trends across the full optimization run. Returns an
+ empty string when no history exists or no iteration has scores, so it
+ is filtered out of the assembled prompt entirely.
+ """
+ iterations_with_scores = [ctx for ctx in history if ctx.scores]
+ if not iterations_with_scores:
+ return ""
+
+ lines = ["## Evaluation History:"]
+ for ctx in iterations_with_scores:
+ lines.append(f"\n### Iteration {ctx.iteration}:")
+ if ctx.user_input:
+ lines.append(f"User question: {ctx.user_input}")
+ for judge_key, result in ctx.scores.items():
+ optimization_judge = judges.get(judge_key) if judges else None
+ if optimization_judge:
+ score = result.score
+ if optimization_judge.threshold is not None:
+ passed = score >= optimization_judge.threshold
+ status = "PASSED" if passed else "FAILED"
+ feedback_line = (
+ f"- {judge_key}: Score {score:.3f}"
+ f" (threshold: {optimization_judge.threshold}) - {status}"
+ )
+ else:
+ passed = score >= 1.0
+ status = "PASSED" if passed else "FAILED"
+ feedback_line = f"- {judge_key}: {status}"
+ if result.rationale:
+ feedback_line += f"\n Reasoning: {result.rationale}"
+ lines.append(feedback_line)
+ if ctx.duration_ms is not None:
+ lines.append(f"Agent duration: {ctx.duration_ms:.0f}ms")
+ return "\n".join(lines)
+
+
+def variation_prompt_overfit_warning(history: List[OptimizationContext]) -> str:
+ """
+ Overfitting warning section of the variation prompt.
+
+ Combines a general reminder to write generalizable instructions with
+ specific values from the most recent iteration so the LLM knows exactly
+ what concrete values to avoid embedding literally. Returns an empty string
+ when there is no history (first attempt, no feedback to overfit to).
+
+ :param history: All previous OptimizationContexts, oldest first.
+ :return: Overfitting warning block, or empty string if history is empty.
+ """
+ if not history:
+ return ""
+
+ recent = history[-1]
+
+ lines = [
+ "## *** OVERFITTING WARNING ***",
+ "Do NOT hardcode specific values from the evaluation feedback into the instructions.",
+ "The configuration must generalise to all possible inputs, not just the ones seen so far.",
+ "Write instructions that treat the values below as examples of a broader class of inputs,",
+ "not as literals to match.",
+ "",
+ "The following specific values appeared in the most recent iteration "
+ "— do not embed them literally:",
+ ]
+
+ if recent.user_input:
+ lines.append(f'- User input: "{recent.user_input}"')
+
+ if recent.current_variables:
+ for k, v in recent.current_variables.items():
+ lines.append(f' - placeholder {{{{{k}}}}}, current value: "{v}"')
+ lines.append(
+ " (These are the placeholder NAMES mapped to their current VALUES"
+ " — never use a value as a placeholder name)"
+ )
+
+ lines += [
+ "",
+ "If you find yourself writing instructions that only work for the exact values above,",
+ "step back and generalise: what rule, pattern, or intent do those values represent?",
+ "Write instructions that satisfy that rule for any valid input.",
+ ]
+
+ return "\n".join(lines)
+
+
+def variation_prompt_improvement_instructions(
+ history: List[OptimizationContext],
+ model_choices: List[str],
+ variable_choices: List[Dict[str, Any]],
+ initial_instructions: str,
+) -> str:
+ """
+ Improvement instructions section of the variation prompt.
+
+ Includes model-choice guidance, prompt variable rules, and the required
+ output format schema. When history is non-empty, adds feedback-driven
+ improvement directives.
+ """
+ model_instructions = "\n".join(
+ [
+ "You may also choose to change the model if you believe that the current model is "
+ "not performing well or a different model would be better suited for the task. "
+ f"Here are the models you may choose from: {model_choices}. "
+ "You must always return a model property, even if it's the same as the current model.",
+ "When suggesting a new model, you should provide a rationale for why you believe "
+ "the new model would be better suited for the task.",
+ ]
+ )
+
+ # Build a per-variable table: key → sorted list of unique example values
+ # collected across all variable_choices entries.
+ examples: Dict[str, List[str]] = {}
+ for choice in variable_choices:
+ for k, v in choice.items():
+ examples.setdefault(k, [])
+ sv = str(v)
+ if sv not in examples[k]:
+ examples[k].append(sv)
+
+ table_lines = [
+ "## Prompt Variables:",
+ "These are the ONLY valid placeholder names. "
+ "Use them exactly as shown (case-sensitive) with {{...}} syntax:",
+ "",
+ ]
+ for k in sorted(examples.keys()):
+ vals = ", ".join(f'"{v}"' for v in examples[k])
+ table_lines.append(f" - {{{{{k}}}}} (example values: {vals})")
+
+ # Build concrete bad/good counterexamples using the actual keys and values
+ # so the LLM cannot mistake a runtime value for a placeholder name.
+ first_key = sorted(examples.keys())[0] if examples else "variable_name"
+ first_val = examples[first_key][0] if examples.get(first_key) else "some-value"
+ table_lines += [
+ "",
+ "IMPORTANT: The names above are the KEYS — they are the placeholder names.",
+ "The values listed are only runtime examples that will be substituted at call time.",
+ "NEVER use a runtime value as a placeholder name.",
+ f'BAD: "...{{{{...{first_val}...}}}}..." '
+ f'— "{first_val}" is a runtime value, not a placeholder name',
+ f'GOOD: "...{{{{{first_key}}}}}..." '
+ f'— "{first_key}" is the correct placeholder name',
+ ]
+
+ variable_instructions = "\n".join(
+ table_lines
+ + [
+ "",
+ "If a placeholder is not present in the current instructions, "
+ "include it where logically appropriate.",
+ "Here are the original instructions so that you can see how the "
+ "placeholders are used and which are available:",
+ "\nSTART:" "\n" + initial_instructions + "\n",
+ "\nEND OF ORIGINAL INSTRUCTIONS\n",
+ ]
+ )
+
+ tool_instructions = "\n".join(
+ [
+ "## Tool Format:",
+ 'If the current configuration includes tools, you MUST return them '
+ 'unchanged in current_parameters["tools"].',
+ "Do NOT include internal framework tools such as the evaluation tool or structured output tool.",
+ "Each tool must follow this exact format:",
+ "{",
+ ' "name": "tool-name",',
+ ' "type": "function",',
+ ' "description": "What the tool does",',
+ ' "parameters": {',
+ ' "type": "object",',
+ ' "properties": {',
+ ' "param_name": {',
+ ' "type": "type of the input parameter",',
+ ' "description": "Description of the parameter"',
+ " }",
+ " },",
+ ' "required": ["param_name"],',
+ ' "additionalProperties": false',
+ " }",
+ "}",
+ "Example:",
+ "{",
+ ' "name": "user-preferences-lookup",',
+ ' "type": "function",',
+ ' "description": "Looks up user preferences by ID",',
+ ' "parameters": {',
+ ' "type": "object",',
+ ' "properties": {',
+ ' "user_id": {',
+ ' "type": "string",',
+ ' "description": "The user id"',
+ " }",
+ " },",
+ ' "required": ["user_id"],',
+ ' "additionalProperties": false',
+ " }",
+ "}",
+ "",
+ ]
+ )
+
+ parameters_instructions = "\n".join(
+ [
+ "Return these values in a JSON object with the following keys: "
+ "current_instructions, current_parameters, and model.",
+ "Example:",
+ "{",
+ ' "current_instructions": "...',
+ ' "current_parameters": {',
+ ' "...": "..."',
+ " },",
+ ' "model": "gpt-4o"',
+ "}",
+ "Parameters should only be things that are directly parseable by an LLM call, "
+ "for example, temperature, max_tokens, etc.",
+ "Do not include any other parameters that are not directly parseable by an LLM call. "
+ "If you want to provide instruction for tone or other attributes, "
+ "provide them directly in the instructions.",
+ ]
+ )
+
+ if history:
+ return "\n".join(
+ [
+ "## Improvement Instructions:",
+ "Based on the evaluation history above, generate improved agent instructions and parameters.",
+ "Focus on addressing the areas where the evaluation failed or scored below threshold.",
+ "The new configuration should aim to improve the agent's performance on the evaluation criteria.",
+ model_instructions,
+ "",
+ variable_instructions,
+ "",
+ tool_instructions,
+ "",
+ "Return the improved configuration in a structured format that can be parsed to update:",
+ "1. The agent instructions (current_instructions)",
+ "2. The agent parameters (current_parameters)",
+ "3. The model (model) - you must always return a model, "
+ "even if it's the same as the current model.",
+ "4. You should return the tools the user has defined, as-is, on the new parameters. "
+ "Do not modify them, but make sure you do not include internal tools like "
+ "the evaluation tool or structured output tool.",
+ parameters_instructions,
+ ]
+ )
+ else:
+ return "\n".join(
+ [
+ "Generate an improved version of this configuration.",
+ model_instructions,
+ "",
+ variable_instructions,
+ "",
+ tool_instructions,
+ "",
+ parameters_instructions,
+ ]
+ )
+
+
+def variation_prompt_duration_optimization(model_choices: List[str]) -> str:
+ """
+ Duration optimization section of the variation prompt.
+
+ Included when acceptance criteria imply a latency reduction goal. Instructs
+ the LLM to treat response speed as a secondary objective — quality criteria
+ must still be met first — and provides concrete guidance on how to reduce
+ latency through model selection and instruction simplification.
+
+ :param model_choices: List of model IDs the LLM may select from, so it can
+ apply its own knowledge of which models tend to be faster.
+ :return: The duration optimization prompt block.
+ """
+ return "\n".join(
+ [
+ "## Duration Optimization:",
+ "The acceptance criteria for this optimization implies that response latency should be reduced.",
+ "In addition to improving quality, generate a variation that aims to reduce the agent's response time.",
+ "You may:",
+ "- Select a faster model from the available choices if quality requirements can still be met.",
+ f" Available models: {model_choices}",
+ " Use your knowledge of these models to prefer those that are known to respond more quickly.",
+ "- Simplify or shorten the instructions where this does not compromise the acceptance criteria.",
+ " Shorter prompts reduce input token counts and typically yield faster responses.",
+ "- Avoid increasing max_tokens or other parameters that extend generation time.",
+ "Quality criteria remain the primary objective — do not sacrifice passing scores to achieve lower latency.",
+ ]
+ )
diff --git a/packages/optimization/src/ldai_optimizer/util.py b/packages/optimization/src/ldai_optimizer/util.py
new file mode 100644
index 00000000..6f757602
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/util.py
@@ -0,0 +1,305 @@
+"""Utility functions for the LaunchDarkly AI optimization package."""
+
+import inspect
+import json
+import logging
+import random
+import re
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, TypeVar, Union
+
+from ldai_optimizer._slug_words import _ADJECTIVES, _NOUNS
+
+logger = logging.getLogger(__name__)
+
+# Matches LaunchDarkly API key, SDK key, and CLI token formats, e.g.:
+#   api-xxxxxxxxxxxxxxxx
+#   sdk-xxxxxxxxxxxxxxxx
+#   cli-xxxxxxxxxxxxxxxx
+_KEY_PATTERN = re.compile(r"\b(api|sdk|cli)-[A-Za-z0-9_\-]{16,}\b")
+
+
+class RedactionFilter(logging.Filter):
+ """Logging filter that redacts strings resembling LaunchDarkly API keys.
+
+ Scrubs both the format string (``record.msg``) and each positional argument
+ (``record.args``) before the handler formats the final log line, so raw key
+ values are never written to any log destination.
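+
+ Example (illustrative key value)::
+
+ log = logging.getLogger("my_app")
+ log.addFilter(RedactionFilter())
+ log.warning("using key %s", "api-0123456789abcdef0123")
+ # emitted as: "using key [REDACTED]"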
+ """
+
+ def filter(self, record: logging.LogRecord) -> bool:
+ record.msg = _KEY_PATTERN.sub("[REDACTED]", str(record.msg))
+ if record.args:
+ record.args = tuple(
+ _KEY_PATTERN.sub("[REDACTED]", str(a)) if isinstance(a, str) else a
+ for a in (record.args if isinstance(record.args, tuple) else (record.args,))
+ )
+ return True
+
+
+logger.addFilter(RedactionFilter())
+
+
+def generate_slug() -> str:
+ """Generate a random ``adjective-noun`` slug (e.g. ``blazing-lobster``).
+
+ Produces the same format as ``coolname.generate_slug(2)`` using an
+ internal word list, removing the external dependency.
+
+ :return: A hyphen-joined two-word lowercase string.
+ """
+ return f"{random.choice(_ADJECTIVES)}-{random.choice(_NOUNS)}"
+
+
+def interpolate_variables(text: str, variables: Dict[str, Any]) -> str:
+ """
+ Interpolate ``{{variable}}`` placeholders in text using the provided variables.
+
+ Matches LaunchDarkly's Mustache-style template format so that manually
+ generated variation instructions use the same syntax as LD-fetched templates.
+ Unrecognised placeholders are left unchanged.
+
+ :param text: Template string potentially containing ``{{key}}`` placeholders
+ :param variables: Mapping of variable names to their replacement values
+ :return: Text with all recognised placeholders replaced
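+
+ Example::
+
+ interpolate_variables("Answer in {{language}}.", {"language": "French"})
+ # -> "Answer in French."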
+ """
+ def replace(match: re.Match) -> str:
+ key = match.group(1).strip()
+ return str(variables[key]) if key in variables else match.group(0)
+
+ return re.sub(r"\{\{([\w-]+)\}\}", replace, text)
+
+
+def restore_variable_placeholders(
+ text: str,
+ variable_choices: List[Dict[str, Any]],
+ min_value_length: int = 3,
+) -> Tuple[str, List[str]]:
+ """
+ Scan ``text`` for leaked variable values and restore them to ``{{key}}`` form.
+
+ This is the deterministic inverse of :func:`interpolate_variables`. It acts
+ as a post-processing safety net after variation generation: when the LLM
+ hardcodes a concrete variable value (e.g. ``user-123``) instead of writing
+ the placeholder (``{{user_id}}``), this function replaces the value back so
+ subsequent iterations receive correctly templated instructions.
+
+ Values are matched with boundary guards so that a value like ``user-123``
+ inside a longer token like ``user-1234`` is not substituted. Multi-line
+ values are handled identically to single-line ones — ``re.escape`` produces
+ a literal pattern and the lookbehind/lookahead only inspect the character
+ immediately adjacent to the match boundary.
+
+ Values shorter than ``min_value_length`` characters are skipped because
+ short strings (e.g. ``"en"``, ``"US"``) are too likely to appear
+ coincidentally in unrelated prose.
+
+ :param text: The generated instruction string to clean.
+ :param variable_choices: All possible variable dicts, used to build the
+ reverse value→key map. When the same value appears under multiple keys
+ the first key encountered wins.
+ :param min_value_length: Minimum character length a value must have before
+ it is considered for replacement. Defaults to 3.
+ :return: A tuple of ``(cleaned_text, warnings)`` where ``warnings`` is a
+ list of human-readable strings describing each replacement made.
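+
+ Example (illustrative)::
+
+ text, warnings = restore_variable_placeholders(
+ "Always greet user-123 by name.",
+ [{"user_id": "user-123"}],
+ )
+ # text == "Always greet {{user_id}} by name."; warnings has one entry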
+ """
+ # Build reverse map: string(value) → key. Longest values first so that
+ # a longer value like "user-123-admin" is replaced before the shorter
+ # "user-123" substring, preventing partial-match corruption.
+ value_to_key: Dict[str, str] = {}
+ for choice in variable_choices:
+ for key, value in choice.items():
+ str_value = str(value)
+ if str_value not in value_to_key:
+ value_to_key[str_value] = key
+
+ sorted_entries = sorted(value_to_key.items(), key=lambda kv: len(kv[0]), reverse=True)
+
+ warnings: List[str] = []
+ for value, key in sorted_entries:
+ if len(value) < min_value_length:
+ continue
+ placeholder = f"{{{{{key}}}}}"
+ # Nothing to fix when the placeholder is already present and the raw value no longer appears.
+ if placeholder in text and value not in text:
+ continue
+
+ total_count = 0
+
+ # Pass 1: replace {{value}} forms — the LLM used the runtime value as
+ # if it were a placeholder key (e.g. {{user-125}} instead of {{user_id}}).
+ # This must run before the boundary-guarded pass so that the bare value
+ # inside the braces is consumed here rather than matched by pass 2,
+ # which would otherwise leave the surrounding braces and produce
+ # {{{{user_id}}}}.
+ brace_pattern = r'\{\{' + re.escape(value) + r'\}\}'
+ new_text, brace_count = re.subn(brace_pattern, placeholder, text, flags=re.DOTALL)
+ if brace_count:
+ text = new_text
+ total_count += brace_count
+
+ # Pass 2: replace bare value occurrences with a boundary guard so that
+ # "user-123" inside "user-1234" is not substituted.
+ pattern = r'(?<![\w-])' + re.escape(value) + r'(?![\w-])'
+ new_text, bare_count = re.subn(pattern, placeholder, text)
+ if bare_count:
+ text = new_text
+ total_count += bare_count
+
+ if total_count:
+ warnings.append(
+ f'Replaced value "{value}" with placeholder {placeholder} '
+ f"({total_count} occurrence(s))"
+ )
+
+ return text, warnings
+
+
+_T = TypeVar("_T")
+
+
+async def resolve_awaitable(result: Union[_T, Awaitable[_T]]) -> _T:
+ """
+ Handle both sync and async callable results.
+
+ :param result: Either a value or an awaitable that returns a value
+ :return: The resolved value
+ """
+ if inspect.isawaitable(result):
+ return await result # type: ignore[return-value]
+ return result # type: ignore[return-value]
+
+
+def validate_variation_response(response_data: Dict[str, Any]) -> List[str]:
+ """Validate the shape of a parsed LLM variation response.
+
+ Checks that the three required fields are present and have the expected
+ types. An empty ``current_parameters`` dict is acceptable; an empty
+ ``current_instructions`` or ``model`` string is flagged as an error
+ because downstream code cannot meaningfully use a blank value.
+
+ :param response_data: Parsed dict from the LLM (output of extract_json_from_response).
+ :return: List of human-readable error strings. Empty list means the response is valid.
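+
+ Example::
+
+ validate_variation_response({"current_instructions": "Be brief.",
+ "current_parameters": {}, "model": "gpt-4o"})
+ # -> [] (all required fields present and correctly typed)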
+ """
+ errors: List[str] = []
+
+ if "current_instructions" not in response_data:
+ errors.append("missing required field 'current_instructions'")
+ elif not isinstance(response_data["current_instructions"], str):
+ errors.append(
+ f"'current_instructions' must be a string, "
+ f"got {type(response_data['current_instructions']).__name__}"
+ )
+ elif not response_data["current_instructions"].strip():
+ errors.append("'current_instructions' must not be empty")
+
+ if "current_parameters" not in response_data:
+ errors.append("missing required field 'current_parameters'")
+ elif not isinstance(response_data["current_parameters"], dict):
+ errors.append(
+ f"'current_parameters' must be a dict, "
+ f"got {type(response_data['current_parameters']).__name__}"
+ )
+
+ if "model" not in response_data:
+ errors.append("missing required field 'model'")
+ elif not isinstance(response_data["model"], str):
+ errors.append(
+ f"'model' must be a string, got {type(response_data['model']).__name__}"
+ )
+
+ return errors
+
+
+def extract_json_from_response(response_str: str) -> Dict[str, Any]:
+ """
+ Parse a JSON object from an LLM response string.
+
+ Attempts direct JSON parsing first, then progressively falls back to
+ extracting JSON from markdown code blocks and balanced-brace scanning.
+
+ :param response_str: Raw string response from an LLM
+ :return: Parsed dictionary
+ :raises ValueError: If no valid JSON object can be extracted
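+
+ Example (handled by the balanced-brace scanner because of the leading prose)::
+
+ extract_json_from_response(
+ 'Here you go: {"current_instructions": "Be brief.", '
+ '"current_parameters": {}, "model": "gpt-4o"}'
+ )
+ # -> {'current_instructions': 'Be brief.', 'current_parameters': {}, 'model': 'gpt-4o'}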
+ """
+ # Try direct parse first
+ try:
+ return json.loads(response_str)
+ except json.JSONDecodeError:
+ pass
+
+ response_data: Optional[Dict[str, Any]] = None
+
+ # Try to extract JSON from markdown code blocks
+ code_block_match = re.search(
+ r'```(?:json)?\s*(\{.*?\})\s*```',
+ response_str,
+ re.DOTALL,
+ )
+ if code_block_match:
+ try:
+ response_data = json.loads(code_block_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # Try balanced-brace scanning
+ if response_data is None:
+ start_idx = response_str.find('{')
+ if start_idx != -1:
+ logger.warning(
+ "Direct JSON parse and code-block extraction failed; "
+ "falling back to balanced-brace scanner. "
+ "Response may be malformed JSON (length: %d).",
+ len(response_str),
+ )
+ while start_idx != -1 and response_data is None:
+ brace_count = 0
+ i = start_idx
+ while i < len(response_str):
+ if response_str[i] == '{':
+ brace_count += 1
+ elif response_str[i] == '}':
+ brace_count -= 1
+ if brace_count == 0:
+ json_str = response_str[start_idx:i + 1]
+ try:
+ response_data = json.loads(json_str)
+ except json.JSONDecodeError:
+ start_idx = response_str.find('{', start_idx + 1)
+ break
+ i += 1
+ else:
+ # Exhausted the string without closing the object
+ break
+
+ # Legacy regex fallback
+ if response_data is None:
+ json_match = re.search(
+ r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*"current_instructions"[^{}]*(?:\{[^{}]*\}[^{}]*)*\}',
+ response_str,
+ re.DOTALL,
+ )
+ if json_match:
+ try:
+ response_data = json.loads(json_match.group())
+ except json.JSONDecodeError:
+ logger.debug(
+ "Extracted JSON string failed to parse: %s",
+ json_match.group()[:200],
+ )
+ raise ValueError(
+ "Failed to parse extracted JSON from variation generation response"
+ )
+
+ if response_data is None:
+ logger.debug(
+ "Failed to extract JSON from response. "
+ "Response length: %d",
+ len(response_str),
+ )
+ raise ValueError(
+ "Failed to parse structured output from variation generation. "
+ "Expected JSON object with 'current_instructions', 'current_parameters', and 'model' fields. "
+ f"Response length: {len(response_str)}"
+ )
+
+ return response_data
diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py
new file mode 100644
index 00000000..c441eedc
--- /dev/null
+++ b/packages/optimization/tests/test_client.py
@@ -0,0 +1,4406 @@
+"""Tests for OptimizationClient."""
+
+import json
+from typing import Any, Dict, Optional
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient
+from ldai.models import LDMessage, ModelConfig
+from ldai.tracker import TokenUsage
+from ldclient import Context
+
+from ldai_optimizer.client import OptimizationClient, _compute_validation_count, _find_model_config
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ JudgeResult,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.prompts import (
+ _acceptance_criteria_implies_duration_optimization,
+ build_new_variation_prompt,
+ variation_prompt_acceptance_criteria,
+ variation_prompt_improvement_instructions,
+ variation_prompt_overfit_warning,
+ variation_prompt_preamble,
+)
+from ldai_optimizer.util import (
+ interpolate_variables,
+ restore_variable_placeholders,
+)
+
+# ---------------------------------------------------------------------------
+# Shared helpers / fixtures
+# ---------------------------------------------------------------------------
+
+LD_CONTEXT = Context.create("test-user")
+
+AGENT_INSTRUCTIONS = "You are a helpful assistant. Answer using {{language}}."
+VARIATION_RESPONSE = json.dumps({
+ "current_instructions": "You are an improved assistant.",
+ "current_parameters": {"temperature": 0.5},
+ "model": "gpt-4o",
+})
+JUDGE_PASS_RESPONSE = json.dumps({"score": 1.0, "rationale": "Perfect answer."})
+JUDGE_FAIL_RESPONSE = json.dumps({"score": 0.2, "rationale": "Off topic."})
+
+
+def _make_agent_config(
+ instructions: str = AGENT_INSTRUCTIONS,
+ model_name: str = "gpt-4o",
+ parameters: Optional[Dict[str, Any]] = None,
+) -> AIAgentConfig:
+ return AIAgentConfig(
+ key="test-agent",
+ enabled=True,
+ create_tracker=MagicMock,
+ model=ModelConfig(name=model_name, parameters=parameters or {}),
+ instructions=instructions,
+ )
+
+
+def _make_ldai_client(agent_config: Optional[AIAgentConfig] = None) -> MagicMock:
+ mock = MagicMock(spec=LDAIClient)
+ mock.agent_config.return_value = agent_config or _make_agent_config()
+ mock._client = MagicMock()
+ mock._client.variation.return_value = {"instructions": AGENT_INSTRUCTIONS}
+ return mock
+
+
+def _make_options(
+ *,
+ handle_agent_call=None,
+ handle_judge_call=None,
+ judges=None,
+ max_attempts: int = 3,
+ variable_choices=None,
+ **extra,
+) -> OptimizationOptions:
+ if handle_agent_call is None:
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="The capital of France is Paris."))
+ if handle_judge_call is None:
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ if judges is None:
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and concise.",
+ )
+ }
+ return OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=max_attempts,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ judge_model="gpt-4o",
+ variable_choices=variable_choices or [{"language": "English"}],
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges=judges,
+ **extra,
+ )
+
+
+def _make_client(ldai: Optional[MagicMock] = None) -> OptimizationClient:
+ client = OptimizationClient(ldai or _make_ldai_client())
+ return client
+
+
+# ---------------------------------------------------------------------------
+# Util functions
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# _find_model_config
+# ---------------------------------------------------------------------------
+
+
+class TestFindModelConfig:
+ def test_returns_none_when_no_configs(self):
+ assert _find_model_config("gpt-4o", []) is None
+
+ def test_returns_none_when_no_id_match(self):
+ configs = [{"id": "claude-3", "key": "Anthropic.claude-3", "global": True}]
+ assert _find_model_config("gpt-4o", configs) is None
+
+ def test_returns_single_match(self):
+ configs = [{"id": "gpt-4o", "key": "OpenAI.gpt-4o", "global": False}]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "OpenAI.gpt-4o"
+
+ def test_prefers_global_match_over_non_global(self):
+ configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "global.gpt-4o"
+
+ def test_prefers_global_match_regardless_of_list_order(self):
+ configs = [
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result["key"] == "global.gpt-4o"
+
+ def test_falls_back_to_non_global_when_no_global_exists(self):
+ configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "project.gpt-4o"
+
+ def test_treats_missing_global_field_as_non_global(self):
+ configs = [
+ {"id": "gpt-4o", "key": "no-global-field.gpt-4o"},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result["key"] == "global.gpt-4o"
+
+
+# ---------------------------------------------------------------------------
+# _extract_agent_tools
+# ---------------------------------------------------------------------------
+
+
+class TestExtractAgentTools:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "test-agent"
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ def test_returns_empty_list_when_no_tools(self):
+ result = self.client._extract_agent_tools({})
+ assert result == []
+
+ def test_returns_empty_list_when_tools_key_is_empty(self):
+ result = self.client._extract_agent_tools({"tools": []})
+ assert result == []
+
+ def test_returns_structured_output_tool_from_dict(self):
+ tool_dict = {
+ "name": "lookup",
+ "description": "Looks up data",
+ "input_schema": {"type": "object", "properties": {}},
+ }
+ result = self.client._extract_agent_tools({"tools": [tool_dict]})
+ assert len(result) == 1
+ assert isinstance(result[0], ToolDefinition)
+ assert result[0].name == "lookup"
+
+ def test_passes_through_existing_structured_output_tool(self):
+ tool = ToolDefinition(
+ name="my-tool", description="desc", input_schema={}
+ )
+ result = self.client._extract_agent_tools({"tools": [tool]})
+ assert result == [tool]
+
+ def test_wraps_single_non_list_tool(self):
+ tool_dict = {"name": "single", "description": "x", "input_schema": {}}
+ result = self.client._extract_agent_tools({"tools": tool_dict})
+ assert len(result) == 1
+ assert result[0].name == "single"
+
+ def test_converts_object_with_to_dict(self):
+ mock_tool = MagicMock()
+ mock_tool.to_dict.return_value = {
+ "name": "converted",
+ "description": "via to_dict",
+ "input_schema": {},
+ }
+ result = self.client._extract_agent_tools({"tools": [mock_tool]})
+ assert len(result) == 1
+ assert result[0].name == "converted"
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_response
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateResponse:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._options = _make_options()
+
+ def _ctx_with_scores(self, scores: Dict[str, JudgeResult]) -> OptimizationContext:
+ return OptimizationContext(
+ scores=scores,
+ completion_response="Some response.",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+
+ def test_passes_when_all_judges_meet_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.9)})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_fails_when_judge_below_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.5)})
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_fails_when_judge_result_missing(self):
+ ctx = self._ctx_with_scores({})
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_passes_at_exact_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.8)})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_no_judges_always_passes(self):
+ # _evaluate_response treats a missing judges config as an automatic pass,
+ # so only the judges attribute needs to be stubbed.
+ options_no_judges = MagicMock()
+ options_no_judges.judges = None
+ self.client._options = options_no_judges
+ ctx = self._ctx_with_scores({})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_multiple_judges_all_must_pass(self):
+ self.client._options = _make_options(
+ judges={
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"),
+ "b": OptimizationJudge(threshold=0.9, acceptance_statement="B"),
+ }
+ )
+ ctx = self._ctx_with_scores({
+ "a": JudgeResult(score=0.9),
+ "b": JudgeResult(score=0.7), # fails
+ })
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_multiple_judges_all_passing(self):
+ self.client._options = _make_options(
+ judges={
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"),
+ "b": OptimizationJudge(threshold=0.8, acceptance_statement="B"),
+ }
+ )
+ ctx = self._ctx_with_scores({
+ "a": JudgeResult(score=0.9),
+ "b": JudgeResult(score=1.0),
+ })
+ assert self.client._evaluate_response(ctx) is True
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_acceptance_judge
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateAcceptanceJudge:
+ def setup_method(self):
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
+
+ async def test_returns_parsed_score_and_rationale(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must be concise."
+ )
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="conciseness",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is the capital of France?",
+ )
+ assert result.score == 1.0
+ assert result.rationale == "Perfect answer."
+
+ async def test_handle_judge_call_receives_correct_key_and_config(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must answer the question."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="relevance",
+ optimization_judge=judge,
+ completion_response="Some answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What time is it?",
+ )
+ call_args = self.handle_judge_call.call_args
+ key, config, ctx, _ = call_args.args
+ assert key == "relevance"
+ assert isinstance(config, AIJudgeCallConfig)
+ assert isinstance(ctx, OptimizationJudgeContext)
+
+ async def test_messages_has_system_and_user_turns(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must be factual."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="facts",
+ optimization_judge=judge,
+ completion_response="The sky is blue.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What colour is the sky?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ roles = [m.role for m in config.messages]
+ assert roles == ["system", "user"]
+
+ async def test_messages_system_content_matches_instructions(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Be concise."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="brevity",
+ optimization_judge=judge,
+ completion_response="Yes.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Is Paris in France?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ system_msg = next(m for m in config.messages if m.role == "system")
+ assert system_msg.content == config.instructions
+
+ async def test_messages_user_content_matches_context_user_input(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Answer directly."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="directness",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital of France?",
+ )
+ _, config, ctx, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert user_msg.content == ctx.user_input
+
+ async def test_acceptance_statement_in_instructions(self):
+ statement = "Response must mention the Eiffel Tower."
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement=statement)
+ await self.client._evaluate_acceptance_judge(
+ judge_key="tower",
+ optimization_judge=judge,
+ completion_response="Paris has the Eiffel Tower.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Tell me about Paris.",
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ assert statement in config.instructions
+
+ async def test_no_structured_output_tool_in_judge_config(self):
+ """Structured output tool must not be injected — judges return plain JSON."""
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be brief.")
+ await self.client._evaluate_acceptance_judge(
+ judge_key="brevity",
+ optimization_judge=judge,
+ completion_response="Yes.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Is Paris in France?",
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ tools = config.model.get_parameter("tools") or []
+ assert tools == []
+
+ async def test_agent_tools_included_in_config_tools(self):
+ agent_tool = ToolDefinition(
+ name="lookup", description="Lookup data", input_schema={}
+ )
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Use tool.")
+ await self.client._evaluate_acceptance_judge(
+ judge_key="tool-use",
+ optimization_judge=judge,
+ completion_response="I looked it up.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Find me something.",
+ agent_tools=[agent_tool],
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ tools = config.model.get_parameter("tools") or []
+ tool_names = [t["name"] for t in tools]
+ assert tool_names == ["lookup"]
+
+ async def test_variables_in_context(self):
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ variables = {"language": "French", "topic": "geography"}
+ await self.client._evaluate_acceptance_judge(
+ judge_key="accuracy",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital?",
+ variables=variables,
+ )
+ call_args = self.handle_judge_call.call_args
+ _, _, ctx, _ = call_args.args
+ assert ctx.current_variables == variables
+
+ async def test_duration_context_added_to_instructions_when_latency_keyword_present(self):
+ """When acceptance statement has a latency keyword and agent_duration_ms is provided,
+ the instructions mention the duration."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1500.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "1500ms" in config.instructions
+ assert "mention the duration" in config.instructions
+
+ async def test_duration_context_includes_baseline_comparison_when_history_present(self):
+ """When history[0] has a duration, the judge instructions include a baseline comparison."""
+ self.client._history = [
+ OptimizationContext(
+ scores={},
+ completion_response="old response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ duration_ms=2000.0,
+ )
+ ]
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should have low latency.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="latency",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1500.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "1500ms" in config.instructions
+ assert "2000ms" in config.instructions
+ assert "faster" in config.instructions
+
+ async def test_duration_context_says_slower_when_candidate_is_slower(self):
+ """When the candidate is slower than baseline, the instructions say 'slower'."""
+ self.client._history = [
+ OptimizationContext(
+ scores={},
+ completion_response="old response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ duration_ms=1000.0,
+ )
+ ]
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1800.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "slower" in config.instructions
+
+ async def test_duration_context_not_added_when_no_latency_keyword(self):
+ """When acceptance statement has no latency keyword, duration is not injected."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="accuracy",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital of France?",
+ agent_duration_ms=2000.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "2000ms" not in config.instructions
+ assert "duration" not in config.instructions.lower() or "acceptance" in config.instructions.lower()
+
+ async def test_duration_context_not_added_when_agent_duration_ms_is_none(self):
+ """When agent_duration_ms is None, no duration block is added even if keyword matches."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=None,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "mention the duration" not in config.instructions
+
+ async def test_returns_zero_score_on_missing_acceptance_statement(self):
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement=None)
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="broken",
+ optimization_judge=judge,
+ completion_response="Anything.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Hello?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_returns_zero_score_on_parse_failure(self):
+ self.handle_judge_call.return_value = OptimizationResponse(output="not json at all")
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be clear.")
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="clarity",
+ optimization_judge=judge,
+ completion_response="Clear answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Explain X.",
+ )
+ assert result.score == 0.0
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_config_judge
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateConfigJudge:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+ self.client = _make_client(self.mock_ldai)
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
+
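+    # Minimal judge config, standing in for what LDAIClient.judge_config returns in these tests.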
+ def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig:
+ return AIJudgeConfig(
+ key="ld-judge-key",
+ enabled=enabled,
+ create_tracker=MagicMock,
+ model=ModelConfig(name="gpt-4o", parameters={}),
+ messages=[
+ LDMessage(role="system", content="You are an evaluator."),
+ LDMessage(role="user", content="Evaluate this response."),
+ ],
+ )
+
+ async def test_calls_handle_judge_call_with_correct_config_type(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ call_args = self.handle_judge_call.call_args
+ key, config, ctx, _ = call_args.args
+ assert key == "quality"
+ assert isinstance(config, AIJudgeCallConfig)
+ assert "You are an evaluator." in config.instructions
+ assert isinstance(ctx, OptimizationJudgeContext)
+
+ async def test_messages_has_system_and_user_turns(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ roles = [m.role for m in config.messages]
+ assert roles == ["system", "user"]
+
+ async def test_messages_system_content_matches_instructions(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ system_msg = next(m for m in config.messages if m.role == "system")
+ assert system_msg.content == config.instructions
+
+ async def test_messages_user_content_matches_context_user_input(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, ctx, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert user_msg.content == ctx.user_input
+
+ async def test_messages_user_content_contains_ld_user_message(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert "Evaluate this response." in user_msg.content
+
+ async def test_returns_zero_score_when_judge_disabled(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False)
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ result = await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Some answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_returns_zero_score_when_judge_has_no_messages(self):
+ judge_config = AIJudgeConfig(
+ key="ld-judge-key",
+ enabled=True,
+ create_tracker=MagicMock,
+ model=ModelConfig(name="gpt-4o", parameters={}),
+ messages=None,
+ )
+ self.mock_ldai.judge_config.return_value = judge_config
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ result = await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Any.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Anything?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_template_variables_merged_into_judge_config_call(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ variables = {"language": "Spanish"}
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Q?",
+ variables=variables,
+ )
+        call_args = self.mock_ldai.judge_config.call_args
+        passed_vars = call_args.args[3] if call_args.args else call_args.kwargs.get("variables", {})
+ assert passed_vars.get("language") == "Spanish"
+ assert "message_history" in passed_vars
+ assert "response_to_evaluate" in passed_vars
+
+ async def test_agent_tools_included_without_evaluation_tool(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ agent_tool = ToolDefinition(name="search", description="Search", input_schema={})
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Q?",
+ agent_tools=[agent_tool],
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ tools = config.model.get_parameter("tools") or []
+ names = [t["name"] for t in tools]
+ assert names == ["search"]
+
+
+# ---------------------------------------------------------------------------
+# _execute_agent_turn
+# ---------------------------------------------------------------------------
+
+
+class TestExecuteAgentTurn:
+ def setup_method(self):
+ self.agent_response = "Paris is the capital of France."
+ self.handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=self.agent_response))
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.client._options = _make_options(
+ handle_agent_call=self.handle_agent_call,
+ handle_judge_call=self.handle_judge_call,
+ )
+
+ def _make_context(self, user_input: str = "What is the capital of France?") -> OptimizationContext:
+ return OptimizationContext(
+ scores={},
+ completion_response="",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ current_variables={"language": "English"},
+ current_model="gpt-4o",
+ user_input=user_input,
+ iteration=1,
+ )
+
+ async def test_calls_handle_agent_call_with_config_and_context(self):
+ ctx = self._make_context()
+ await self.client._execute_agent_turn(ctx, iteration=1)
+ self.handle_agent_call.assert_called_once()
+ key, config, passed_ctx, _ = self.handle_agent_call.call_args.args
+ assert key == "test-agent"
+ assert isinstance(config, AIAgentConfig)
+ assert passed_ctx is ctx
+
+ async def test_completion_response_stored_in_returned_context(self):
+ ctx = self._make_context()
+ result = await self.client._execute_agent_turn(ctx, iteration=1)
+ assert result.completion_response == self.agent_response
+
+ async def test_judge_scores_stored_in_returned_context(self):
+ ctx = self._make_context()
+ result = await self.client._execute_agent_turn(ctx, iteration=1)
+ assert "accuracy" in result.scores
+ assert result.scores["accuracy"].score == 1.0
+
+ async def test_variables_interpolated_into_agent_config_instructions(self):
+ ctx = self._make_context()
+ await self.client._execute_agent_turn(ctx, iteration=1)
+ _, config, _, _ = self.handle_agent_call.call_args.args
+ assert "{{language}}" not in config.instructions
+ assert "English" in config.instructions
+
+ async def test_raises_on_agent_call_failure(self):
+ self.handle_agent_call.side_effect = RuntimeError("LLM unavailable")
+ ctx = self._make_context()
+ with pytest.raises(RuntimeError, match="LLM unavailable"):
+ await self.client._execute_agent_turn(ctx, iteration=1)
+
+
+# ---------------------------------------------------------------------------
+# _generate_new_variation
+# ---------------------------------------------------------------------------
+
+
+class TestGenerateNewVariation:
+ def setup_method(self):
+ self.handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initial_instructions = AGENT_INSTRUCTIONS
+ self.client._initialize_class_members_from_config(agent_config)
+ self.client._options = _make_options(handle_agent_call=self.handle_agent_call)
+
+ async def test_updates_current_instructions(self):
+ await self.client._generate_new_variation(iteration=1, variables={"language": "English"})
+ assert self.client._current_instructions == "You are an improved assistant."
+
+ async def test_updates_current_parameters(self):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_parameters == {"temperature": 0.5}
+
+ async def test_updates_current_model(self):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_model == "gpt-4o"
+
+ async def test_no_structured_output_tool_in_variation_config(self):
+ """Variation turn must not inject the structured-output tool — prompts use plain JSON."""
+ await self.client._generate_new_variation(iteration=1, variables={})
+ _, config, _, _ = self.handle_agent_call.call_args.args
+ tools = config.model.get_parameter("tools") or []
+ assert tools == []
+
+    async def test_variation_call_uses_four_arg_signature(self):
+        """handle_agent_call receives exactly (key, config, context, is_evaluation) — no tools arg."""
+        await self.client._generate_new_variation(iteration=1, variables={})
+        assert len(self.handle_agent_call.call_args.args) == 4
+
+ async def test_model_not_updated_when_not_in_model_choices(self):
+ bad_response = json.dumps({
+ "current_instructions": "New instructions.",
+ "current_parameters": {},
+ "model": "some-unknown-model",
+ })
+ self.handle_agent_call.return_value = OptimizationResponse(output=bad_response)
+ original_model = self.client._current_model
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_model == original_model
+
+ async def test_retries_on_empty_response_and_succeeds(self):
+ """First attempt returns empty string; second returns valid JSON — succeeds."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output=""), # attempt 1: empty
+ OptimizationResponse(output=VARIATION_RESPONSE), # attempt 2: valid
+ ]
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_instructions == "You are an improved assistant."
+ assert self.handle_agent_call.call_count == 2
+
+ async def test_retries_on_unparseable_response_and_succeeds(self):
+ """First attempt returns non-JSON text; second returns valid JSON — succeeds."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output="Sorry, I cannot do that."), # attempt 1: not JSON
+ OptimizationResponse(output=VARIATION_RESPONSE), # attempt 2: valid
+ ]
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_instructions == "You are an improved assistant."
+ assert self.handle_agent_call.call_count == 2
+
+ async def test_raises_after_max_retries_exhausted(self):
+ """All three attempts return empty strings — ValueError is raised."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output=""),
+ OptimizationResponse(output=""),
+ OptimizationResponse(output=""),
+ ]
+ with pytest.raises(ValueError, match="Failed to parse structured output"):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.handle_agent_call.call_count == 3
+
+
+# ---------------------------------------------------------------------------
+# Full optimization loop
+# ---------------------------------------------------------------------------
+
+
+class TestRunOptimization:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ async def test_succeeds_on_first_attempt_when_judge_passes(self):
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="The capital of France is Paris."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 1.0
+ # 1 initial agent call + 1 validation sample (repeated draw — only 1 variable choice)
+ assert handle_agent_call.call_count == 2
+
+ async def test_generates_variation_when_judge_fails(self):
+ agent_responses = [
+ OptimizationResponse(output="Bad answer."),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="Better answer."),
+ OptimizationResponse(output="Better answer."), # 1 validation sample (repeated draw — only 1 variable choice)
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ judge_responses = [
+ OptimizationResponse(output=JUDGE_FAIL_RESPONSE),
+ OptimizationResponse(output=JUDGE_PASS_RESPONSE),
+ OptimizationResponse(output=JUDGE_PASS_RESPONSE),
+ ]
+ handle_judge_call = AsyncMock(side_effect=judge_responses)
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 1.0
+ # 1 agent + 1 variation + 1 agent + 1 validation sample
+ assert handle_agent_call.call_count == 4
+
+ async def test_returns_last_context_after_max_attempts(self):
+ # The max_attempts guard fires before variation on the final iteration,
+ # so only iterations 1 and 2 produce a variation call.
+ handle_agent_call = AsyncMock(side_effect=[
+ OptimizationResponse(output="Bad answer."), # iteration 1: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 1: variation
+ OptimizationResponse(output="Still bad."), # iteration 2: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 2: variation
+ OptimizationResponse(output="Still bad."), # iteration 3: agent (max_attempts reached — no variation)
+ ])
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 0.2
+
+ async def test_on_passing_result_called_on_success(self):
+ on_passing = MagicMock()
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Great answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ options.on_passing_result = on_passing
+ await client.optimize_from_options("test-agent", options)
+ on_passing.assert_called_once()
+
+ async def test_on_failing_result_called_on_max_attempts(self):
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(side_effect=[
+ OptimizationResponse(output="Bad."), # iteration 1: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 1: variation
+ OptimizationResponse(output="Still bad."), # iteration 2: agent (max_attempts reached — no variation)
+ ])
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=2,
+ )
+ options.on_failing_result = on_failing
+ await client.optimize_from_options("test-agent", options)
+ on_failing.assert_called_once()
+
+ async def test_on_turn_manual_path_success(self):
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=3,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ variable_choices=[{}],
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges={"j": OptimizationJudge(threshold=0.8, acceptance_statement="x")},
+ on_turn=lambda ctx: True,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.completion_response == "Answer."
+
+ async def test_success_result_carries_main_iteration_context_not_validation_context(self):
+ # The main iteration returns "Main answer." but the validation run returns
+ # "Validation answer.". The result should reflect the main iteration so that
+ # completion_response and user_input are consistent with what was POSTed to the API.
+ agent_responses = [
+ OptimizationResponse(output="Main answer."), # main iteration
+ OptimizationResponse(output="Validation answer."), # validation sample
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.completion_response == "Main answer."
+
+ async def test_status_update_callback_called_at_each_stage(self):
+ statuses = []
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Good answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ options.on_status_update = lambda status, ctx: statuses.append(status)
+ await client.optimize_from_options("test-agent", options)
+ assert "init" in statuses
+ assert "generating" in statuses
+ assert "evaluating" in statuses
+ assert "success" in statuses
+
+
+# ---------------------------------------------------------------------------
+# _compute_validation_count
+# ---------------------------------------------------------------------------
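+# The expected counts below are consistent with roughly pool_size // 4, clamped to the range [2, 5].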
+
+
+class TestComputeValidationCount:
+ def test_pool_of_10_returns_2(self):
+ assert _compute_validation_count(10) == 2
+
+ def test_pool_of_20_returns_5(self):
+ assert _compute_validation_count(20) == 5
+
+ def test_pool_of_16_returns_4(self):
+ assert _compute_validation_count(16) == 4
+
+ def test_small_pool_floors_at_2(self):
+ assert _compute_validation_count(1) == 2
+ assert _compute_validation_count(3) == 2
+
+ def test_large_pool_caps_at_5(self):
+ assert _compute_validation_count(100) == 5
+
+ def test_pool_of_8_returns_2(self):
+ assert _compute_validation_count(8) == 2
+
+
+# ---------------------------------------------------------------------------
+# Validation phase (chaos mode)
+# ---------------------------------------------------------------------------
+
+# Helper: build OptimizationOptions with multiple variable choices so the
+# validation phase has a non-empty distinct pool to sample from.
+def _make_multi_options(
+ *,
+ variable_count: int = 8,
+ user_input_options=None,
+ on_turn=None,
+ handle_agent_call=None,
+ handle_judge_call=None,
+ on_passing_result=None,
+ max_attempts: int = 5,
+) -> OptimizationOptions:
+ if handle_agent_call is None:
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="answer"))
+ if handle_judge_call is None:
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ judges = None if on_turn is not None else {
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ }
+ return OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=max_attempts,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ variable_choices=[{"x": i} for i in range(variable_count)],
+ user_input_options=user_input_options,
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges=judges,
+ on_turn=on_turn,
+ on_passing_result=on_passing_result,
+ )
+
+
+class TestValidationPhase:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
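+    # Wraps the module-level _make_client helper with this class's mocked LDAI client.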
+ def _make_client(self) -> OptimizationClient:
+ return _make_client(self.mock_ldai)
+
+ async def test_on_passing_result_fires_only_after_all_validation_passes(self):
+ """on_passing_result must not fire until all validation samples pass."""
+ on_passing = MagicMock()
+ client = self._make_client()
+ # 8 variable_choices → validation_count = 2; all judges always pass
+ opts = _make_multi_options(on_passing_result=on_passing)
+ await client.optimize_from_options("test-agent", opts)
+ on_passing.assert_called_once()
+
+ async def test_validation_runs_additional_agent_calls(self):
+ """With 8 variable choices, validation runs 2 extra agent calls after the initial pass."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=counting_agent)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial pass + 2 validation samples
+ assert call_count[0] == 3
+
+ async def test_validation_failure_suppresses_on_passing_result_then_retries(self):
+ """When a validation sample fails, on_passing_result is not fired and the loop retries."""
+ turn_calls = [0]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # call 1: initial pass, call 2: first validation FAIL, everything else passes
+ return turn_calls[0] != 2
+
+ on_passing = MagicMock()
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ # 8 items → validation_count = 2
+ variable_count=8,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="iter1"), # initial turn (passes)
+ OptimizationResponse(output="val_iter2"), # validation sample 1 (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="iter3"), # new attempt initial (passes)
+ OptimizationResponse(output="val_iter4"), # new validation sample 1 (passes)
+ OptimizationResponse(output="val_iter5"), # new validation sample 2 (passes)
+ ]),
+ on_passing_result=on_passing,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", opts)
+ # Eventually succeeds after one failed validation cycle
+ on_passing.assert_called_once()
+ assert result is not None
+
+ async def test_validation_does_not_reuse_passing_turn_variable(self):
+ """The variable set used in the initial passing turn must not appear in validation."""
+ seen_variables = []
+
+ async def capture_agent(key, config, ctx, is_evaluation=False):
+ seen_variables.append(ctx.current_variables)
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=capture_agent, variable_count=8)
+ await client.optimize_from_options("test-agent", opts)
+
+ # First call is the initial passing turn
+ initial_vars = seen_variables[0]
+ # Remaining calls are validation samples — none should match the initial
+ for val_vars in seen_variables[1:]:
+ assert val_vars != initial_vars, (
+ f"Validation reused the passing turn's variables: {initial_vars}"
+ )
+
+ async def test_validation_uses_user_input_options_as_pool_when_provided(self):
+ """When user_input_options is provided, validation samples from that pool."""
+ seen_inputs = []
+
+ async def capture_agent(key, config, ctx, is_evaluation=False):
+ seen_inputs.append(ctx.user_input)
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ user_inputs = [f"question {i}" for i in range(8)]
+ opts = _make_multi_options(
+ handle_agent_call=capture_agent,
+ user_input_options=user_inputs,
+ )
+ await client.optimize_from_options("test-agent", opts)
+
+ # Initial input is at index 0; all validation inputs must be different
+ initial_input = seen_inputs[0]
+ for val_input in seen_inputs[1:]:
+ assert val_input != initial_input, (
+ f"Validation reused the passing turn's user_input: {initial_input}"
+ )
+
+ async def test_pool_exhaustion_caps_validation_at_available_distinct_items(self):
+ """When fewer distinct items remain than validation_count, all available ones are used."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ # 3 variable choices → _compute_validation_count(3) = 2, but only 2 remain after
+ # excluding the passing item, so validation_count is still 2 (min of 2 and 2)
+ opts = _make_multi_options(handle_agent_call=counting_agent, variable_count=3)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial + 2 validation (uses all remaining distinct items)
+ assert call_count[0] == 3
+
+ async def test_single_variable_choice_falls_back_to_repeated_draw(self):
+ """With only 1 variable choice validation still runs 1 sample (repeated draw)."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=counting_agent, variable_count=1)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial pass + 1 validation sample (repeated draw from the only item)
+ assert call_count[0] == 2
+
+ async def test_validation_does_not_consume_attempt_budget(self):
+ """Validation samples must not count against max_attempts.
+
+ With max_attempts=2 and 8 variable choices (validation_count=2), a failed
+ validation on attempt 1 should still leave a full attempt 2 available.
+ Without the fix, iteration would be inflated to 3 after validation, which
+ exceeds max_attempts=2 and would trigger _handle_failure prematurely.
+ """
+ turn_calls = [0]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # attempt 1 passes initial, validation sample 1 fails
+ # attempt 2 passes initial and all validation
+ return turn_calls[0] != 2
+
+ on_passing = MagicMock()
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ variable_count=8,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="iter1"), # attempt 1 initial (passes)
+ OptimizationResponse(output="val_iter"), # validation sample 1 (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="iter2"), # attempt 2 initial (passes)
+ OptimizationResponse(output="val_iter3"), # validation sample 1 (passes)
+ OptimizationResponse(output="val_iter4"), # validation sample 2 (passes)
+ ]),
+ on_passing_result=on_passing,
+ max_attempts=2,
+ )
+ result = await client.optimize_from_options("test-agent", opts)
+ on_passing.assert_called_once()
+ assert result is not None
+
+ async def test_validating_status_emitted(self):
+ """The 'validating' status must be emitted when entering the validation phase."""
+ statuses = []
+ client = self._make_client()
+ opts = _make_multi_options()
+ opts.on_status_update = lambda s, ctx: statuses.append(s)
+ await client.optimize_from_options("test-agent", opts)
+ assert "validating" in statuses
+
+ async def test_turn_completed_after_validation_failure_uses_main_iteration_context(self):
+ """When validation fails, the 'turn completed' event must carry the MAIN iteration's
+ user_input and completion_response — not the failing validation sample's values.
+
+ Regression test for the mismatch where a record stored userInput='hostel near paris'
+        but completionResponse described 'airbnbs near tahoe' (from a validation run with a
+ different user_input that was folded back onto the main iteration's API record).
+ """
+ turn_calls = [0]
+ status_events: list = []
+
+ user_inputs = [f"query-{i}" for i in range(8)]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # Call 1: main iteration passes. Call 2: first validation sample FAILS.
+ # Call 3+: everything passes (new attempt succeeds).
+ return turn_calls[0] != 2
+
+ def capture_status(status, ctx):
+ status_events.append((status, ctx.user_input, ctx.completion_response))
+
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ variable_count=8,
+ user_input_options=user_inputs,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="main-response"), # main turn (passes)
+ OptimizationResponse(output="val-response"), # validation sample (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="main-response-2"), # 2nd attempt main (passes)
+ OptimizationResponse(output="val-response-2"), # 2nd attempt validation (passes)
+ OptimizationResponse(output="val-response-3"), # 2nd attempt validation (passes)
+ ]),
+ max_attempts=3,
+ )
+ opts.on_status_update = capture_status
+ await client.optimize_from_options("test-agent", opts)
+
+ # The 'generating' event captures the main iteration's user_input.
+ # The validation run fires 'generating' as well, but with a different user_input.
+ # The first 'generating' is always the main iteration.
+ generating_events = [(u, r) for s, u, r in status_events if s == "generating"]
+ main_user_input = generating_events[0][0]
+
+ # Find the 'turn completed' event from the first attempt (after validation failure)
+ tc_events = [(u, r) for s, u, r in status_events if s == "turn completed"]
+ assert len(tc_events) >= 1, "Expected at least one 'turn completed' event"
+
+ tc_user_input, tc_completion = tc_events[0]
+ # turn completed must use the MAIN iteration's data, not the validation sample's.
+ # If the bug is present, tc_completion would be "val-response" and tc_user_input
+ # would be the validation sample's query (different from main_user_input).
+ assert tc_completion == "main-response", (
+ f"turn completed should carry the main iteration's completion_response "
+ f"('main-response'), not the validation run's (got: {tc_completion!r})"
+ )
+ assert tc_user_input == main_user_input, (
+ f"turn completed should carry the main iteration's user_input "
+ f"('{main_user_input}'), not the validation run's (got: {tc_user_input!r})"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — acceptance criteria section
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptAcceptanceCriteria:
+ def test_includes_acceptance_statement_in_section(self):
+ judges = {
+ "quality": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses must be concise and factual.",
+ )
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert "Responses must be concise and factual." in section
+ assert "quality" in section
+
+ def test_labels_all_judges(self):
+ judges = {
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="Must be brief."),
+ "b": OptimizationJudge(threshold=0.9, acceptance_statement="Must cite sources."),
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert "[a]" in section
+ assert "[b]" in section
+ assert "Must be brief." in section
+ assert "Must cite sources." in section
+
+ def test_returns_empty_string_when_no_acceptance_statements(self):
+ judges = {
+ "ld-judge": OptimizationJudge(threshold=0.8, judge_key="some-ld-key"),
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert section == ""
+
+ def test_returns_empty_string_with_no_judges(self):
+ section = variation_prompt_acceptance_criteria(None)
+ assert section == ""
+
+ def test_section_appears_in_full_prompt(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Facts only.",
+ )
+ }
+ options = _make_options(judges=judges)
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=judges,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=options.model_choices,
+ variable_choices=options.variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "Facts only." in prompt
+ assert "ACCEPTANCE CRITERIA" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — overfitting warning section
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptOverfitWarning:
+ def _make_ctx(self, user_input=None, variables=None, iteration=1):
+ return OptimizationContext(
+ iteration=iteration,
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ current_model="gpt-4o",
+ current_variables=variables or {},
+ user_input=user_input,
+ completion_response=None,
+ scores={},
+ )
+
+ def test_returns_empty_string_with_no_history(self):
+ assert variation_prompt_overfit_warning([]) == ""
+
+ def test_contains_general_overfitting_reminder(self):
+ ctx = self._make_ctx(user_input="What is 2+2?")
+ section = variation_prompt_overfit_warning([ctx])
+ assert "OVERFITTING" in section.upper()
+ assert "generalise" in section.lower() or "generalize" in section.lower() or "generaliz" in section.lower() or "general" in section.lower()
+
+ def test_includes_recent_user_input(self):
+ ctx = self._make_ctx(user_input="What is the capital of France?")
+ section = variation_prompt_overfit_warning([ctx])
+ assert "What is the capital of France?" in section
+
+ def test_includes_recent_variables_as_structured_breakdown(self):
+ ctx = self._make_ctx(variables={"language": "English", "tone": "formal"})
+ section = variation_prompt_overfit_warning([ctx])
+ # Keys (placeholder names) and values must both appear
+ assert "{{language}}" in section
+ assert '"English"' in section
+ assert "{{tone}}" in section
+ assert '"formal"' in section
+
+ def test_variables_section_labels_name_vs_value(self):
+ ctx = self._make_ctx(variables={"user_id": "user-125"})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "{{user_id}}" in section
+ assert '"user-125"' in section
+ assert "placeholder" in section.lower()
+ assert "value" in section.lower()
+ # Must NOT render as a raw Python dict
+ assert "{'user_id': 'user-125'}" not in section
+
+ def test_uses_most_recent_history_entry(self):
+ ctx_old = self._make_ctx(user_input="old question", iteration=1)
+ ctx_new = self._make_ctx(user_input="new question", iteration=2)
+ section = variation_prompt_overfit_warning([ctx_old, ctx_new])
+ assert "new question" in section
+ assert "old question" not in section
+
+ def test_omits_user_input_line_when_none(self):
+ ctx = self._make_ctx(user_input=None, variables={"lang": "en"})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "User input" not in section
+ assert "lang" in section
+
+ def test_omits_variables_line_when_empty(self):
+ ctx = self._make_ctx(user_input="hello", variables={})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "Variables" not in section
+ assert "hello" in section
+
+ def test_warning_appears_in_full_prompt_when_history_present(self):
+ ctx = self._make_ctx(user_input="test question", variables={"k": "v"})
+ prompt = build_new_variation_prompt(
+ history=[ctx],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=[{"k": "v"}],
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "OVERFITTING" in prompt.upper()
+ assert "test question" in prompt
+
+ def test_warning_absent_from_full_prompt_when_no_history(self):
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=[{"k": "v"}],
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "OVERFITTING" not in prompt.upper()
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — preamble key-vs-value note
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptPreamble:
+ def test_contains_key_vs_value_important_note(self):
+ preamble = variation_prompt_preamble()
+ assert "IMPORTANT" in preamble
+ assert "placeholder" in preamble.lower()
+ assert "value" in preamble.lower()
+
+ def test_never_use_value_as_placeholder_name(self):
+ preamble = variation_prompt_preamble()
+ assert "never" in preamble.lower()
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — placeholder table
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptPlaceholderTable:
+ _variable_choices = [
+ {"user_id": "user-123", "trip_purpose": "business"},
+ {"user_id": "user-125", "trip_purpose": "personal"},
+ ]
+
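+    # Renders just the improvement-instructions prompt section for the given variable choices and history.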
+ def _section(self, variable_choices=None, history=None):
+ return variation_prompt_improvement_instructions(
+ history=history or [],
+ model_choices=["gpt-4o"],
+ variable_choices=variable_choices or self._variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+
+ def test_placeholder_names_appear_in_table(self):
+ section = self._section()
+ assert "{{user_id}}" in section
+ assert "{{trip_purpose}}" in section
+
+ def test_example_values_appear_alongside_keys(self):
+ section = self._section()
+ assert 'user-123' in section or 'user-125' in section
+ assert 'business' in section or 'personal' in section
+
+ def test_keys_and_values_clearly_separated(self):
+ section = self._section()
+ assert "example values" in section.lower()
+
+ def test_bad_good_counterexamples_use_actual_values(self):
+ section = self._section()
+ # The bad example must reference a runtime value, good example the key
+ assert "BAD" in section
+ assert "GOOD" in section
+ # At least one of the real values should appear in the bad example
+ assert "user-123" in section or "user-125" in section \
+ or "business" in section or "personal" in section
+
+ def test_raw_placeholder_list_not_used(self):
+ # The old format was a comma-separated list like "{{trip_purpose}}, {{user_id}}"
+ # The new format is a structured table; confirm no bare comma-list
+ section = self._section()
+ assert "{{trip_purpose}}, {{user_id}}" not in section
+ assert "{{user_id}}, {{trip_purpose}}" not in section
+
+ def test_single_variable_choice(self):
+ section = self._section(variable_choices=[{"lang": "en"}])
+ assert "{{lang}}" in section
+ assert 'en' in section
+
+ def test_table_appears_in_full_prompt(self):
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=self._variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "{{user_id}}" in prompt
+ assert "{{trip_purpose}}" in prompt
+ assert "example values" in prompt.lower()
+
+
+# ---------------------------------------------------------------------------
+# interpolate_variables — hyphenated key support
+# ---------------------------------------------------------------------------
+
+
+class TestInterpolateVariables:
+ def test_substitutes_standard_underscore_key(self):
+ result = interpolate_variables("Hello {{user_id}}", {"user_id": "abc"})
+ assert result == "Hello abc"
+
+ def test_substitutes_hyphenated_key(self):
+ result = interpolate_variables("Hello {{user-id}}", {"user-id": "abc"})
+ assert result == "Hello abc"
+
+ def test_leaves_unknown_placeholder_unchanged(self):
+ result = interpolate_variables("Hello {{unknown}}", {"user_id": "abc"})
+ assert result == "Hello {{unknown}}"
+
+ def test_leaves_unknown_hyphenated_placeholder_unchanged(self):
+ result = interpolate_variables("Hello {{bad-125}}", {"user_id": "abc"})
+ assert result == "Hello {{bad-125}}"
+
+ def test_mixed_keys_in_same_string(self):
+ result = interpolate_variables(
+ "{{user-id}} and {{trip_purpose}}",
+ {"user-id": "u-1", "trip_purpose": "leisure"},
+ )
+ assert result == "u-1 and leisure"
+
+ def test_empty_variables_leaves_text_unchanged(self):
+ result = interpolate_variables("{{foo}} bar", {})
+ assert result == "{{foo}} bar"
+
+
+# ---------------------------------------------------------------------------
+# restore_variable_placeholders
+# ---------------------------------------------------------------------------
+
+
+class TestRestoreVariablePlaceholders:
+ _CHOICES = [{"user_id": "user-123", "trip_purpose": "business"}]
+
+ def test_replaces_hardcoded_id_value(self):
+ text = "Use the user ID user-123 to look up preferences."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "{{user_id}}" in result
+ assert "user-123" not in result
+ assert len(warnings) == 1
+ assert "user-123" in warnings[0]
+ assert "{{user_id}}" in warnings[0]
+
+ def test_replaces_multiline_value_verbatim(self):
+ multiline_value = "line one\nline two\nline three"
+ choices = [{"body_text": multiline_value}]
+ text = f"Instructions:\n{multiline_value}\nEnd."
+ result, warnings = restore_variable_placeholders(text, choices)
+ assert "{{body_text}}" in result
+ assert multiline_value not in result
+ assert len(warnings) == 1
+
+ def test_skips_value_shorter_than_min_length(self):
+ choices = [{"lang": "en"}] # "en" is only 2 chars
+ text = "Use language en for this request."
+ result, warnings = restore_variable_placeholders(text, choices, min_value_length=3)
+ assert result == text
+ assert warnings == []
+
+ def test_does_not_partially_match_longer_token(self):
+ """'user-123' must not be replaced inside 'user-1234'."""
+ text = "Contact user-1234 for help."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "user-1234" in result
+ assert warnings == []
+
+ def test_replaces_multiple_variables(self):
+ text = "User user-123 is on a business trip."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "{{user_id}}" in result
+ assert "{{trip_purpose}}" in result
+ assert "user-123" not in result
+ assert "business" not in result
+ assert len(warnings) == 2
+
+ def test_leaves_correct_placeholder_unchanged(self):
+ text = "User {{user_id}} is on a {{trip_purpose}} trip."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == text
+ assert warnings == []
+
+ def test_replaces_multiple_occurrences_of_same_value(self):
+ text = "user-123 and user-123 are duplicates."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == "{{user_id}} and {{user_id}} are duplicates."
+ assert "2 occurrence(s)" in warnings[0]
+
+ def test_longer_value_replaced_before_shorter_substring(self):
+ """When one value is a prefix of another, the longer one is replaced first."""
+ choices = [{"full_id": "user-123-admin", "short_id": "user-123"}]
+ text = "Admin is user-123-admin, regular is user-123."
+ result, warnings = restore_variable_placeholders(text, choices)
+ assert "{{full_id}}" in result
+ assert "{{short_id}}" in result
+ assert "user-123-admin" not in result
+ # The shorter value should not have corrupted the longer replacement
+ assert result.count("{{full_id}}") == 1
+ assert result.count("{{short_id}}") == 1
+
+ def test_replaces_brace_wrapped_value_without_double_bracketing(self):
+ """{{user-125}} must become {{user_id}}, not {{{{user_id}}}}."""
+ text = "Fetch preferences for user {{user-123}}."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == "Fetch preferences for user {{user_id}}."
+ assert len(warnings) == 1
+
+ def test_empty_variable_choices_returns_text_unchanged(self):
+ text = "Some instructions here."
+ result, warnings = restore_variable_placeholders(text, [])
+ assert result == text
+ assert warnings == []
+
+ def test_warning_message_format(self):
+ text = "Handle user user-123 carefully."
+ _, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert any("user-123" in w for w in warnings)
+ assert any("{{user_id}}" in w for w in warnings)
+
+ async def test_apply_variation_response_calls_restore_and_logs_warning(self):
+ """_apply_new_variation_response must restore leaked values and log warnings."""
+ leaked_instructions = "You serve user user-123 on a business trip."
+ variation_response = json.dumps({
+ "current_instructions": leaked_instructions,
+ "current_parameters": {},
+ "model": "gpt-4o",
+ })
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=variation_response))
+ client = _make_client()
+ agent_config = _make_agent_config()
+ client._agent_key = "test-agent"
+ client._agent_config = agent_config
+ client._initial_instructions = AGENT_INSTRUCTIONS
+ client._initialize_class_members_from_config(agent_config)
+ client._options = _make_options(
+ handle_agent_call=handle_agent_call,
+ variable_choices=[{"user_id": "user-123", "trip_purpose": "business"}],
+ )
+
+ with patch("ldai_optimizer.client.logger") as mock_logger:
+ await client._generate_new_variation(iteration=1, variables={})
+ warning_calls = [
+ call for call in mock_logger.warning.call_args_list
+ if "user-123" in str(call) or "business" in str(call)
+ ]
+ assert len(warning_calls) >= 1
+
+ assert "{{user_id}}" in client._current_instructions
+ assert "user-123" not in client._current_instructions
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config helpers
+# ---------------------------------------------------------------------------
+
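+# Representative optimization config payload, shaped like the camelCase JSON returned by the LaunchDarkly REST API.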
+_API_CONFIG: Dict[str, Any] = {
+ "id": "opt-uuid-123",
+ "key": "my-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o", "gpt-4o-mini"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"language": "English"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?"],
+ "version": 2,
+ "createdAt": 1700000000,
+}
+
+
+def _make_from_config_options(**overrides: Any) -> OptimizationFromConfigOptions:
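+    """Build OptimizationFromConfigOptions with passing agent/judge mocks; override fields per test."""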
+ defaults: Dict[str, Any] = dict(
+ project_key="my-project",
+ context_choices=[LD_CONTEXT],
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="The answer is 4.")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ )
+ defaults.update(overrides)
+ return OptimizationFromConfigOptions(**defaults)
+
+
+def _make_mock_api_client() -> MagicMock:
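+    """Mock LDApiClient whose POST returns a fixed result id ("result-uuid-789") for PATCH assertions."""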
+ mock = MagicMock()
+ mock.post_agent_optimization_result = MagicMock(return_value="result-uuid-789")
+ mock.patch_agent_optimization_result = MagicMock()
+ mock.get_model_configs = MagicMock(return_value=[])
+ return mock
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config
+# ---------------------------------------------------------------------------
+
+
+class TestBuildOptionsFromConfig:
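+    """_build_options_from_config maps the API config payload onto OptimizationOptions and wires result persistence."""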
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "my-agent"
+ self.client._initialize_class_members_from_config(_make_agent_config())
+ self.client._options = _make_options()
+ self.api_client = _make_mock_api_client()
+
+ def _build(self, config=None, options=None) -> OptimizationOptions:
+ return self.client._build_options_from_config(
+ config or dict(_API_CONFIG),
+ options or _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=[],
+ )
+
+ def test_acceptance_statements_mapped_to_judges(self):
+ result = self._build()
+ assert "acceptance-statement-0" in result.judges
+ judge = result.judges["acceptance-statement-0"]
+ assert judge.acceptance_statement == "Be accurate."
+ assert judge.threshold == 0.9
+
+ def test_multiple_acceptance_statements_get_indexed_keys(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[
+ {"statement": "First.", "threshold": 0.8},
+ {"statement": "Second.", "threshold": 0.7},
+ ])
+ result = self._build(config=config)
+ assert "acceptance-statement-0" in result.judges
+ assert "acceptance-statement-1" in result.judges
+ assert result.judges["acceptance-statement-0"].acceptance_statement == "First."
+ assert result.judges["acceptance-statement-1"].acceptance_statement == "Second."
+
+ def test_judges_mapped_by_key(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[
+ {"key": "accuracy", "threshold": 0.85},
+ ])
+ result = self._build(config=config)
+ assert "accuracy" in result.judges
+ judge = result.judges["accuracy"]
+ assert judge.judge_key == "accuracy"
+ assert judge.threshold == 0.85
+
+ def test_acceptance_statements_and_judges_merged(self):
+ config = dict(_API_CONFIG,
+ acceptanceStatements=[{"statement": "Be brief.", "threshold": 0.8}],
+ judges=[{"key": "accuracy", "threshold": 0.9}],
+ )
+ result = self._build(config=config)
+ assert "acceptance-statement-0" in result.judges
+ assert "accuracy" in result.judges
+
+ def test_raises_when_no_judges_no_ground_truth_no_on_turn(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[])
+ with pytest.raises(ValueError, match="no acceptance statements or judges"):
+ self._build(config=config)
+
+ def test_ground_truth_responses_alone_does_not_pass_no_criteria_check(self):
+ # groundTruthResponses is not yet implemented as standalone criteria;
+ # OptimizationOptions still requires judges or on_turn.
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[], groundTruthResponses=["4"])
+        with pytest.raises(Exception):
+ self._build(config=config)
+
+ def test_on_turn_satisfies_no_judges_requirement(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[])
+ options = _make_from_config_options(on_turn=lambda ctx: True)
+ result = self._build(config=config, options=options)
+ assert result.on_turn is not None
+
+ def test_empty_variable_choices_defaults_to_single_empty_dict(self):
+ config = dict(_API_CONFIG, variableChoices=[])
+ result = self._build(config=config)
+ assert result.variable_choices == [{}]
+
+ def test_non_empty_variable_choices_passed_through(self):
+ result = self._build()
+ assert result.variable_choices == [{"language": "English"}]
+
+ def test_empty_user_input_options_becomes_none(self):
+ config = dict(_API_CONFIG, userInputOptions=[])
+ result = self._build(config=config)
+ assert result.user_input_options is None
+
+ def test_non_empty_user_input_options_passed_through(self):
+ result = self._build()
+ assert result.user_input_options == ["What is 2+2?"]
+
+ def test_max_attempts_from_config(self):
+ result = self._build()
+ assert result.max_attempts == 3
+
+ def test_model_choices_provider_prefix_stripped(self):
+ config = dict(_API_CONFIG, modelChoices=["OpenAI.gpt-4o", "Anthropic.claude-opus-4-5"])
+ result = self._build(config=config)
+ assert result.model_choices == ["gpt-4o", "claude-opus-4-5"]
+
+ def test_judge_model_provider_prefix_stripped(self):
+ config = dict(_API_CONFIG, judgeModel="OpenAI.gpt-4o")
+ result = self._build(config=config)
+ assert result.judge_model == "gpt-4o"
+
+ def test_model_choices_without_prefix_unchanged(self):
+ result = self._build()
+ assert result.model_choices == ["gpt-4o", "gpt-4o-mini"]
+
+ def test_judge_model_without_prefix_unchanged(self):
+ result = self._build()
+ assert result.judge_model == "gpt-4o"
+
+ def test_model_with_multiple_dots_only_prefix_stripped(self):
+ config = dict(_API_CONFIG, judgeModel="Anthropic.claude-opus-4.6")
+ result = self._build(config=config)
+ assert result.judge_model == "claude-opus-4.6"
+
+ def test_callbacks_forwarded_from_options(self):
+ handle_agent = AsyncMock(return_value=OptimizationResponse(output="ok"))
+ handle_judge = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ options = _make_from_config_options(
+ handle_agent_call=handle_agent,
+ handle_judge_call=handle_judge,
+ on_passing_result=MagicMock(),
+ on_failing_result=MagicMock(),
+ )
+ result = self._build(options=options)
+ assert result.handle_agent_call is handle_agent
+ assert result.handle_judge_call is handle_judge
+ assert result.on_passing_result is options.on_passing_result
+ assert result.on_failing_result is options.on_failing_result
+
+ def test_persist_and_forward_posts_result_on_status_update(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={},
+ completion_response="The answer is 4.",
+ current_instructions="Be helpful.",
+ current_parameters={"temperature": 0.7},
+ current_variables={"language": "English"},
+ current_model="gpt-4o",
+ user_input="What is 2+2?",
+ iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ self.api_client.post_agent_optimization_result.assert_called_once()
+ call_args = self.api_client.post_agent_optimization_result.call_args
+ assert call_args[0][0] == "my-project"
+ assert call_args[0][1] == "opt-key-123"
+
+ def test_persist_and_forward_payload_has_correct_field_names(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"j": JudgeResult(score=0.9, rationale="Good.")},
+ completion_response="Paris.",
+ current_instructions="Be helpful.",
+ current_parameters={"temperature": 0.5},
+ current_variables={},
+ current_model="gpt-4o",
+ user_input="Capital of France?",
+ iteration=2,
+ )
+ result.on_status_update("evaluating", ctx)
+ # POST payload contains the camelCase iteration-level fields
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["instructions"] == "Be helpful."
+ assert post_payload["parameters"] == {"temperature": 0.5}
+ assert post_payload["userInput"] == "Capital of France?"
+ assert post_payload["iteration"] == 2
+ # Telemetry and scores are in the PATCH payload
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["completionResponse"] == "Paris."
+ assert "j" in patch_payload["scores"]
+
+ def test_persist_and_forward_scores_include_threshold_for_known_judges(self):
+ # Build with a config that has a known acceptance-statement judge (threshold=0.9)
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"acceptance-statement-0": JudgeResult(score=0.85, rationale="Close.")},
+ completion_response="An answer.",
+ current_instructions="Be helpful.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+ result.on_status_update("evaluating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ score_entry = patch_payload["scores"]["acceptance-statement-0"]
+ assert score_entry["score"] == 0.85
+ assert score_entry["rationale"] == "Close."
+ assert score_entry["threshold"] == 0.9
+
+ def test_persist_and_forward_scores_omit_threshold_for_unknown_judge_key(self):
+ # A score whose key doesn't match any configured judge should not include threshold
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"unknown-judge": JudgeResult(score=0.5, rationale="Unknown.")},
+ completion_response="Answer.",
+ current_instructions="",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+ result.on_status_update("evaluating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ score_entry = patch_payload["scores"]["unknown-judge"]
+ assert score_entry["score"] == 0.5
+ assert "threshold" not in score_entry
+
+ def test_persist_and_forward_includes_run_id_and_version(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["runId"] == "run-uuid-456"
+ assert post_payload["agentOptimizationVersion"] == 2
+
+ def test_second_call_same_iteration_does_not_post_again(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ result.on_status_update("evaluating", ctx)
+ # POST is called only once (first encounter of iteration 1)
+ assert self.api_client.post_agent_optimization_result.call_count == 1
+ # PATCH is called twice
+ assert self.api_client.patch_agent_optimization_result.call_count == 2
+
+ def test_each_new_iteration_posts_a_new_record(self):
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1)
+ result.on_status_update("generating", ctx2)
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+
+ @pytest.mark.parametrize("sdk_status,expected_status,expected_activity", [
+ ("init", "RUNNING", "PENDING"),
+ ("generating", "RUNNING", "GENERATING"),
+ ("evaluating", "RUNNING", "EVALUATING"),
+ ("generating variation", "RUNNING", "GENERATING_VARIATION"),
+ ("validating", "RUNNING", "EVALUATING"),
+ ("turn completed", "RUNNING", "COMPLETED"),
+ ("success", "PASSED", "COMPLETED"),
+ ("failure", "FAILED", "COMPLETED"),
+ ])
+ def test_status_mapping(self, sdk_status, expected_status, expected_activity):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update(sdk_status, ctx)
+ # status and activity are in the PATCH payload, not the POST payload
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["status"] == expected_status
+ assert patch_payload["activity"] == expected_activity
+
+ def test_user_on_status_update_chained_after_post_and_patch(self):
+ call_order = []
+ self.api_client.post_agent_optimization_result.side_effect = (
+ lambda *a, **kw: call_order.append("post") or "result-id"
+ )
+ self.api_client.patch_agent_optimization_result.side_effect = (
+ lambda *a, **kw: call_order.append("patch")
+ )
+ user_cb = MagicMock(side_effect=lambda s, c: call_order.append("user"))
+ options = _make_from_config_options(on_status_update=user_cb)
+ result = self._build(options=options)
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ assert call_order == ["post", "patch", "user"]
+
+ def test_user_on_status_update_exception_does_not_propagate(self):
+ options = _make_from_config_options(
+ on_status_update=MagicMock(side_effect=RuntimeError("cb boom"))
+ )
+ result = self._build(options=options)
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx) # must not raise
+
+ def test_post_payload_does_not_contain_history(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert "history" not in post_payload
+
+ @pytest.mark.parametrize("status", [
+ "init", "generating", "evaluating", "generating variation",
+ "validating", "turn completed", "success", "failure",
+ ])
+ def test_variation_included_in_patch_for_all_statuses(self, status):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={},
+ completion_response="answer",
+ current_instructions="Be concise.",
+ current_parameters={"temperature": 0.3},
+ current_variables={},
+ current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert "variation" in patch_payload
+ assert patch_payload["variation"]["instructions"] == "Be concise."
+ assert patch_payload["variation"]["parameters"] == {"temperature": 0.3}
+
+ @pytest.mark.parametrize("status", ["generating", "evaluating", "success"])
+ def test_model_config_key_prefers_global_in_variation(self, status):
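+        """When two model configs share an id, the global entry's key is used as variation.modelConfigKey."""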
+ model_configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = self.client._build_options_from_config(
+ dict(_API_CONFIG),
+ _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=model_configs,
+ )
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="instr",
+ current_parameters={}, current_variables={}, current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["variation"]["modelConfigKey"] == "global.gpt-4o"
+
+ @pytest.mark.parametrize("status", ["generating", "evaluating", "success"])
+ def test_model_config_key_resolved_in_variation(self, status):
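+        """A model id resolves to the matching model config's key in the PATCH variation payload."""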
+ model_configs = [{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}]
+ result = self.client._build_options_from_config(
+ dict(_API_CONFIG),
+ _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=model_configs,
+ )
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="instr",
+ current_parameters={}, current_variables={}, current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["variation"]["modelConfigKey"] == "OpenAI.gpt-4o"
+
+ def test_generation_latency_cast_to_int(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, duration_ms=123.7,
+ iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["generationLatency"] == 123
+ assert isinstance(patch_payload["generationLatency"], int)
+
+ def test_last_optimization_result_id_updated_on_post(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ assert self.client._last_optimization_result_id == "result-uuid-789"
+
+ def test_validation_sub_iterations_do_not_create_new_records(self):
+ """Validation sub-iterations should be folded into the parent iteration's record."""
+ result = self._build()
+ ctx_main = OptimizationContext(
+ scores={}, completion_response="a", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx_val1 = OptimizationContext(
+ scores={}, completion_response="b", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx_val2 = OptimizationContext(
+ scores={}, completion_response="c", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=3,
+ )
+ result.on_status_update("generating", ctx_main) # POST iter 1
+ result.on_status_update("evaluating", ctx_main) # PATCH iter 1
+ result.on_status_update("validating", ctx_main) # enter validation; PATCH iter 1
+ result.on_status_update("generating", ctx_val1) # validation sub-iter → folded to iter 1
+ result.on_status_update("evaluating", ctx_val1) # folded to iter 1
+ result.on_status_update("generating", ctx_val2) # validation sub-iter → folded to iter 1
+ result.on_status_update("evaluating", ctx_val2) # folded to iter 1
+ result.on_status_update("success", ctx_val2) # folded to iter 1; reset validation
+
+ # Only one POST for the single main iteration
+ assert self.api_client.post_agent_optimization_result.call_count == 1
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["iteration"] == 1
+
+ def test_validation_success_patches_parent_iteration_record(self):
+ """success event during validation should PATCH the main iteration's record, not a new one."""
+ result = self._build()
+ ctx_main = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx_val = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=3,
+ )
+ result.on_status_update("generating", ctx_main)
+ result.on_status_update("validating", ctx_main)
+ result.on_status_update("generating", ctx_val)
+ result.on_status_update("success", ctx_val)
+
+ # PATCH for success should use the result_id of the parent (iter 2) record
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ success_patch = next(
+ c for c in patch_calls if c[0][3].get("status") == "PASSED"
+ )
+ # Third positional arg is result_id — it should be the one returned from the POST for iter 2
+ assert success_patch[0][2] == "result-uuid-789"
+
+ def test_validation_phase_resets_after_turn_completed(self):
+ """After turn completed, subsequent main-loop iterations create their own records."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx_val = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1) # POST iter 1
+ result.on_status_update("validating", ctx1) # enter validation
+ result.on_status_update("generating", ctx_val) # folded to iter 1
+ result.on_status_update("turn completed", ctx_val) # reset validation phase
+ result.on_status_update("generating", ctx2) # POST iter 2 (new main attempt)
+
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+
+ def test_init_iteration_closed_when_first_real_iteration_begins(self):
+ """The init record (iter 0) must receive a RUNNING:COMPLETED patch before iter 1 starts."""
+ result = self._build()
+ ctx0 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=0,
+ )
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("init", ctx0) # POST iter 0, PATCH RUNNING:PENDING
+ result.on_status_update("generating", ctx1) # should close iter 0, then POST iter 1
+
+ # iter 0 POSTed + iter 1 POSTed
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ # Patches: (1) init PENDING, (2) auto-close COMPLETED, (3) generating GENERATING
+ assert len(patch_calls) == 3
+ payloads = [c[0][3] for c in patch_calls]
+ assert payloads[0]["status"] == "RUNNING"
+ assert payloads[0]["activity"] == "PENDING"
+ assert "variation" in payloads[0]
+ assert payloads[1] == {"status": "RUNNING", "activity": "COMPLETED"} # auto-close patch has no variation
+ assert payloads[2]["status"] == "RUNNING"
+ assert payloads[2]["activity"] == "GENERATING"
+ assert "variation" in payloads[2]
+
+ def test_non_final_gt_sample_closed_when_next_sample_begins(self):
+ """In a GT batch, each sample except the last should receive a RUNNING:COMPLETED patch
+ when the next sample's generating event fires."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 2+2?", iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 3+3?", iteration=2,
+ )
+ ctx3 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 4+4?", iteration=3,
+ )
+ result.on_status_update("generating", ctx1) # POST iter 1
+ result.on_status_update("evaluating", ctx1) # PATCH iter 1 (EVALUATING)
+ result.on_status_update("generating", ctx2) # should auto-close iter 1, then POST iter 2
+ result.on_status_update("evaluating", ctx2) # PATCH iter 2 (EVALUATING)
+ result.on_status_update("generating", ctx3) # should auto-close iter 2, then POST iter 3
+
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ activities = [c[0][3].get("activity") for c in patch_calls]
+ # Expected sequence: GENERATING, EVALUATING, COMPLETED (auto-close 1),
+ # GENERATING, EVALUATING, COMPLETED (auto-close 2), GENERATING
+ assert activities.count("COMPLETED") >= 2, (
+ f"Expected at least 2 COMPLETED patches, got: {activities}"
+ )
+ # The auto-close patches must appear BEFORE the subsequent GENERATING patches
+ completed_indices = [i for i, a in enumerate(activities) if a == "COMPLETED"]
+ generating_indices = [i for i, a in enumerate(activities) if a == "GENERATING"]
+ # Each auto-close patch should precede the next generating patch
+ assert completed_indices[0] < generating_indices[1]
+ assert completed_indices[1] < generating_indices[2]
+
+ def test_terminal_event_clears_open_iteration_so_next_generating_does_not_double_close(self):
+ """After a terminal event (turn completed), the next generating should not try to
+ close the already-closed iteration again."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="answer", current_instructions="Be helpful.",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1) # open iter 1
+ result.on_status_update("turn completed", ctx1) # close iter 1 explicitly
+ result.on_status_update("generating", ctx2) # new iter — should NOT re-close iter 1
+
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ # The only RUNNING:COMPLETED patch should be from "turn completed", not from the
+ # auto-close triggered by iter 2's generating event.
+ completed_patches = [
+ c for c in patch_calls
+ if c[0][3].get("status") == "RUNNING" and c[0][3].get("activity") == "COMPLETED"
+ ]
+ assert len(completed_patches) == 1, (
+ "Expected exactly one RUNNING:COMPLETED patch (from turn completed), not a duplicate"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Token limiting
+# ---------------------------------------------------------------------------
+
+
+class TestTokenLimiting:
+ """Tests that the process halts and marks itself failed when token usage
+ meets or exceeds the configured token_limit."""
+
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ # -- chaos (optimize_from_options) -----------------------------------
+
+ async def test_chaos_stops_when_token_limit_exceeded_on_first_iteration(self):
+ """Token limit exceeded after the first agent turn should immediately fail."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=499, # limit is below the 500 tokens returned
+ max_attempts=5,
+ )
+        await client.optimize_from_options("test-agent", options)
+ # Should have called the agent exactly once then stopped
+ assert handle_agent_call.call_count == 1
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_does_not_stop_when_token_limit_not_exceeded(self):
+ """Process should continue normally when total tokens stay below limit."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=10000,
+ max_attempts=3,
+ )
+        await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is True
+
+ async def test_chaos_stops_when_limit_reached_exactly(self):
+ """gte logic: limit == total usage should trigger failure."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=500, # exactly equal — should trigger
+ max_attempts=5,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert handle_agent_call.call_count == 1
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_judge_tokens_accumulate_toward_limit(self):
+ """Judge token usage is included in the running total."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ # Judge response contributes 450 tokens — combined with agent's 100 → 550 > 200
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output=JUDGE_PASS_RESPONSE,
+ usage=TokenUsage(total=450, input=300, output=150),
+ )
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 100 (agent) + 450 (judge) = 550 > 200
+ max_attempts=5,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_no_limit_does_not_enforce_token_cap(self):
+ """When token_limit is None, no cap is applied regardless of usage."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=999999, input=500000, output=499999),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ # no token_limit set
+ max_attempts=3,
+ )
+        await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is True
+
+ async def test_chaos_on_failing_result_called_on_token_limit(self):
+ """on_failing_result callback is fired when token limit halts the run."""
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=100,
+ max_attempts=5,
+ on_failing_result=on_failing,
+ )
+ await client.optimize_from_options("test-agent", options)
+ on_failing.assert_called_once()
+
+ async def test_chaos_accumulates_across_multiple_iterations(self):
+ """Tokens from successive iterations add up until the limit is hit."""
+ # Each agent call returns 100 tokens; limit is 250, so it trips on the 3rd call
+ agent_responses = [
+ OptimizationResponse(output="Bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation (no usage)
+ OptimizationResponse(output="Still bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation (no usage)
+ OptimizationResponse(output="Still bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=250, # 100+100 = 200 ok; 200+100 = 300 ≥ 250 → stop on 3rd
+ max_attempts=10,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+ # 3 agent calls + 2 variation calls = 5 total; no more
+ assert handle_agent_call.call_count == 5
+
+ async def test_chaos_token_limit_in_validation_phase_stops_run(self):
+ """Exceeding the limit during the validation phase also halts the run."""
+ # The main iteration passes judges (50 tokens), but each validation call
+ # adds 300 tokens, pushing the total over 200.
+ agent_responses = [
+ OptimizationResponse(output="Good answer.", usage=TokenUsage(total=50, input=30, output=20)), # main turn
+ OptimizationResponse(output="Validation answer.", usage=TokenUsage(total=300, input=200, output=100)), # validation
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 50 (main) + 300 (validation) = 350 > 200
+ max_attempts=5,
+ variable_choices=[{"language": "English"}, {"language": "French"}],
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+
+ # -- ground truth (optimize_from_ground_truth_options) ---------------
+
+ async def test_gt_stops_when_token_limit_exceeded_on_first_sample(self):
+ """Token limit exceeded on the first sample should immediately fail the GT run."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=600, input=400, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=500, # 600 > 500 → trip on first sample
+ max_attempts=5,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is False
+ # Only one agent call should have happened
+ assert handle_agent_call.call_count == 1
+ # The offending context is still returned in the results list
+ assert len(results) == 1
+
+ async def test_gt_stops_mid_batch_when_limit_exceeded_on_second_sample(self):
+ """Token limit exceeded on the second of two samples stops after that sample."""
+ agent_responses = [
+ OptimizationResponse(output="Answer 1.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output="Answer 2.", usage=TokenUsage(total=200, input=120, output=80)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=250, # 100 + 200 = 300 ≥ 250 → trip on second sample
+ max_attempts=5,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is False
+ assert handle_agent_call.call_count == 2
+ # Both samples processed so far are in the results
+ assert len(results) == 2
+
+ async def test_gt_on_failing_result_called_on_token_limit(self):
+ """on_failing_result callback fires when GT run halts due to token limit."""
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=600, input=400, output=200),
+ )
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ token_limit=100,
+ max_attempts=5,
+ on_failing_result=on_failing,
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ on_failing.assert_called_once()
+
+ async def test_gt_no_limit_does_not_enforce_token_cap(self):
+ """When token_limit is None on GT options, no cap is applied."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=999999, input=500000, output=499999),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ # no token_limit
+ )
+        await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is True
+
+ async def test_gt_accumulates_tokens_across_samples_in_same_attempt(self):
+ """Tokens from all samples in the same attempt add up correctly."""
+ agent_responses = [
+ OptimizationResponse(output="Answer 1.", usage=TokenUsage(total=80, input=50, output=30)),
+ OptimizationResponse(output="Answer 2.", usage=TokenUsage(total=80, input=50, output=30)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 80 + 80 = 160 < 200, run should succeed
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is True
+ assert len(results) == 2
+
+ # -- _total_token_usage reset between runs ---------------------------
+
+ async def test_total_token_usage_resets_between_runs(self):
+ """_total_token_usage is reset at the start of each run so a reused
+ client does not carry over counts from previous optimizations."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+
+ # First run accumulates tokens
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=10000,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._total_token_usage > 0
+
+ # Second run starts fresh — use a tight limit that would fail if
+ # tokens from run 1 were carried over
+ handle_agent_call2 = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=50, input=30, output=20),
+ )
+ )
+ handle_judge_call2 = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ options2 = _make_options(
+ handle_agent_call=handle_agent_call2,
+ handle_judge_call=handle_judge_call2,
+ token_limit=10000,
+ )
+ await client.optimize_from_options("test-agent", options2)
+ assert client._last_run_succeeded is True
+
+
+# ---------------------------------------------------------------------------
+# optimize_from_config
+# ---------------------------------------------------------------------------
+
+
+class TestOptimizeFromConfig:
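+    """optimize_from_config fetches the optimization config from the REST API and runs the full loop."""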
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return _make_client(self.mock_ldai)
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {}, clear=True):
+ client = OptimizationClient(self.mock_ldai)
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_raises_without_api_key(self):
+ client = self._make_client_without_key()
+ options = _make_from_config_options()
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY is not set"):
+ await client.optimize_from_config("my-opt", options)
+
+ async def test_fetches_config_and_uses_ai_config_key(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ mock_api.get_agent_optimization.assert_called_once_with("my-project", "my-opt")
+ assert client._agent_key == "my-agent"
+
+ async def test_posts_result_on_each_status_event(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ assert mock_api.post_agent_optimization_result.call_count >= 1
+
+ async def test_user_on_status_update_called_during_run(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ statuses = []
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options(
+ on_status_update=lambda status, ctx: statuses.append(status)
+ )
+ await client.optimize_from_config("my-opt", options)
+
+ assert "generating" in statuses
+ assert "success" in statuses
+
+ async def test_custom_base_url_passed_to_api_client(self):
+ client = self._make_client_with_key()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ instance = _make_mock_api_client()
+ instance.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ MockLDApiClient.return_value = instance
+ options = _make_from_config_options(base_url="https://staging.launchdarkly.com")
+ await client.optimize_from_config("my-opt", options)
+
+ MockLDApiClient.assert_called_once_with(
+ "test-api-key", base_url="https://staging.launchdarkly.com"
+ )
+
+ async def test_no_base_url_does_not_pass_kwarg(self):
+ client = self._make_client_with_key()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ instance = _make_mock_api_client()
+ instance.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ MockLDApiClient.return_value = instance
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ MockLDApiClient.assert_called_once_with("test-api-key")
+
+ async def test_returns_optimization_context_on_success(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ result = await client.optimize_from_config("my-opt", options)
+
+ assert isinstance(result, OptimizationContext)
+ assert result.completion_response == "The answer is 4."
+
+
+# ---------------------------------------------------------------------------
+# GroundTruthSample / GroundTruthOptimizationOptions dataclass validation
+# ---------------------------------------------------------------------------
+
+
+class TestGroundTruthSampleDataclass:
+ def test_required_fields(self):
+ s = GroundTruthSample(user_input="hi", expected_response="hello")
+ assert s.user_input == "hi"
+ assert s.expected_response == "hello"
+ assert s.variables == {}
+
+ def test_variables_populated(self):
+ s = GroundTruthSample(user_input="hi", expected_response="hello", variables={"lang": "en"})
+ assert s.variables == {"lang": "en"}
+
+
+class TestGroundTruthOptimizationOptionsValidation:
+ def _make(self, **overrides) -> GroundTruthOptimizationOptions:
+ defaults = dict(
+ context_choices=[LD_CONTEXT],
+ ground_truth_responses=[
+ GroundTruthSample(user_input="q1", expected_response="a1"),
+ ],
+ max_attempts=3,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="ans")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ )
+ defaults.update(overrides)
+ return GroundTruthOptimizationOptions(**defaults)
+
+ def test_valid_options_created(self):
+ opts = self._make()
+ assert len(opts.ground_truth_responses) == 1
+
+ def test_raises_empty_model_choices(self):
+ with pytest.raises(ValueError, match="model_choices"):
+ self._make(model_choices=[])
+
+ def test_raises_empty_ground_truth_responses(self):
+ with pytest.raises(ValueError, match="ground_truth_responses"):
+ self._make(ground_truth_responses=[])
+
+ def test_raises_no_judges_and_no_on_turn(self):
+ with pytest.raises(ValueError, match="judges or on_turn"):
+ self._make(judges=None, on_turn=None)
+
+ def test_on_turn_satisfies_criteria_requirement(self):
+ opts = self._make(judges=None, on_turn=lambda ctx: True)
+ assert opts.on_turn is not None
+
+
+# ---------------------------------------------------------------------------
+# _run_ground_truth_optimization / optimize_from_ground_truth_options
+# ---------------------------------------------------------------------------
+
+
+def _make_gt_options(**overrides) -> GroundTruthOptimizationOptions:
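+    """Build GroundTruthOptimizationOptions with two passing samples; override fields per test."""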
+ defaults: Dict[str, Any] = dict(
+ context_choices=[LD_CONTEXT],
+ ground_truth_responses=[
+ GroundTruthSample(user_input="What is 2+2?", expected_response="4", variables={"lang": "English"}),
+ GroundTruthSample(user_input="What is 3+3?", expected_response="6", variables={"lang": "English"}),
+ ],
+ max_attempts=3,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ judge_model="gpt-4o",
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="The answer is correct.")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ )
+ defaults.update(overrides)
+ return GroundTruthOptimizationOptions(**defaults)
+
+
+def _make_winning_context(
+ model: str = "gpt-4o",
+ instructions: str = "Be helpful.",
+ parameters: Dict[str, Any] | None = None,
+) -> OptimizationContext:
+ """Return a minimal OptimizationContext representing a successful run."""
+ return OptimizationContext(
+ scores={},
+ completion_response="The answer is 4.",
+ current_instructions=instructions,
+ current_parameters=parameters or {},
+ current_variables={},
+ current_model=model,
+ iteration=1,
+ )
+
+
+def _make_api_client_for_commit(
+ existing_variation_keys: list | None = None,
+ model_configs: list | None = None,
+) -> MagicMock:
+ """Return a mock LDApiClient pre-configured for _commit_variation calls."""
+ mock = MagicMock()
+ existing = existing_variation_keys or []
+ mock.get_ai_config.return_value = {"variations": [{"key": k} for k in existing]}
+ mock.get_model_configs.return_value = model_configs if model_configs is not None else [
+ {"id": "gpt-4o", "key": "OpenAI.gpt-4o"},
+ {"id": "gpt-4o-mini", "key": "OpenAI.gpt-4o-mini"},
+ ]
+ mock.create_ai_config_variation.return_value = {"key": "new-variation"}
+ return mock
+
+
+class TestRunGroundTruthOptimization:
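+    """optimize_from_ground_truth_options returns one context per sample and fires the per-sample callbacks."""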
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _make_client(self) -> OptimizationClient:
+ return _make_client(self.mock_ldai)
+
+ async def test_returns_list_of_contexts_on_success(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(results) == 2
+ for ctx in results:
+ assert isinstance(ctx, OptimizationContext)
+
+ async def test_each_context_has_correct_user_input(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert results[0].user_input == "What is 2+2?"
+ assert results[1].user_input == "What is 3+3?"
+
+ async def test_completion_response_set_on_each_context(self):
+ client = self._make_client()
+ opts = _make_gt_options(handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="42")))
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ for ctx in results:
+ assert ctx.completion_response == "42"
+
+ async def test_on_sample_result_called_per_sample(self):
+ client = self._make_client()
+ sample_results = []
+ opts = _make_gt_options(on_sample_result=lambda ctx: sample_results.append(ctx))
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(sample_results) == 2
+
+ async def test_on_passing_result_called_once_on_success(self):
+ client = self._make_client()
+ passing_calls = []
+ opts = _make_gt_options(on_passing_result=lambda ctx: passing_calls.append(ctx))
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(passing_calls) == 1
+
+ async def test_on_failing_result_called_when_max_attempts_exceeded(self):
+ client = self._make_client()
+ failing_calls = []
+ opts = _make_gt_options(
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=2,
+ on_failing_result=lambda ctx: failing_calls.append(ctx),
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(failing_calls) == 1
+
+ async def test_generates_variation_when_any_sample_fails(self):
+ client = self._make_client()
+ judge_responses = [
+ JUDGE_PASS_RESPONSE, # sample 1 attempt 1 — pass
+ JUDGE_FAIL_RESPONSE, # sample 2 attempt 1 — fail → trigger variation
+ JUDGE_PASS_RESPONSE, # sample 1 attempt 2 — pass
+ JUDGE_PASS_RESPONSE, # sample 2 attempt 2 — pass
+ ]
+ call_count = 0
+ async def side_effect(*args, **kwargs):
+ nonlocal call_count
+ resp = judge_responses[call_count]
+ call_count += 1
+ return OptimizationResponse(output=resp)
+
+ opts = _make_gt_options(
+ handle_judge_call=side_effect,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="ans1"),
+ OptimizationResponse(output="ans2"), # attempt 1 samples
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="ans3"),
+ OptimizationResponse(output="ans4"), # attempt 2 samples
+ ]),
+ max_attempts=3,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(results) == 2
+
+ async def test_iteration_numbers_are_linear_and_unique(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ iterations = [ctx.iteration for ctx in results]
+ assert len(set(iterations)) == len(iterations)
+
+ async def test_on_sample_result_exception_does_not_abort(self):
+ client = self._make_client()
+
+ def bad_callback(ctx):
+ raise RuntimeError("boom")
+
+ opts = _make_gt_options(on_sample_result=bad_callback)
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(results) == 2
+
+ async def test_variables_from_samples_used_per_evaluation(self):
+ client = self._make_client()
+ received_contexts = []
+ async def capture_agent_call(key, config, ctx, is_evaluation=False):
+ received_contexts.append(ctx)
+ return OptimizationResponse(output="response")
+
+ opts = _make_gt_options(
+ ground_truth_responses=[
+ GroundTruthSample(user_input="q1", expected_response="a1", variables={"lang": "English"}),
+ GroundTruthSample(user_input="q2", expected_response="a2", variables={"lang": "French"}),
+ ],
+ handle_agent_call=capture_agent_call,
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert received_contexts[0].current_variables == {"lang": "English"}
+ assert received_contexts[1].current_variables == {"lang": "French"}
+
+ async def test_model_falls_back_to_first_model_choice_when_agent_config_has_no_model(self):
+ """When the LD agent config has no model name the first model_choices entry is used."""
+ config_without_model = _make_agent_config(model_name="")
+ mock_ldai = _make_ldai_client(agent_config=config_without_model)
+ client = _make_client(mock_ldai)
+
+ observed_models = []
+ async def capture(key, config, ctx, is_evaluation=False):
+ observed_models.append(config.model.name if config.model else None)
+ return OptimizationResponse(output="answer")
+
+ opts = _make_gt_options(
+ handle_agent_call=capture,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert all(m == "gpt-4o" for m in observed_models), (
+ f"Expected all agent calls to use 'gpt-4o' (fallback), got: {observed_models}"
+ )
+
+ async def test_missing_instructions_raises_value_error(self):
+ """An agent config with no instructions raises ValueError before the loop starts."""
+ config_no_instructions = _make_agent_config(instructions="")
+ mock_ldai = _make_ldai_client(agent_config=config_no_instructions)
+ # variation() also needs to return no instructions so the fallback doesn't hide the gap.
+ mock_ldai._client.variation.return_value = {"instructions": ""}
+ client = _make_client(mock_ldai)
+
+ opts = _make_gt_options()
+ with pytest.raises(ValueError, match="has no instructions configured"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+
+# ---------------------------------------------------------------------------
+# expected_response in judge evaluation
+# ---------------------------------------------------------------------------
+
+
+class TestExpectedResponseInJudges:
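+    """expected_response is injected into the acceptance judge's user message rather than the system prompt."""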
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "test-agent"
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ async def test_expected_response_included_in_acceptance_judge_user_message(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ expected_response="The expected answer is 42.",
+ )
+ assert len(captured_configs) == 1
+ user_msg = captured_configs[0].messages[-1].content
+ assert "The expected answer is 42." in user_msg
+
+ async def test_expected_response_in_acceptance_judge_user_message(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ expected_response="gold standard",
+ )
+ user_msg = captured_configs[0].messages[1].content
+ assert "gold standard" in user_msg
+ assert "expected response" in user_msg.lower()
+ # Scoring instructions should now live in the user message, not the system prompt
+ system_msg = captured_configs[0].messages[0].content
+ assert "gold standard" not in system_msg
+
+ async def test_no_expected_response_leaves_judge_messages_unchanged(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ )
+ user_msg = captured_configs[0].messages[-1].content
+ assert "expected response" not in user_msg.lower()
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config — ground truth path
+# ---------------------------------------------------------------------------
+
+
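+# Like _API_CONFIG, but with groundTruthResponses; userInputOptions, groundTruthResponses, and
+# variableChoices are zipped by index into GroundTruthSample entries.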
+_API_CONFIG_WITH_GT: Dict[str, Any] = {
+ "id": "opt-gt-uuid",
+ "key": "my-gt-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"lang": "English"}, {"lang": "French"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?", "What is 3+3?"],
+ "groundTruthResponses": ["4", "6"],
+ "version": 1,
+ "createdAt": 1700000000,
+}
+
+
+class TestBuildOptionsFromConfigGroundTruth:
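+    """_build_options_from_config returns GroundTruthOptimizationOptions whenever groundTruthResponses is present."""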
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "my-agent"
+ self.client._initialize_class_members_from_config(_make_agent_config())
+ self.client._options = _make_options()
+ self.api_client = _make_mock_api_client()
+
+ def _build(self, config=None, options=None):
+ return self.client._build_options_from_config(
+ config or dict(_API_CONFIG_WITH_GT),
+ options or _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-gt-key",
+ run_id="run-uuid-789",
+ model_configs=[],
+ )
+
+ def test_returns_ground_truth_options_when_gt_present(self):
+ result = self._build()
+ assert isinstance(result, GroundTruthOptimizationOptions)
+
+ def test_samples_zipped_by_index(self):
+ result = self._build()
+ assert isinstance(result, GroundTruthOptimizationOptions)
+ assert len(result.ground_truth_responses) == 2
+ s0 = result.ground_truth_responses[0]
+ assert s0.user_input == "What is 2+2?"
+ assert s0.expected_response == "4"
+ assert s0.variables == {"lang": "English"}
+ s1 = result.ground_truth_responses[1]
+ assert s1.user_input == "What is 3+3?"
+ assert s1.expected_response == "6"
+ assert s1.variables == {"lang": "French"}
+
+ def test_model_choices_have_prefix_stripped(self):
+ config = dict(_API_CONFIG_WITH_GT)
+ config["modelChoices"] = ["OpenAI.gpt-4o"]
+ result = self._build(config=config)
+ assert isinstance(result, GroundTruthOptimizationOptions)
+ assert result.model_choices == ["gpt-4o"]
+
+ def test_raises_on_mismatched_lengths(self):
+ config = dict(_API_CONFIG_WITH_GT)
+ config["userInputOptions"] = ["only one input"]
+ with pytest.raises(ValueError, match="same length"):
+ self._build(config=config)
+
+ def test_returns_standard_options_when_no_gt(self):
+ config = dict(_API_CONFIG) # no groundTruthResponses
+ result = self._build(config=config)
+ assert isinstance(result, OptimizationOptions)
+
+ async def test_optimize_from_config_dispatches_to_gt_run(self):
+ mock_ldai = _make_ldai_client()
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-key"}):
+ client = _make_client(mock_ldai)
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG_WITH_GT))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="correct answer")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ )
+ result = await client.optimize_from_config("my-gt-opt", options)
+
+ assert isinstance(result, list)
+ assert len(result) == 2
+
+
+# ---------------------------------------------------------------------------
+# _acceptance_criteria_implies_duration_optimization
+# ---------------------------------------------------------------------------
+
+
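+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# helper appears to scan every judge's acceptance_statement for latency-related
+# keywords, case-insensitively, and returns True if any judge matches. The
+# exact keyword list is an assumption based on the cases tested here.
+_SKETCH_LATENCY_KEYWORDS = (
+    "fast", "latency", "duration", "ms", "response time", "efficient", "snappy",
+)
+
+
+def _sketch_implies_duration_optimization(judges):
+    if not judges:
+        return False
+    return any(
+        keyword in (judge.acceptance_statement or "").lower()
+        for judge in judges.values()
+        for keyword in _SKETCH_LATENCY_KEYWORDS
+    )
+
+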
+class TestAcceptanceCriteriaImpliesDurationOptimization:
+ def test_returns_false_when_judges_is_none(self):
+ assert _acceptance_criteria_implies_duration_optimization(None) is False
+
+ def test_returns_false_when_judges_is_empty(self):
+ assert _acceptance_criteria_implies_duration_optimization({}) is False
+
+ def test_returns_false_when_no_acceptance_statements(self):
+ judges = {"quality": OptimizationJudge(threshold=0.8, judge_key="judge-1")}
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+ def test_returns_false_when_acceptance_statement_has_no_latency_keywords(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and complete.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+ def test_detects_fast_keyword(self):
+ judges = {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_faster_keyword(self):
+ judges = {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The agent should respond faster.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_latency_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The agent must have low latency.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_duration_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Minimize the duration of each response.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_ms_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should complete in under 500ms.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_response_time_phrase(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response time should be minimized.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_efficient_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The model must be efficient.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_snappy_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should feel snappy.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_case_insensitive_match(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The model must be EFFICIENT and FAST.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_returns_true_when_any_judge_matches(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ ),
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ ),
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_returns_false_when_acceptance_statement_is_none(self):
+ judges = {
+ "quality": OptimizationJudge(threshold=0.8, acceptance_statement=None)
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_duration
+# ---------------------------------------------------------------------------
+
+
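+# Editorial sketch (hypothetical, inferred from the assertions below): a
+# candidate passes only if it is strictly more than 20% faster than the first
+# recorded attempt (history[0]); a missing timing on either side skips the
+# check entirely. The 0.80 factor is inferred from the cases here, not
+# confirmed against the implementation.
+def _sketch_evaluate_duration(history_durations_ms, candidate_ms):
+    if not history_durations_ms:
+        return True
+    baseline = history_durations_ms[0]
+    if baseline is None or candidate_ms is None:
+        return True
+    return candidate_ms < baseline * 0.80
+
+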
+class TestEvaluateDuration:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ def _ctx(self, duration_ms, iteration=1):
+ return OptimizationContext(
+ scores={},
+ completion_response="response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ )
+
+ def test_returns_true_when_history_is_empty(self):
+ self.client._history = []
+ assert self.client._evaluate_duration(self._ctx(5000)) is True
+
+ def test_returns_true_when_baseline_duration_is_none(self):
+ self.client._history = [self._ctx(None, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(5000, iteration=2)) is True
+
+ def test_returns_true_when_candidate_duration_is_none(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(None, iteration=2)) is True
+
+ def test_passes_when_candidate_is_more_than_20_percent_faster(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1500ms → 1500 < 1600 → pass
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1500, iteration=2)) is True
+
+ def test_fails_when_candidate_is_exactly_at_threshold(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1600ms → not strictly less → fail
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1600, iteration=2)) is False
+
+ def test_fails_when_improvement_is_less_than_20_percent(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1800ms → 1800 >= 1600 → fail
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1800, iteration=2)) is False
+
+ def test_fails_when_candidate_matches_baseline(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(2000, iteration=2)) is False
+
+ def test_fails_when_candidate_is_slower_than_baseline(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(2500, iteration=2)) is False
+
+ def test_uses_history_index_zero_as_baseline_not_last(self):
+ # history[0] is 2000ms (baseline), history[-1] is 500ms (fast, but not the baseline)
+ first = self._ctx(2000, iteration=1)
+ later = self._ctx(500, iteration=2)
+ self.client._history = [first, later]
+ # candidate=1500ms < 2000 * 0.80 = 1600ms → pass (uses history[0], not history[-1])
+ assert self.client._evaluate_duration(self._ctx(1500, iteration=3)) is True
+
+ def test_pass_boundary_just_below_threshold(self):
+ # baseline=1000ms, threshold=800ms, candidate=799ms → pass
+ self.client._history = [self._ctx(1000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(799, iteration=2)) is True
+
+
+# ---------------------------------------------------------------------------
+# Duration optimization — chaos mode wiring
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestDurationOptimizationChaosMode:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _duration_judges(self, statement="The response must be fast."):
+ return {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement=statement,
+ )
+ }
+
+ def _ctx_with(self, duration_ms, score=1.0, iteration=1):
+ return OptimizationContext(
+ scores={"speed": JudgeResult(score=score)},
+ completion_response="answer",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={"language": "English"},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ )
+
+ async def test_duration_gate_triggers_variation_when_not_fast_enough(self):
+ """Judge passes but duration fails threshold → variation generated → second attempt succeeds."""
+ client = _make_client(self.mock_ldai)
+
+ # Iter 1: judge fails → history[0].duration_ms = 2000
+ # Iter 2: judge passes, duration 1800ms ≥ 2000 * 0.80 = 1600ms → duration fails → variation
+ # Iter 3: judge passes, duration 1500ms < 1600ms → passes → validation → success
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=0.2, iteration=1), # iter 1: judge fails
+ self._ctx_with(duration_ms=1800, score=1.0, iteration=2), # iter 2: judge passes, duration fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=3), # iter 3: both pass
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=4), # validation
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result.duration_ms == 1500
+ # 2 variations generated (after iter 1 judge fail, after iter 2 duration fail)
+ assert handle_agent_call.call_count == 2
+ assert mock_execute.call_count == 4
+
+ async def test_duration_check_skipped_on_first_iteration_no_baseline(self):
+ """First iteration has no history → duration check always skipped → succeeds even if slow."""
+ client = _make_client(self.mock_ldai)
+
+        # Iter 1 (no history): judge passes, duration check skipped → validation
+        # Validation: judge passes; no usable baseline has been recorded yet, so the duration check is skipped again
+ execute_side_effects = [
+ self._ctx_with(duration_ms=9999, score=1.0, iteration=1), # iter 1: would fail if checked
+ self._ctx_with(duration_ms=9999, score=1.0, iteration=2), # validation
+ ]
+
+ opts = _make_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=self._duration_judges(),
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ # Succeeds because history is empty and duration check is skipped
+ assert result.duration_ms == 9999
+
+ async def test_no_duration_gate_when_acceptance_criteria_has_no_latency_keywords(self):
+ """Acceptance statement with no latency keywords → duration gate never applied."""
+ client = _make_client(self.mock_ldai)
+
+ # Judge passes on first try; duration would fail if gate were applied (same as baseline)
+ # but since acceptance criteria has no latency keywords, it should succeed anyway
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=1.0, iteration=1),
+ self._ctx_with(duration_ms=2000, score=1.0, iteration=2), # validation
+ ]
+
+ non_latency_judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and complete.",
+ )
+ }
+ opts = _make_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=non_latency_judges,
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ # Manually seed history so _evaluate_duration would fire if incorrectly triggered
+ client._history = [self._ctx_with(duration_ms=2000, iteration=0)]
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result is not None
+
+ async def test_evaluate_duration_called_in_validation_phase(self):
+ """Duration gate also runs on validation samples, not just the primary turn."""
+ client = _make_client(self.mock_ldai)
+
+ # Iter 1: judge fails → history[0].duration_ms = 2000
+ # Iter 2: judge passes, duration 1500ms → primary passes
+ # Validation sample: judge passes, duration 1800ms ≥ 1600ms → validation fails → variation
+ # Iter 3: judge passes, duration 1500ms → primary passes
+ # Validation: judge passes, duration 1500ms → validation passes → success
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=0.2, iteration=1), # iter 1: judge fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=2), # iter 2: passes
+ self._ctx_with(duration_ms=1800, score=1.0, iteration=3), # validation: duration fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=4), # iter 3: passes
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=5), # validation: passes
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result.duration_ms == 1500
+ assert mock_execute.call_count == 5
+
+
+# ---------------------------------------------------------------------------
+# Duration optimization — ground truth mode wiring
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestDurationOptimizationGroundTruthMode:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _duration_judges(self):
+ return {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ }
+
+ def _gt_ctx(self, duration_ms, score=1.0, iteration=1, user_input="q"):
+ return OptimizationContext(
+ scores={"speed": JudgeResult(score=score)},
+ completion_response="answer",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ user_input=user_input,
+ )
+
+ async def test_duration_gate_applied_per_sample_in_ground_truth_mode(self):
+ """In GT mode, the duration check fires per sample, not just once per attempt."""
+ client = _make_client(self.mock_ldai)
+
+ # Attempt 1:
+ # Sample 1: judge fails (score 0.2) → all_passed = False
+ # Sample 2: judge passes → duration skipped (history empty for sample 2)
+ # → history extended with attempt 1 results → variation generated
+ # Attempt 2:
+ # Sample 1: judge passes, duration 1800ms vs baseline history[0].duration_ms = 2000ms
+ # → 1800 >= 1600 → duration fails → sample_passed = False → all_passed = False
+ # (attempt 2 fails due to duration on sample 1)
+ # → variation generated
+ # Attempt 3:
+ # Sample 1: judge passes, duration 1500ms < 1600ms → passes
+ # Sample 2: judge passes, duration 1500ms (history[0] still 2000ms) → passes
+ # → all_passed = True → success
+ execute_side_effects = [
+ # Attempt 1
+ self._gt_ctx(duration_ms=2000, score=0.2, iteration=1, user_input="q1"),
+ self._gt_ctx(duration_ms=2000, score=1.0, iteration=2, user_input="q2"),
+ # Variation (not from _execute_agent_turn, from handle_agent_call)
+ # Attempt 2
+ self._gt_ctx(duration_ms=1800, score=1.0, iteration=3, user_input="q1"),
+ self._gt_ctx(duration_ms=1800, score=1.0, iteration=4, user_input="q2"),
+ # Variation
+ # Attempt 3
+ self._gt_ctx(duration_ms=1500, score=1.0, iteration=5, user_input="q1"),
+ self._gt_ctx(duration_ms=1500, score=1.0, iteration=6, user_input="q2"),
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert isinstance(results, list)
+ for ctx in results:
+ assert ctx.duration_ms == 1500
+ # 2 variations generated
+ assert handle_agent_call.call_count == 2
+ assert mock_execute.call_count == 6
+
+ async def test_no_duration_gate_in_gt_mode_when_no_latency_keywords(self):
+ """In GT mode, duration gate is not applied when acceptance criteria has no latency keywords."""
+ client = _make_client(self.mock_ldai)
+
+ execute_side_effects = [
+ self._gt_ctx(duration_ms=5000, score=1.0, iteration=1, user_input="q1"),
+ self._gt_ctx(duration_ms=5000, score=1.0, iteration=2, user_input="q2"),
+ ]
+
+ non_latency_judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ )
+ }
+ opts = _make_gt_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=non_latency_judges,
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ # Succeeds on first attempt even with slow duration (no latency keyword → no gate)
+ assert isinstance(results, list)
+ assert mock_execute.call_count == 2
+
+
+# ---------------------------------------------------------------------------
+# _commit_variation
+# ---------------------------------------------------------------------------
+
+
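+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# variation key appears to be the caller-supplied output_key (or a generated
+# slug), with a four-digit hex suffix appended only when the key already
+# exists on the AI Config; a failed lookup of existing keys falls back to the
+# candidate unchanged.
+def _sketch_resolve_variation_key(candidate, existing_keys, rand_value):
+    if candidate in existing_keys:
+        return f"{candidate}-{rand_value:04x}"
+    return candidate
+
+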
+class TestCommitVariation:
+ def _make_client(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ # --- key generation ---
+
+ def test_uses_output_key_as_variation_key(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-custom-key", api_client=api_client,
+ )
+
+ assert key == "my-custom-key"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "my-custom-key"
+ assert payload["name"] == "my-custom-key"
+
+ def test_generates_slug_when_output_key_is_none(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ with patch("ldai_optimizer.client.generate_slug", return_value="fancy-panda"):
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key=None, api_client=api_client,
+ )
+
+ assert key == "fancy-panda"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "fancy-panda"
+ assert payload["name"] == "fancy-panda"
+
+ # --- collision handling ---
+
+ def test_appends_hex_suffix_on_key_collision(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(existing_variation_keys=["my-key"])
+
+ with patch("ldai_optimizer.client.random.randint", return_value=0x1234):
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key-1234"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "my-key-1234"
+
+ def test_no_suffix_when_key_does_not_collide(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(existing_variation_keys=["other-key"])
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+
+ def test_proceeds_with_candidate_when_get_ai_config_raises(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.get_ai_config.side_effect = Exception("network error")
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+ api_client.create_ai_config_variation.assert_called_once()
+
+ # --- payload shape ---
+
+ def test_payload_mode_is_agent(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["mode"] == "agent"
+
+ def test_payload_instructions_from_context(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ ctx = _make_winning_context(instructions="You are a travel assistant.")
+
+ client._commit_variation(
+ ctx, project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["instructions"] == "You are a travel assistant."
+
+ def test_create_called_with_correct_project_and_config_key(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="proj-abc",
+ ai_config_key="agent-xyz", output_key="k", api_client=api_client,
+ )
+
+ args = api_client.create_ai_config_variation.call_args[0]
+ assert args[0] == "proj-abc"
+ assert args[1] == "agent-xyz"
+
+ # --- modelConfigKey resolution ---
+
+ def test_model_config_key_resolved_via_api_match_on_id(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "gpt-4o", "key": "OpenAI.gpt-4o"},
+ {"id": "claude-3", "key": "Anthropic.claude-3"},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "OpenAI.gpt-4o"
+
+ def test_model_config_key_falls_back_to_model_name_when_no_id_match(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "claude-3", "key": "Anthropic.claude-3"},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "gpt-4o"
+
+ def test_model_config_key_prefers_global_over_non_global(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "global.gpt-4o"
+
+ def test_model_config_key_falls_back_when_get_model_configs_raises(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.get_model_configs.side_effect = Exception("network error")
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "gpt-4o"
+
+ # --- retry logic ---
+
+ def test_retries_on_transient_failure_and_succeeds(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.create_ai_config_variation.side_effect = [
+ Exception("transient"),
+ {"key": "my-key"},
+ ]
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+ assert api_client.create_ai_config_variation.call_count == 2
+
+ def test_raises_after_three_consecutive_failures(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.create_ai_config_variation.side_effect = RuntimeError("permanent")
+
+ with pytest.raises(RuntimeError, match="permanent"):
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ assert api_client.create_ai_config_variation.call_count == 3
+
+ # --- LDApiClient construction ---
+
+ def test_creates_api_client_from_stored_key_when_none_provided(self):
+ client = self._make_client()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ MockLDApiClient.return_value = _make_api_client_for_commit()
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k",
+ )
+
+ MockLDApiClient.assert_called_once_with("test-api-key")
+
+ def test_passes_base_url_when_creating_api_client(self):
+ client = self._make_client()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ MockLDApiClient.return_value = _make_api_client_for_commit()
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ MockLDApiClient.assert_called_once_with(
+ "test-api-key", base_url="https://app.launchdarkly.us"
+ )
+
+ def test_reuses_provided_api_client_without_creating_new_one(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ MockLDApiClient.assert_not_called()
+
+ # --- tool key propagation ---
+
+ def test_toolkeys_included_in_payload_when_tools_present(self):
+ client = self._make_client()
+ client._initial_tool_keys = ["search-tool", "calculator"]
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["toolKeys"] == ["search-tool", "calculator"]
+
+ def test_toolkeys_not_in_payload_when_no_tools(self):
+ client = self._make_client()
+ client._initial_tool_keys = []
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert "toolKeys" not in payload
+
+
+# ---------------------------------------------------------------------------
+# Tool key extraction from raw variation (_get_agent_config)
+# ---------------------------------------------------------------------------
+
+
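+# Editorial sketch (hypothetical, inferred from the assertions below): tool
+# keys appear to be collected from the raw variation's "tools" list, keeping
+# only entries that actually carry a "key" field.
+def _sketch_extract_tool_keys(raw_variation):
+    return [tool["key"] for tool in raw_variation.get("tools", []) if "key" in tool]
+
+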
+@pytest.mark.asyncio
+class TestGetAgentConfigToolKeyExtraction:
+ def _make_client_with_variation(self, raw_variation: dict) -> OptimizationClient:
+ mock_ldai = _make_ldai_client()
+ mock_ldai._client.variation.return_value = raw_variation
+ return _make_client(mock_ldai)
+
+ async def test_extracts_tool_keys_from_raw_variation(self):
+ raw = {
+ "instructions": AGENT_INSTRUCTIONS,
+ "tools": [
+ {"key": "search-tool", "version": 1},
+ {"key": "calculator", "version": 2},
+ ],
+ }
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == ["search-tool", "calculator"]
+
+ async def test_initial_tool_keys_empty_when_no_tools_in_variation(self):
+ raw = {"instructions": AGENT_INSTRUCTIONS}
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == []
+
+ async def test_initial_tool_keys_empty_when_tools_is_empty_list(self):
+ raw = {"instructions": AGENT_INSTRUCTIONS, "tools": []}
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == []
+
+ async def test_skips_tool_entries_without_key(self):
+ raw = {
+ "instructions": AGENT_INSTRUCTIONS,
+ "tools": [
+ {"key": "good-tool", "version": 1},
+ {"version": 2}, # missing key — should be skipped
+ ],
+ }
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == ["good-tool"]
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_options
+# ---------------------------------------------------------------------------
+
+
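+# Editorial sketch (hypothetical, inferred from the assertions below):
+# auto_commit=True appears to require both an API key and a project_key before
+# any optimization work starts, while auto_commit=False never touches the REST
+# API at all.
+def _sketch_check_auto_commit_preconditions(auto_commit, has_api_key, project_key):
+    if not auto_commit:
+        return
+    if not has_api_key:
+        raise ValueError("LAUNCHDARKLY_API_KEY is required when auto_commit=True")
+    if not project_key:
+        raise ValueError("project_key is required when auto_commit=True")
+
+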
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromOptions:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ client = OptimizationClient(_make_ldai_client())
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_commit_called_on_success_when_auto_commit_true(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ options = _make_options() # auto_commit defaults to False
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_not_called_when_run_fails(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True,
+ project_key="my-project",
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=1,
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_not_called()
+
+ async def test_succeeds_without_api_key_when_auto_commit_false(self):
+ client = self._make_client_without_key()
+ options = _make_options() # auto_commit defaults to False
+
+ with patch("ldai_optimizer.client.LDApiClient") as mock_api_cls:
+ result = await client.optimize_from_options("test-agent", options)
+
+ mock_api_cls.assert_not_called()
+ assert result is not None
+
+ async def test_raises_when_auto_commit_true_and_no_api_key(self):
+ client = self._make_client_without_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"):
+ await client.optimize_from_options("test-agent", options)
+
+ async def test_raises_when_auto_commit_true_and_no_project_key(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key=None)
+
+ with pytest.raises(ValueError, match="project_key"):
+ await client.optimize_from_options("test-agent", options)
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True, project_key="my-project", output_key="my-variation"
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_base_url_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True,
+ project_key="my-project",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["base_url"] == "https://app.launchdarkly.us"
+
+ async def test_agent_key_used_as_ai_config_key(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["ai_config_key"] == "test-agent"
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_ground_truth_options
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromGroundTruthOptions:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ client = OptimizationClient(_make_ldai_client())
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_commit_called_on_success_when_auto_commit_true(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options() # auto_commit defaults to False
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_not_called_when_run_fails(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True,
+ project_key="my-project",
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=1,
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_not_called()
+
+ async def test_raises_when_auto_commit_true_and_no_api_key(self):
+ client = self._make_client_without_key()
+ opts = _make_gt_options(auto_commit=True, project_key="my-project")
+
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ async def test_raises_when_auto_commit_true_and_no_project_key(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(auto_commit=True, project_key=None)
+
+ with pytest.raises(ValueError, match="project_key"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True, project_key="my-project", output_key="my-variation"
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_base_url_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True,
+ project_key="my-project",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert mock_commit.call_args[1]["base_url"] == "https://app.launchdarkly.us"
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_config
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromConfig:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ async def test_commit_called_by_default(self):
+ """auto_commit=True is the default for optimize_from_config."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config(
+ "my-opt", _make_from_config_options(auto_commit=False)
+ )
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_receives_pre_built_api_client(self):
+ """The api_client created for fetching config is reused for _commit_variation."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ assert mock_commit.call_args[1]["api_client"] is mock_api
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config(
+ "my-opt", _make_from_config_options(output_key="my-variation")
+ )
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_model_configs_forwarded_to_commit(self):
+ """Pre-fetched model configs are passed to _commit_variation to avoid extra API calls."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ mock_api.get_model_configs = MagicMock(return_value=[{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}])
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ assert mock_commit.call_args[1]["model_configs"] == [{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}]
+
+ async def test_patches_created_variation_key_after_commit(self):
+ """After _commit_variation succeeds, the last result record is PATCHed with createdVariationKey."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation", return_value="my-new-variation"):
+ client._last_optimization_result_id = "result-id-abc"
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ patch_calls = mock_api.patch_agent_optimization_result.call_args_list
+ variation_key_patch = next(
+ (c for c in patch_calls if c[0][3].get("createdVariationKey") == "my-new-variation"),
+ None,
+ )
+ assert variation_key_patch is not None, "Expected a PATCH with createdVariationKey"
+ # URL path uses the string key ("my-optimization"), not the UUID ("opt-uuid-123")
+ assert variation_key_patch[0][1] == "my-optimization"
+
+ async def test_optimization_key_in_post_url_uses_string_key_not_uuid(self):
+ """post_agent_optimization_result is called with config['key'], not config['id']."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ post_call_args = mock_api.post_agent_optimization_result.call_args_list
+ assert len(post_call_args) >= 1
+ for call in post_call_args:
+ opt_key_arg = call[0][1]
+ # Must use the string key "my-optimization", never the UUID "opt-uuid-123"
+ assert opt_key_arg == "my-optimization", (
+ f"Expected string key 'my-optimization', got '{opt_key_arg}'"
+ )
diff --git a/packages/optimization/tests/test_ld_api_client.py b/packages/optimization/tests/test_ld_api_client.py
new file mode 100644
index 00000000..4faa750b
--- /dev/null
+++ b/packages/optimization/tests/test_ld_api_client.py
@@ -0,0 +1,371 @@
+"""Tests for ldai_optimizer.ld_api_client."""
+
+import json
+import urllib.error
+import urllib.request
+from io import BytesIO
+from typing import Any, Dict
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from ldai_optimizer.ld_api_client import (
+ AgentOptimizationConfig,
+ AgentOptimizationResultPost as OptimizationResultPayload,
+ LDApiClient,
+ LDApiError,
+ _MAX_RETRIES,
+ _parse_agent_optimization,
+)
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+_BASE_CONFIG: Dict[str, Any] = {
+ "id": "opt-uuid-123",
+ "key": "my-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o", "gpt-4o-mini"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"language": "English"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?"],
+ "version": 1,
+ "createdAt": 1700000000,
+}
+
+
+def _make_config(**overrides: Any) -> Dict[str, Any]:
+ return {**_BASE_CONFIG, **overrides}
+
+
+def _mock_urlopen(response_data: Any, status: int = 200) -> MagicMock:
+ """Return a context-manager mock whose .read() returns JSON-encoded response_data."""
+ mock_resp = MagicMock()
+ mock_resp.read.return_value = json.dumps(response_data).encode()
+ mock_resp.__enter__ = lambda s: s
+ mock_resp.__exit__ = MagicMock(return_value=False)
+ return mock_resp
+
+
+# ---------------------------------------------------------------------------
+# _parse_agent_optimization
+# ---------------------------------------------------------------------------
+
+
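+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# parser appears to accumulate every field-level problem before raising a
+# single ValueError, rather than failing on the first missing field. A minimal
+# shape of that pattern:
+def _sketch_collect_field_errors(config, required_string_fields=("id", "key")):
+    if not isinstance(config, dict):
+        raise ValueError("Expected a JSON object")
+    errors = []
+    for field in required_string_fields:
+        if field not in config:
+            errors.append(f"missing required field '{field}'")
+        elif not isinstance(config[field], str):
+            errors.append(f"field '{field}' must be a string")
+    if errors:
+        raise ValueError("; ".join(errors))
+
+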
+class TestParseAgentOptimization:
+ def test_valid_config_is_returned_unchanged(self):
+ config = _make_config()
+ result = _parse_agent_optimization(config)
+ assert result["id"] == "opt-uuid-123"
+ assert result["aiConfigKey"] == "my-agent"
+
+ def test_optional_fields_not_required(self):
+ config = _make_config()
+ # groundTruthResponses and metricKey are optional — should not raise
+ assert "groundTruthResponses" not in config
+ assert "metricKey" not in config
+ _parse_agent_optimization(config) # must not raise
+
+ def test_raises_on_non_dict_input(self):
+ with pytest.raises(ValueError, match="Expected a JSON object"):
+ _parse_agent_optimization(["not", "a", "dict"])
+
+ def test_raises_on_none_input(self):
+ with pytest.raises(ValueError, match="Expected a JSON object"):
+ _parse_agent_optimization(None)
+
+ @pytest.mark.parametrize("field", ["id", "key", "aiConfigKey", "judgeModel"])
+ def test_raises_on_missing_required_string_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ @pytest.mark.parametrize("field", ["maxAttempts", "version", "createdAt"])
+ def test_raises_on_missing_required_int_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ @pytest.mark.parametrize(
+ "field",
+ ["modelChoices", "variableChoices", "acceptanceStatements", "judges", "userInputOptions"],
+ )
+ def test_raises_on_missing_required_list_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_string_field(self):
+ config = _make_config(aiConfigKey=123)
+ with pytest.raises(ValueError, match="field 'aiConfigKey' must be a string"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_int_field(self):
+ config = _make_config(maxAttempts="three")
+ with pytest.raises(ValueError, match="field 'maxAttempts' must be an integer"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_list_field(self):
+ config = _make_config(modelChoices="gpt-4o")
+ with pytest.raises(ValueError, match="field 'modelChoices' must be a list"):
+ _parse_agent_optimization(config)
+
+ def test_raises_when_model_choices_is_empty(self):
+ config = _make_config(modelChoices=[])
+ with pytest.raises(ValueError, match="at least 1 entry"):
+ _parse_agent_optimization(config)
+
+ def test_collects_multiple_errors_in_one_raise(self):
+ config = _make_config()
+ del config["id"]
+ del config["maxAttempts"]
+ config["modelChoices"] = "bad"
+ with pytest.raises(ValueError) as exc_info:
+ _parse_agent_optimization(config)
+ msg = str(exc_info.value)
+ assert "id" in msg
+ assert "maxAttempts" in msg
+ assert "modelChoices" in msg
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient._request
+# ---------------------------------------------------------------------------
+
+
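+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# Authorization header appears to carry the raw API key (no "Bearer" prefix),
+# and Content-Type is only attached when a JSON body is being sent.
+def _sketch_build_headers(api_key, has_body):
+    headers = {"Authorization": api_key}
+    if has_body:
+        headers["Content-Type"] = "application/json"
+    return headers
+
+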
+class TestLDApiClientRequest:
+ def test_get_does_not_send_content_type(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/some/path")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "Content-Type" not in req.headers
+
+ def test_post_sends_content_type(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("POST", "/some/path", body={"key": "value"})
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.get_header("Content-type") == "application/json"
+
+ def test_authorization_header_always_sent(self):
+ client = LDApiClient("my-api-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/path")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.get_header("Authorization") == "my-api-key"
+
+ def test_raises_ld_api_error_on_http_error(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"not found body")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/missing")
+ assert exc_info.value.status_code == 404
+ assert "404" in str(exc_info.value)
+
+ def test_raises_ld_api_error_on_url_error(self):
+ client = LDApiClient("test-key")
+ url_error = urllib.error.URLError(reason="Connection refused")
+ with patch("urllib.request.urlopen", side_effect=url_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert exc_info.value.status_code is None
+ assert "Connection refused" in str(exc_info.value)
+
+ def test_401_error_includes_api_key_hint(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=401, msg="Unauthorized", hdrs=MagicMock(), fp=BytesIO(b"")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError, match="LAUNCHDARKLY_API_KEY"):
+ client._request("GET", "/path")
+
+ def test_404_error_includes_key_hint(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError, match="project key"):
+ client._request("GET", "/path")
+
+ def test_custom_base_url_used_in_request(self):
+ client = LDApiClient("test-key", base_url="https://staging.launchdarkly.com")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/api/v2/test")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.full_url.startswith("https://staging.launchdarkly.com")
+
+ def test_trailing_slash_stripped_from_base_url(self):
+ client = LDApiClient("test-key", base_url="https://app.launchdarkly.com/")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/api/v2/test")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "//" not in req.full_url.replace("https://", "")
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient.get_agent_optimization
+# ---------------------------------------------------------------------------
+
+
+class TestGetAgentOptimization:
+ def test_requests_correct_path(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(_make_config())) as mock_open:
+ client.get_agent_optimization("my-project", "my-opt-key")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "/api/v2/projects/my-project/agent-optimizations/my-opt-key" in req.full_url
+
+ def test_returns_validated_config(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(_make_config())):
+ result = client.get_agent_optimization("proj", "opt")
+ assert result["aiConfigKey"] == "my-agent"
+ assert result["maxAttempts"] == 3
+
+ def test_raises_on_invalid_response(self):
+ client = LDApiClient("test-key")
+ bad_response = {"id": "x"} # missing many required fields
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(bad_response)):
+ with pytest.raises(ValueError, match="Invalid AgentOptimization response"):
+ client.get_agent_optimization("proj", "opt")
+
+ def test_raises_ld_api_error_on_http_404(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"not found")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client.get_agent_optimization("proj", "missing-key")
+ assert exc_info.value.status_code == 404
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient.post_agent_optimization_result
+# ---------------------------------------------------------------------------
+
+
+class TestPostAgentOptimizationResult:
+ def _make_payload(self) -> OptimizationResultPayload:
+ return {
+ "run_id": "run-abc",
+ "config_optimization_version": 1,
+ "status": "RUNNING",
+ "activity": "GENERATING",
+ "iteration": 1,
+ "instructions": "You are a helpful assistant.",
+ "parameters": {"temperature": 0.7},
+ "completion_response": "The answer is 4.",
+ "scores": {},
+ }
+
+ def test_requests_correct_path(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client.post_agent_optimization_result("my-project", "opt-uuid", self._make_payload())
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "/api/v2/projects/my-project/agent-optimizations/opt-uuid/results" in req.full_url
+
+ def test_sends_payload_as_json_body(self):
+ client = LDApiClient("test-key")
+ payload = self._make_payload()
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client.post_agent_optimization_result("proj", "opt-id", payload)
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ sent = json.loads(req.data.decode())
+ assert sent["run_id"] == "run-abc"
+ assert sent["status"] == "RUNNING"
+ assert sent["instructions"] == "You are a helpful assistant."
+
+ def test_swallows_http_errors_without_raising(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=500, msg="Server Error", hdrs=MagicMock(), fp=BytesIO(b"err")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with patch("time.sleep"):
+ # must not raise even after all retries are exhausted
+ client.post_agent_optimization_result("proj", "opt-id", self._make_payload())
+
+ def test_swallows_url_errors_without_raising(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")):
+ with patch("time.sleep"):
+ client.post_agent_optimization_result("proj", "opt-id", self._make_payload())
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient retry behaviour
+# ---------------------------------------------------------------------------
+
+
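+# Editorial sketch (hypothetical, inferred from the assertions below): 429 and
+# 5xx responses, plus network-level failures, appear to be retried with
+# exponential backoff (1s, 2s, 4s), while other 4xx errors raise immediately.
+# _MAX_RETRIES is assumed to be 3 based on the call counts asserted here.
+def _sketch_is_retryable(status_code):
+    return status_code is None or status_code == 429 or status_code >= 500
+
+
+def _sketch_backoff_delays(max_retries=3):
+    return [float(2 ** attempt) for attempt in range(max_retries)]
+
+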
+class TestLDApiClientRetry:
+ def _http_error(self, code: int) -> urllib.error.HTTPError:
+ return urllib.error.HTTPError(
+ url="http://x", code=code, msg="Error", hdrs=MagicMock(), fp=BytesIO(b"body")
+ )
+
+ def test_retryable_error_retries_max_times(self):
+ """A 429 or 5xx should be retried up to _MAX_RETRIES times then raise."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(429)) as mock_open:
+ with patch("time.sleep"):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert mock_open.call_count == _MAX_RETRIES + 1
+ assert exc_info.value.status_code == 429
+
+ def test_non_retryable_error_raises_immediately(self):
+ """A 401, 403, or 404 should raise after a single attempt with no retries."""
+ for code in (400, 401, 403, 404):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(code)) as mock_open:
+ with patch("time.sleep") as mock_sleep:
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert mock_open.call_count == 1, f"Expected 1 attempt for {code}, got {mock_open.call_count}"
+ mock_sleep.assert_not_called()
+ assert exc_info.value.status_code == code
+
+ def test_url_error_retries_max_times(self):
+ """Network-level errors should also be retried."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")) as mock_open:
+ with patch("time.sleep"):
+ with pytest.raises(LDApiError):
+ client._request("GET", "/path")
+ assert mock_open.call_count == _MAX_RETRIES + 1
+
+ def test_backoff_delays_are_exponential(self):
+ """Sleep durations should double on each retry: 1s, 2s, 4s."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(500)):
+ with patch("time.sleep") as mock_sleep:
+ with pytest.raises(LDApiError):
+ client._request("GET", "/path")
+ sleep_calls = [c.args[0] for c in mock_sleep.call_args_list]
+ assert sleep_calls == [1.0, 2.0, 4.0]
+
+ def test_succeeds_on_retry_after_transient_error(self):
+ """If a retryable error clears, the successful response should be returned."""
+ client = LDApiClient("test-key")
+ ok_response = _mock_urlopen({"result": "ok"})
+ side_effects = [self._http_error(500), ok_response]
+ with patch("urllib.request.urlopen", side_effect=side_effects) as mock_open:
+ with patch("time.sleep"):
+ result = client._request("GET", "/path")
+ assert result == {"result": "ok"}
+ assert mock_open.call_count == 2
diff --git a/packages/optimization/tests/test_package.py b/packages/optimization/tests/test_package.py
index 2123eb68..d7d29514 100644
--- a/packages/optimization/tests/test_package.py
+++ b/packages/optimization/tests/test_package.py
@@ -1,8 +1,8 @@
-"""Smoke tests for ldai_optimization."""
+"""Smoke tests for ldai_optimizer."""
import pytest
-from ldai_optimization import ApiAgentOptimizationClient, __version__
+from ldai_optimizer import OptimizationClient, __version__
def test_version_is_string():
@@ -10,7 +10,6 @@ def test_version_is_string():
assert len(__version__) > 0
-def test_optimize_not_implemented():
- client = ApiAgentOptimizationClient()
- with pytest.raises(NotImplementedError):
- client.optimize("example", {})
+def test_client_requires_ldai_client():
+ with pytest.raises(TypeError):
+ OptimizationClient() # type: ignore[call-arg]
diff --git a/release-please-config.json b/release-please-config.json
index cf0d738a..3cb7adc0 100644
--- a/release-please-config.json
+++ b/release-please-config.json
@@ -38,10 +38,8 @@
"versioning": "default",
"bump-minor-pre-major": true,
"include-v-in-tag": false,
- "extra-files": [
- "src/ldai_optimization/__init__.py"
- ],
- "component": "launchdarkly-server-sdk-ai-optimization"
+ "extra-files": ["src/ldai_optimizer/__init__.py"],
+ "component": "ldai_optimizer"
}
}
}