diff --git a/packages/optimization/Makefile b/packages/optimization/Makefile
index 12ad4476..2c9c3153 100644
--- a/packages/optimization/Makefile
+++ b/packages/optimization/Makefile
@@ -19,9 +19,9 @@ test: install
.PHONY: lint
lint: #! Run type analysis and linting checks
lint: install
- uv run mypy src/ldai_optimization
- uv run isort --check --atomic src/ldai_optimization
- uv run pycodestyle src/ldai_optimization
+ uv run mypy src/ldai_optimizer
+ uv run isort --check --atomic src/ldai_optimizer
+ uv run pycodestyle src/ldai_optimizer
.PHONY: build
build: #! Build distribution files
diff --git a/packages/optimization/README.md b/packages/optimization/README.md
index 4f1f64ed..3dc633ec 100644
--- a/packages/optimization/README.md
+++ b/packages/optimization/README.md
@@ -1,9 +1,6 @@
# LaunchDarkly AI SDK — optimization
-[](https://github.com/launchdarkly/python-server-sdk-ai/actions/workflows/ci.yml)
-
-[](https://pypi.org/project/launchdarkly-server-sdk-ai-optimization/)
-[](https://pypi.org/project/launchdarkly-server-sdk-ai-optimization/)
+[![PyPI](https://img.shields.io/pypi/v/ldai_optimizer)](https://pypi.org/project/ldai_optimizer/)
> [!CAUTION]
> This package is in pre-release and not subject to backwards compatibility
@@ -11,17 +8,122 @@
>
> Pin to a specific minor version and review the [changelog](CHANGELOG.md) before upgrading.
-This package will provide helpers to run selected tools against the [LaunchDarkly API](https://apidocs.launchdarkly.com/) from SDK-based workflows. The public surface is not yet finalized; see [CHANGELOG.md](CHANGELOG.md) for updates.
+This package provides helpers for running iterative AI prompt optimization workflows from within LaunchDarkly SDK-based applications. It drives the optimization loop — generating candidate variations, evaluating them with judges, and optionally committing winners back to LaunchDarkly — while delegating all LLM calls to your own handler functions.
+
+## Requirements
+
+- Python `>=3.9`
+- A configured [LaunchDarkly server-side SDK](https://docs.launchdarkly.com/sdk/server-side/python) client
+- The [LaunchDarkly AI package](https://pypi.org/project/launchdarkly-server-sdk-ai/) (`launchdarkly-server-sdk-ai>=0.16.0`) — pulled in automatically as a dependency
+- **`LAUNCHDARKLY_API_KEY` environment variable** — required only when using `auto_commit=True` or `optimize_from_config`. Not needed for basic `optimize_from_options` runs without auto-commit.
+
+> [!NOTE]
+> **`LAUNCHDARKLY_API_KEY` is used exclusively for discrete LaunchDarkly REST API calls** (fetching configs, publishing results). It is never included in any LLM prompt and is never forwarded to your handler callbacks. All API calls made by this package are isolated; they have no access to your runtime environment beyond the key you explicitly provide via the environment variable.
## Installation
```bash
-pip install launchdarkly-server-sdk-ai-optimization
+pip install ldai_optimizer
```
-## Status
+## Quick Start
+
+### Basic optimization (`optimize_from_options`)
-- 3/24/26: Initial package creation
+No `LAUNCHDARKLY_API_KEY` required unless `auto_commit=True`.
+
+```python
+import ldclient
+from ldclient import Context
+from ldai import LDAIClient
+from ldai_optimizer import (
+ OptimizationClient,
+ OptimizationJudge,
+ OptimizationOptions,
+ OptimizationResponse,
+ LLMCallConfig,
+ LLMCallContext,
+)
+
+ldclient.set_config(ldclient.Config("sdk-your-sdk-key"))
+ld = LDAIClient(ldclient.get())
+client = OptimizationClient(ld)
+
+def handle_llm_call(
+ run_id: str,
+ config: LLMCallConfig,
+ context: LLMCallContext,
+ is_evaluation: bool,
+) -> OptimizationResponse:
+ # config.model, config.instructions, config.key are available
+ # context.user_input, context.current_variables are available
+ response = your_llm_client.chat(
+ model=config.model.name if config.model else "gpt-4o",
+ system=config.instructions,
+ user=context.user_input or "",
+ )
+    return OptimizationResponse(output=response.text)
+
+result = await client.optimize_from_options(
+    "my-agent",
+    OptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+ judge_model="gpt-4o-mini",
+ judges={
+ "quality": OptimizationJudge(
+ threshold=1.0,
+ acceptance_statement="The response is accurate and concise.",
+ )
+ },
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ variable_choices=[{"user_id": "user-123"}],
+ user_input_choices=["What is my account balance?"],
+ )
+)
+```
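+
+### Auto-commit and status callbacks
+
+When `auto_commit=True` and `project_key` are set (and `LAUNCHDARKLY_API_KEY` is available), a passing run commits the winning variation back to LaunchDarkly, and progress can be observed through `on_status_update`. A minimal sketch building on the example above; the option fields and the `(status, context)` callback shape follow this release's `OptimizationOptions`, but the callback body is illustrative:
+
+```python
+def log_status(status: str, context) -> None:
+    # status is one of: "init", "generating", "evaluating", "generating variation",
+    # "validating", "turn completed", "success", "failure"
+    print(f"[iteration {context.iteration}] {status}")
+
+result = await client.optimize_from_options(
+    "my-agent",
+    OptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+        judge_model="gpt-4o-mini",
+        judges={
+            "quality": OptimizationJudge(
+                threshold=1.0,
+                acceptance_statement="The response is accurate and concise.",
+            )
+        },
+        model_choices=["gpt-4o", "gpt-4o-mini"],
+        variable_choices=[{"user_id": "user-123"}],
+        user_input_choices=["What is my account balance?"],
+        on_status_update=log_status,
+        auto_commit=True,
+        project_key="my-project",  # required when auto_commit=True
+    )
+)
+```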
+
+### Ground truth optimization
+
+```python
+from ldai_optimizer import GroundTruthOptimizationOptions, GroundTruthSample
+
+result = await client.optimize_from_ground_truth_options(
+    "my-agent",
+    GroundTruthOptimizationOptions(
+        context_choices=[Context.builder("optimization-user").build()],
+        handle_agent_call=handle_llm_call,
+ judge_model="gpt-4o-mini",
+ judges={
+ "accuracy": OptimizationJudge(
+ threshold=1.0,
+ acceptance_statement="The response matches the expected answer.",
+ )
+ },
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ ground_truth_responses=[
+ GroundTruthSample(
+ user_input="What is 2+2?",
+                expected_response="4",
+ )
+ ],
+ )
+)
+```
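+
+Every sample is judged individually on each attempt. The optional `on_sample_result` callback (a field on `GroundTruthOptimizationOptions` in this release) receives the per-sample `OptimizationContext` as each sample completes. A small sketch:
+
+```python
+def record_sample(ctx) -> None:
+    # ctx.user_input is the sample's question; ctx.scores maps judge keys to results
+    print(ctx.user_input, {name: judge.score for name, judge in ctx.scores.items()})
+
+# Pass on_sample_result=record_sample in GroundTruthOptimizationOptions to enable it.
+```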
+
+### Config-driven optimization (`optimize_from_config`)
+
+Requires `LAUNCHDARKLY_API_KEY`.
+
+```python
+from ldai_optimizer import OptimizationFromConfigOptions
+
+result = await client.optimize_from_config(
+ OptimizationFromConfigOptions(
+ config_key="my-optimization-config",
+ project_key="my-project",
+ handle_agent_call=handle_llm_call,
+ auto_commit=True,
+ )
+)
+```
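+
+Failures from the LaunchDarkly REST API surface as `LDApiError` (exported from `ldai_optimizer`). A sketch of guarding the call, assuming `LDApiError` is the exception raised for failed API requests:
+
+```python
+from ldai_optimizer import LDApiError
+
+try:
+    result = await client.optimize_from_config(
+        OptimizationFromConfigOptions(
+            config_key="my-optimization-config",
+            project_key="my-project",
+            handle_agent_call=handle_llm_call,
+            auto_commit=True,
+        )
+    )
+except LDApiError as err:
+    # A LaunchDarkly API request failed (e.g. invalid key or project)
+    print(f"Optimization aborted: {err}")
+```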
## License
diff --git a/packages/optimization/pyproject.toml b/packages/optimization/pyproject.toml
index 20a6b241..d6c40ee4 100644
--- a/packages/optimization/pyproject.toml
+++ b/packages/optimization/pyproject.toml
@@ -1,7 +1,7 @@
[project]
-name = "launchdarkly-server-sdk-ai-optimization"
+name = "ldai_optimizer"
version = "0.1.0" # x-release-please-version
-description = "LaunchDarkly AI SDK optimization helpers"
+description = "LaunchDarkly AI tool — optimizer"
authors = [{name = "LaunchDarkly", email = "dev@launchdarkly.com"}]
license = {text = "Apache-2.0"}
readme = "README.md"
@@ -42,7 +42,7 @@ requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
-packages = ["src/ldai_optimization"]
+packages = ["src/ldai_optimizer"]
[tool.mypy]
python_version = "3.10"
diff --git a/packages/optimization/src/ldai_optimization/__init__.py b/packages/optimization/src/ldai_optimization/__init__.py
deleted file mode 100644
index a0b379c6..00000000
--- a/packages/optimization/src/ldai_optimization/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""LaunchDarkly AI SDK — optimization.
-
-This package will provide helpers to run selected tools against the LaunchDarkly API from SDK-based workflows.
-"""
-
-from ldai_optimization.client import ApiAgentOptimizationClient
-
-__version__ = "0.1.0" # x-release-please-version
-
-__all__ = [
- '__version__',
- 'ApiAgentOptimizationClient',
-]
diff --git a/packages/optimization/src/ldai_optimization/client.py b/packages/optimization/src/ldai_optimization/client.py
deleted file mode 100644
index 75c38589..00000000
--- a/packages/optimization/src/ldai_optimization/client.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""Client placeholder for LaunchDarkly API tool execution."""
-
-from typing import Any, Dict
-
-
-class ApiAgentOptimizationClient:
- """Coordinates running supported tools against the LaunchDarkly API.
-
- This type is scaffolding; concrete behavior will be added in a future release.
- """
-
- def optimize(self, tool_name: str, parameters: Dict[str, Any]) -> Any:
- """Execute a supported LaunchDarkly API tool by name.
-
- :param tool_name: Identifier of the tool to invoke.
- :param parameters: Tool-specific request parameters.
- :return: Tool-specific response data.
- :raises NotImplementedError: Until the API integration is implemented.
- """
- raise NotImplementedError
diff --git a/packages/optimization/src/ldai_optimizer/__init__.py b/packages/optimization/src/ldai_optimizer/__init__.py
new file mode 100644
index 00000000..beb11cb7
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/__init__.py
@@ -0,0 +1,44 @@
+"""LaunchDarkly AI SDK — optimization.
+
+This package provides helpers for running iterative AI prompt optimization workflows from within LaunchDarkly SDK-based applications.
+"""
+
+from ldai.tracker import TokenUsage
+
+from ldai_optimizer.client import OptimizationClient
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ LLMCallConfig,
+ LLMCallContext,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.ld_api_client import LDApiError
+
+__version__ = "0.0.0"
+
+__all__ = [
+ '__version__',
+ 'AIJudgeCallConfig',
+ 'GroundTruthOptimizationOptions',
+ 'GroundTruthSample',
+ 'LDApiError',
+ 'LLMCallConfig',
+ 'LLMCallContext',
+ 'OptimizationClient',
+ 'OptimizationContext',
+ 'OptimizationFromConfigOptions',
+ 'OptimizationJudge',
+ 'OptimizationJudgeContext',
+ 'OptimizationOptions',
+ 'OptimizationResponse',
+ 'TokenUsage',
+ 'ToolDefinition',
+]
diff --git a/packages/optimization/src/ldai_optimizer/_slug_words.py b/packages/optimization/src/ldai_optimizer/_slug_words.py
new file mode 100644
index 00000000..564c2efb
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/_slug_words.py
@@ -0,0 +1,74 @@
+"""Word lists for slug generation.
+
+Adjectives and nouns curated from the coolname package's word files
+(Apache-2.0 licensed). Used by generate_slug() to produce
+``adjective-noun`` variation keys (e.g. ``blazing-lobster``).
+
+640 × 454 possible combinations from the full coolname corpus would be
+overkill; ~100 × ~100 = ~10 000 combinations is sufficient given that
+_commit_variation already appends a hex suffix on collisions.
+"""
+
+_ADJECTIVES: tuple = (
+ # appearance / texture
+ "blazing", "bouncy", "brawny", "chubby", "curvy", "elastic", "ethereal",
+ "fluffy", "foamy", "furry", "fuzzy", "glaring", "hairy", "hissing",
+ "icy", "luminous", "lumpy", "misty", "noisy", "quiet", "quirky",
+ "radiant", "roaring", "ruddy", "shaggy", "shiny", "silent", "silky",
+ "singing", "skinny", "smooth", "soft", "spicy", "spiked", "sticky",
+ "tall", "venomous", "warm", "winged", "wooden",
+ # personality / disposition
+ "adorable", "amazing", "amiable", "calm", "charming", "cute",
+ "dainty", "easygoing", "elegant", "famous", "friendly", "funny",
+ "graceful", "gracious", "happy", "hilarious", "jolly", "jovial",
+ "kind", "laughing", "lovely", "mellow", "neat", "nifty", "noble",
+ "popular", "pretty", "refreshing", "spiffy", "stylish", "sweet",
+ "tactful", "whimsical",
+ # character / trait
+ "adventurous", "ambitious", "audacious", "bold", "brave", "cheerful",
+ "curious", "daring", "determined", "eager", "enthusiastic", "faithful",
+ "fearless", "fierce", "generous", "gentle", "gleeful", "grateful",
+ "hopeful", "humble", "intrepid", "lively", "loyal", "merry",
+ "mysterious", "optimistic", "passionate", "polite", "proud", "rebel",
+ "relaxed", "reliable", "resolute", "romantic", "sincere", "spirited",
+ "stalwart", "thankful", "upbeat", "valiant", "vigorous", "vivacious",
+ "zealous", "zippy",
+ # quality / impressiveness
+ "ancient", "awesome", "brilliant", "classic", "dazzling", "fabulous",
+ "fantastic", "glorious", "legendary", "magnificent", "majestic",
+ "marvellous", "miraculous", "phenomenal", "remarkable", "splendid",
+ "wonderful",
+ # size
+ "colossal", "enormous", "gigantic", "huge", "massive", "tiny",
+ "towering",
+)
+
+_NOUNS: tuple = (
+ # common mammals
+ "badger", "bat", "bear", "beaver", "bison", "bobcat", "buffalo",
+ "capybara", "cheetah", "chipmunk", "coyote", "dingo", "dormouse",
+ "elephant", "ermine", "ferret", "fox", "gazelle", "gibbon", "gorilla",
+ "groundhog", "hamster", "hare", "hedgehog", "hippo", "horse",
+ "hyena", "jaguar", "kangaroo", "koala", "leopard", "lion", "lynx",
+ "mammoth", "marmot", "meerkat", "mongoose", "monkey", "moose",
+ "otter", "panda", "panther", "porcupine", "puma", "rabbit",
+ "raccoon", "rhinoceros", "seal", "skunk", "sloth", "squirrel",
+ "tiger", "walrus", "weasel", "whale", "wolf", "wombat",
+ "wolverine", "zebra",
+ # birds
+ "condor", "crane", "crow", "dove", "eagle", "falcon", "flamingo",
+ "hawk", "heron", "hummingbird", "kingfisher", "macaw", "magpie",
+ "ostrich", "owl", "parrot", "peacock", "pelican", "penguin",
+ "phoenix", "puffin", "raven", "robin", "sparrow", "starling",
+ "stork", "swan", "toucan", "vulture",
+ # reptiles / amphibians / fish
+ "cobra", "crocodile", "gecko", "iguana", "jellyfish", "lobster",
+ "narwhal", "octopus", "orca", "python", "rattlesnake", "salmon",
+ "seahorse", "shark", "snake", "squid", "tortoise", "turtle",
+ "viper",
+ # legendary / breed
+ "basilisk", "chimera", "chupacabra", "dragon", "griffin",
+ "kraken", "pegasus", "unicorn", "wyvern",
+ "beagle", "bulldog", "collie", "corgi", "dalmatian", "husky",
+ "labrador", "poodle", "rottweiler",
+)
diff --git a/packages/optimization/src/ldai_optimizer/client.py b/packages/optimization/src/ldai_optimizer/client.py
new file mode 100644
index 00000000..c7927d4c
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/client.py
@@ -0,0 +1,2356 @@
+"""Client for LaunchDarkly AI agent optimization.
+
+Security note — LAUNCHDARKLY_API_KEY scope
+-------------------------------------------
+When set, the ``LAUNCHDARKLY_API_KEY`` environment variable is used solely to
+authenticate discrete LaunchDarkly REST API calls (e.g. fetching optimization
+configs, publishing results via ``auto_commit``). It is:
+
+- Never included in any LLM prompt.
+- Never forwarded to user-supplied ``handle_agent_call`` or ``handle_judge_call``
+ callbacks.
+- Never accessible to any external service other than the LaunchDarkly REST API.
+
+All LaunchDarkly API calls are isolated requests; they carry no information
+about the caller's broader runtime environment beyond the key itself.
+"""
+
+import dataclasses
+import json
+import logging
+import os
+import random
+import time
+import uuid
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient
+from ldai.models import LDMessage, ModelConfig
+from ldclient import Context
+
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ HandleJudgeCall,
+ JudgeResult,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.ld_api_client import (
+ AgentOptimizationConfig,
+ AgentOptimizationResultPatch,
+ AgentOptimizationResultPost,
+ LDApiClient,
+)
+from ldai_optimizer.prompts import (
+ _acceptance_criteria_implies_duration_optimization,
+ build_message_history_text,
+ build_new_variation_prompt,
+ build_reasoning_history,
+)
+from ldai_optimizer.util import (
+ RedactionFilter,
+ await_if_needed,
+ extract_json_from_response,
+ generate_slug,
+ interpolate_variables,
+ restore_variable_placeholders,
+ validate_variation_response,
+)
+
+logger = logging.getLogger(__name__)
+logger.addFilter(RedactionFilter())
+
+
+def _find_model_config(
+ model_name: str, configs: List[Dict[str, Any]]
+) -> Optional[Dict[str, Any]]:
+ """Find the best matching model config for a given model name.
+
+ When multiple configs share the same ``id``, the one marked ``global=True``
+ is preferred over project-specific configs. Falls back to the first
+ non-global match if no global entry exists.
+
+ :param model_name: The model id to look up.
+ :param configs: List of model config dicts from the LD API.
+ :return: Best-matching model config dict, or None if no match.
+ """
+ matching = [mc for mc in configs if mc.get("id") == model_name]
+ if not matching:
+ return None
+ global_match = next((mc for mc in matching if mc.get("global") is True), None)
+ return global_match if global_match is not None else matching[0]
+
+
+def _strip_provider_prefix(model: str) -> str:
+ """Strip the provider prefix from a model identifier returned by the LD API.
+
+ API model keys are formatted as "Provider.model-name" (e.g. "OpenAI.gpt-5",
+ "Anthropic.claude-opus-4.6"). Only the part after the first period is needed
+ by the underlying LLM clients. If no period is present the string is returned
+ unchanged.
+
+ :param model: Raw model string from the API.
+ :return: Model name with provider prefix removed.
+ """
+ return model.split(".", 1)[-1]
+
+
+def _compute_validation_count(pool_size: int) -> int:
+ """Compute how many validation samples to run after a candidate passes in chaos mode.
+
+ Scales with the size of the available input/variable pool so that larger
+ option sets receive proportionally more validation coverage, capped at 5.
+ The floor of 2 ensures at least a minimal cross-check even for small pools.
+
+ :param pool_size: Total number of distinct choices in the sampling pool
+ (user_input_options count when provided, otherwise variable_choices count).
+ :return: Number of validation samples to run (between 2 and 5 inclusive).
+ """
+ return min(5, max(2, pool_size // 4))
+
+
+# Maximum number of attempts for variation generation. Transient empty or
+# unparseable responses from the LLM are retried up to this many times before
+# the variation step is treated as a failure.
+_MAX_VARIATION_RETRIES = 3
+
+# Duration gate: a candidate must be at least this much faster than the baseline
+# (history[0].duration_ms) to pass the duration check when acceptance criteria
+# imply a latency optimization goal. 0.80 means the candidate must clock in at
+# under 80% of the baseline — i.e. at least 20% improvement.
+_DURATION_TOLERANCE = 0.80
+
+# Maps SDK status strings to the API status/activity values expected by
+# agent_optimization_result records. Defined at module level to avoid
+# allocating the dict on every on_status_update invocation.
+_OPTIMIZATION_STATUS_MAP: Dict[str, Dict[str, str]] = {
+ "init": {"status": "RUNNING", "activity": "PENDING"},
+ "generating": {"status": "RUNNING", "activity": "GENERATING"},
+ "evaluating": {"status": "RUNNING", "activity": "EVALUATING"},
+ "generating variation": {"status": "RUNNING", "activity": "GENERATING_VARIATION"},
+ "validating": {"status": "RUNNING", "activity": "EVALUATING"},
+ "turn completed": {"status": "RUNNING", "activity": "COMPLETED"},
+ "success": {"status": "PASSED", "activity": "COMPLETED"},
+ "failure": {"status": "FAILED", "activity": "COMPLETED"},
+}
+
+
+class OptimizationClient:
+ _options: OptimizationOptions
+ _ldClient: LDAIClient
+ _agent_config: AIAgentConfig
+ _has_api_key: bool
+ _api_key: Optional[str]
+ _agent_key: str
+ _initial_instructions: str
+
+ def __init__(self, ldClient: LDAIClient) -> None:
+ self._ldClient = ldClient
+ self._last_run_succeeded: bool = False
+ self._last_succeeded_context: Optional[OptimizationContext] = None
+ self._last_optimization_result_id: Optional[str] = None
+ self._initial_tool_keys: List[str] = []
+ self._total_token_usage: int = 0
+
+        api_key = os.environ.get("LAUNCHDARKLY_API_KEY")
+        if api_key:
+            self._has_api_key = True
+            self._api_key = api_key
+        else:
+            self._has_api_key = False
+            self._api_key = None
+            logger.warning(
+                "LAUNCHDARKLY_API_KEY is not set, functionality will be limited"
+            )
+
+ def _initialize_class_members_from_config(
+ self, agent_config: AIAgentConfig
+ ) -> None:
+ if not agent_config.instructions:
+ raise ValueError(
+ f"Agent '{agent_config.key}' has no instructions configured. "
+ "Ensure the agent flag has instructions set before running an optimization."
+ )
+ self._current_instructions = agent_config.instructions
+ self._current_parameters: Dict[str, Any] = (
+ agent_config.model._parameters if agent_config.model else None
+ ) or {}
+ self._current_model: Optional[str] = (
+ agent_config.model.name if agent_config.model else None
+ )
+ self._history: List[OptimizationContext] = []
+
+ def _build_agent_config_for_context(
+ self, ctx: OptimizationContext, skip_interpolation: bool = False
+ ) -> AIAgentConfig:
+ """
+ Construct an AIAgentConfig that reflects the current optimization iteration.
+
+ Uses the instructions, model, and parameters from the given context so the
+ caller receives the variation being evaluated rather than the original base config.
+ ``{{placeholder}}`` tokens in the instructions are substituted using
+ ctx.current_variables at call time so the stored template is never mutated.
+
+ :param ctx: The OptimizationContext for this iteration
+ :param skip_interpolation: When True, skip variable interpolation on the
+ instructions. Use this when the instructions are a meta-prompt (e.g. a
+ variation-generation prompt) that deliberately contains ``{{key}}`` tokens
+ as text for the LLM to read rather than as runtime substitution targets.
+ :return: A fresh AIAgentConfig populated from the context's current state
+ """
+ instructions = (
+ interpolate_variables(ctx.current_instructions, ctx.current_variables)
+ if ctx.current_variables and not skip_interpolation
+ else ctx.current_instructions
+ )
+ return AIAgentConfig(
+ key=self._agent_key,
+ enabled=True,
+ create_tracker=self._agent_config.create_tracker,
+ model=ModelConfig(
+ name=ctx.current_model or "",
+ parameters=ctx.current_parameters,
+ ),
+ instructions=instructions,
+ provider=self._agent_config.provider,
+ )
+
+ def _create_optimization_context(
+ self,
+ iteration: int,
+ variables: Dict[str, Any],
+ user_input: Optional[str] = None,
+ completion_response: str = "",
+ scores: Optional[Dict[str, JudgeResult]] = None,
+ ) -> OptimizationContext:
+ """
+        Create an OptimizationContext with current state.
+
+ :param iteration: Current iteration number
+ :param variables: Variable set chosen for this iteration
+ :param user_input: Optional user input for this iteration
+ :param completion_response: Completion response string
+ :param scores: Optional dictionary of judge results
+        :return: A new OptimizationContext instance
+ """
+ flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history]
+ return OptimizationContext(
+ scores=scores or {},
+ completion_response=completion_response,
+ current_instructions=self._current_instructions,
+ current_parameters=self._current_parameters.copy(),
+ current_variables=variables,
+ current_model=self._current_model,
+ user_input=user_input,
+ history=tuple(flat_history),
+ iteration=iteration,
+ )
+
+ @property
+ def _judge_call(self) -> HandleJudgeCall:
+ """Return the judge callable, falling back to handle_agent_call when not set."""
+ return self._options.handle_judge_call or self._options.handle_agent_call
+
+ def _safe_status_update(
+ self,
+ status: Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+ ],
+ context: OptimizationContext,
+ iteration: int,
+ ) -> None:
+ """
+ Safely call on_status_update callback, catching and logging errors.
+
+ :param status: The status string to pass to the callback
+ :param context: The optimization context to pass to the callback
+ :param iteration: Current iteration number for logging
+ """
+ if self._options.on_status_update:
+ try:
+ self._options.on_status_update(status, context.copy_without_history())
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_status_update callback failed", iteration
+ )
+
+ def _judge_config(
+ self,
+ judge_key: str,
+ context: Context,
+ default: AIJudgeConfigDefault,
+ variables: Dict[str, Any],
+ ) -> AIJudgeConfig:
+ """
+ Fetch a judge configuration from the LaunchDarkly client.
+
+ Thin wrapper around LDAIClient.judge_config so callers do not need a
+ direct reference to the client.
+
+ :param judge_key: The key for the judge configuration in LaunchDarkly
+ :param context: The evaluation context
+ :param default: Fallback config when the flag is disabled or unreachable
+ :param variables: Template variables for instruction interpolation
+ :return: The resolved AIJudgeConfig
+ """
+ return self._ldClient.judge_config(judge_key, context, default, variables)
+
+ def _serialize_scores(
+ self, judge_results: Dict[str, JudgeResult]
+ ) -> Dict[str, Any]:
+ """
+ Convert judge results to a JSON-serializable dictionary.
+
+ :param judge_results: Dictionary of judge keys to JudgeResult instances
+ :return: Dictionary suitable for json.dumps
+ """
+ return {key: result.to_json() for key, result in judge_results.items()}
+
+ def _extract_agent_tools(self, parameters: Dict[str, Any]) -> List[ToolDefinition]:
+ """
+ Extract and normalise the tools list from agent parameters.
+
+ Reads the ``tools`` key from *parameters* (if present) and converts
+ every entry to a ToolDefinition so judges receive typed objects.
+
+ :param parameters: The agent's current_parameters dict
+ :return: List of ToolDefinition instances, empty list if no tools are configured
+ """
+ raw_tools = parameters.get("tools", [])
+ if not raw_tools:
+ return []
+ if not isinstance(raw_tools, list):
+ raw_tools = [raw_tools]
+
+ result = []
+ for tool in raw_tools:
+ if isinstance(tool, ToolDefinition):
+ result.append(tool)
+ elif hasattr(tool, "to_dict"):
+ result.append(ToolDefinition.from_dict(tool.to_dict()))
+ elif isinstance(tool, dict):
+ result.append(ToolDefinition.from_dict(tool))
+ return result
+
+ def _parse_judge_response(
+ self,
+ response_str: str,
+ judge_key: str,
+ judge_identifier: str,
+ iteration: int,
+ clamp_score: bool = True,
+ ) -> JudgeResult:
+ """
+ Parse a structured LLM judge response into a JudgeResult.
+
+ Expects a JSON object with "score" (float) and optionally "rationale"
+ (str). On any parsing failure, logs the exception and returns a zero score.
+
+ :param response_str: Raw string response from the judge LLM
+ :param judge_key: Key used to identify this judge in results dicts
+ :param judge_identifier: Human-readable identifier for log messages
+ :param iteration: Current iteration number for logging
+ :param clamp_score: When True, clamps score to [0.0, 1.0]
+ :return: Parsed JudgeResult, or a zero-score result on failure
+ """
+ try:
+ response_data = extract_json_from_response(response_str)
+ score = float(response_data.get("score", 0.0))
+ if clamp_score:
+ score = max(0.0, min(1.0, score))
+ rationale = response_data.get("rationale")
+ return JudgeResult(score=score, rationale=rationale)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> Failed to parse judge response for %s",
+ iteration,
+ judge_identifier,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ async def _call_judges(
+ self,
+ completion_response: str,
+ iteration: int,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ agent_duration_ms: Optional[float] = None,
+ ) -> Dict[str, JudgeResult]:
+ """
+ Call all judges in parallel (auto-path).
+
+ For judges with judge_key: Fetches judge config on-demand from LaunchDarkly SDK.
+ For judges with acceptance_statement: Uses handle_judge_call callback.
+
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param user_input: The user's question for this turn, forwarded to judges so
+ they know what was actually asked (the current turn is not yet in
+ self._history when judges run)
+ :param variables: The variable set that was used during the agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ judges are instructed to factor it into their scoring alongside acceptance criteria.
+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+ Forwarded to acceptance judges whose statement implies a latency goal so they
+ can mention the duration change in their rationale.
+ :return: Dictionary of judge results (score and rationale)
+ """
+ if not self._options.judges:
+ return {}
+
+ resolved_variables: Dict[str, Any] = variables or {}
+ resolved_agent_tools: List[ToolDefinition] = agent_tools or []
+
+ logger.info("[Iteration %d] -> Executing evaluation...", iteration)
+ reasoning_history = build_reasoning_history(self._history)
+ judge_results: Dict[str, JudgeResult] = {}
+
+ judge_count = len(self._options.judges)
+ for idx, (judge_key, optimization_judge) in enumerate(
+ self._options.judges.items(), 1
+ ):
+ judge_type = (
+ "config" if optimization_judge.judge_key is not None else "acceptance"
+ )
+ logger.info(
+ "[Iteration %d] -> Running judge %d/%d '%s' (%s)...",
+ iteration,
+ idx,
+ judge_count,
+ judge_key,
+ judge_type,
+ )
+ try:
+ if optimization_judge.judge_key is not None:
+ result = await self._evaluate_config_judge(
+ judge_key,
+ optimization_judge,
+ completion_response,
+ iteration,
+ reasoning_history,
+ user_input=user_input,
+ variables=resolved_variables,
+ agent_tools=resolved_agent_tools,
+ expected_response=expected_response,
+ )
+ judge_results[judge_key] = result
+ else:
+ result = await self._evaluate_acceptance_judge(
+ judge_key,
+ optimization_judge,
+ completion_response,
+ iteration,
+ reasoning_history,
+ user_input=user_input,
+ variables=resolved_variables,
+ agent_tools=resolved_agent_tools,
+ expected_response=expected_response,
+ agent_duration_ms=agent_duration_ms,
+ )
+ judge_results[judge_key] = result
+
+ threshold = (
+ optimization_judge.threshold
+ if optimization_judge.threshold is not None
+ else 1.0
+ )
+ passed = result.score >= threshold
+ logger.debug(
+ "[Iteration %d] -> Judge '%s' scored %.3f (threshold=%.3f) -> %s%s",
+ iteration,
+ judge_key,
+ result.score,
+ threshold,
+ "PASSED" if passed else "FAILED",
+ f" | {result.rationale}" if result.rationale else "",
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> Judge %s evaluation failed", iteration, judge_key
+ )
+ judge_results[judge_key] = JudgeResult(score=0.0, rationale=None)
+
+ judge_results_json = self._serialize_scores(judge_results)
+ logger.debug(
+ "[Iteration %d] -> Evaluation result: %s",
+ iteration,
+ json.dumps(judge_results_json, indent=2),
+ )
+ return judge_results
+
+ async def _evaluate_config_judge(
+ self,
+ judge_key: str,
+ optimization_judge: "OptimizationJudge",
+ completion_response: str,
+ iteration: int,
+ reasoning_history: str,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ ) -> JudgeResult:
+ """
+ Evaluate using a config-type judge (with judge_key).
+
+ :param judge_key: The key for this judge in the judges dict
+ :param optimization_judge: The optimization judge configuration
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param reasoning_history: Formatted string of reasoning from previous iterations
+ :param user_input: The user's question for this turn
+ :param variables: The variable set that was used during agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into template variables and judge messages.
+ :return: The judge result with score and rationale
+ """
+ # Config-type judge: fetch judge config on-demand from LaunchDarkly SDK
+ input_text = self._current_instructions or ""
+ # Combine current instructions, history, and current question for message_history
+ message_history_text = build_message_history_text(
+ self._history, input_text, reasoning_history, user_input
+ )
+
+ # Merge agent variables so the judge's LD-managed instructions can reference
+ # {{variable_name}} tokens alongside the standard judge template variables.
+ template_variables: Dict[str, Any] = {
+ **(variables or {}),
+ "message_history": message_history_text,
+ "response_to_evaluate": completion_response,
+ }
+ if expected_response is not None:
+ template_variables["expected_response"] = expected_response
+
+ assert optimization_judge.judge_key is not None
+ judge_config = self._judge_config(
+ optimization_judge.judge_key,
+ self._options.context_choices[0],
+ AIJudgeConfigDefault(enabled=False),
+ template_variables,
+ )
+
+ if not judge_config.enabled:
+ logger.warning(
+ "[Iteration %d] -> Judge %s is disabled",
+ iteration,
+ optimization_judge.judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ if not judge_config.messages:
+ logger.warning(
+ "[Iteration %d] -> Judge %s has no messages",
+ iteration,
+ optimization_judge.judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ # Split messages into system and user turns.
+ # System turns are joined into a single instructions string (agents SDK path).
+ # All messages are forwarded as-is for the completions path.
+ system_parts = []
+ user_parts = []
+ for msg in judge_config.messages:
+ if msg.role == "system":
+ system_parts.append(
+ msg.content
+ + " Return your response as a JSON object with 'score' and 'rationale' fields."
+ )
+ elif msg.role == "user":
+ user_parts.append(msg.content)
+
+ instructions = "\n\n".join(system_parts)
+ judge_user_input = (
+ "\n\n".join(user_parts)
+ if user_parts
+ else f"Here is the response to evaluate: {completion_response}"
+ )
+
+ if expected_response is not None:
+ judge_user_input += (
+ f"\n\nHere is the expected response: {expected_response}"
+ "\n\nEvaluate the actual response against both the acceptance criteria AND "
+ "how closely it matches the expected response. Factor both into your score."
+ )
+
+ # Rebuild the message list with the updated system content so completions users
+ # receive the same scoring instructions that are baked into `instructions`.
+ updated_messages: List[LDMessage] = [
+ LDMessage(role="system", content=instructions),
+ LDMessage(role="user", content=judge_user_input),
+ ]
+
+ # Always use the global judge_model; model parameters (temperature, etc.) from
+ # the judge flag are still forwarded, but the model name is never overridden.
+ model_name = self._options.judge_model
+ model_params: Dict[str, Any] = {}
+ tools: List[ToolDefinition] = []
+ if judge_config.model and judge_config.model._parameters:
+ existing_tools = judge_config.model._parameters.get("tools")
+ if existing_tools:
+ raw = (
+ existing_tools
+ if isinstance(existing_tools, list)
+ else [existing_tools]
+ )
+ for t in raw:
+ if isinstance(t, ToolDefinition):
+ tools.append(t)
+ elif hasattr(t, "to_dict"):
+ tools.append(ToolDefinition.from_dict(t.to_dict()))
+ elif isinstance(t, dict):
+ tools.append(ToolDefinition.from_dict(t))
+ model_params = {
+ k: v for k, v in judge_config.model._parameters.items() if k != "tools"
+ }
+
+ # Prepend agent tools so the judge can call them when verifying the response
+ if agent_tools:
+ tools = list(agent_tools) + tools
+
+ tool_params = {"tools": [t.to_dict() for t in tools]} if tools else {}
+ judge_call_config = AIJudgeCallConfig(
+ key=judge_key,
+ model=ModelConfig(
+ name=model_name,
+ parameters={**model_params, **tool_params},
+ ),
+ instructions=instructions,
+ messages=updated_messages,
+ )
+
+ judge_ctx = OptimizationJudgeContext(
+ user_input=judge_user_input,
+ current_variables=variables or {},
+ )
+
+ _judge_start = time.monotonic()
+ result = self._judge_call(
+ judge_key, judge_call_config, judge_ctx, True
+ )
+ judge_response: OptimizationResponse = await await_if_needed(result)
+ judge_duration_ms = (time.monotonic() - _judge_start) * 1000
+ judge_response_str = judge_response.output
+
+ logger.debug(
+ "[Iteration %d] -> Judge response (%s): %s",
+ iteration,
+ judge_key,
+ judge_response_str,
+ )
+
+ # Parse judge response — expect structured JSON output
+ judge_identifier = optimization_judge.judge_key or judge_key
+ judge_result = self._parse_judge_response(
+ judge_response_str,
+ judge_key,
+ judge_identifier,
+ iteration,
+ clamp_score=False,
+ )
+ return dataclasses.replace(judge_result, duration_ms=judge_duration_ms, usage=judge_response.usage)
+
+ async def _evaluate_acceptance_judge(
+ self,
+ judge_key: str,
+ optimization_judge: "OptimizationJudge",
+ completion_response: str,
+ iteration: int,
+ reasoning_history: str,
+ user_input: str,
+ variables: Optional[Dict[str, Any]] = None,
+ agent_tools: Optional[List[ToolDefinition]] = None,
+ expected_response: Optional[str] = None,
+ agent_duration_ms: Optional[float] = None,
+ ) -> JudgeResult:
+ """
+ Evaluate using an acceptance statement judge.
+
+ :param judge_key: The key for this judge in the judges dict
+ :param optimization_judge: The optimization judge configuration
+ :param completion_response: The agent's completion response to evaluate
+ :param iteration: Current iteration number
+ :param reasoning_history: Formatted string of reasoning from previous iterations
+ :param user_input: The user's question for this turn
+ :param variables: The variable set that was used during agent generation
+ :param agent_tools: Normalised list of tool dicts that were available to the agent
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into instructions and judge message so the judge can score actual vs. expected.
+ :param agent_duration_ms: Wall-clock duration of the agent call in milliseconds.
+ When the acceptance statement implies a latency goal, the judge is instructed
+ to mention the duration change in its rationale.
+ :return: The judge result with score and rationale
+ """
+ if not optimization_judge.acceptance_statement:
+ logger.error(
+ "[Iteration %d] -> Judge %s has no acceptance_statement",
+ iteration,
+ judge_key,
+ )
+ return JudgeResult(score=0.0, rationale=None)
+
+ resolved_variables = variables or {}
+ resolved_agent_tools = agent_tools or []
+
+ # Build message history including the current user question
+ message_history_text = build_message_history_text(
+ self._history, "", reasoning_history, user_input
+ )
+
+ # Build instructions for the judge
+ instructions = (
+ "You are a judge that evaluates the response to the user's question.\n\n"
+ "Here is the statement that you should evaluate the response against: "
+ f"'{optimization_judge.acceptance_statement}'\n"
+ f"Here is the history of all messages between the user and the assistant: {message_history_text}\n"
+ "You should score the response based on how well it meets the acceptance statement "
+ "using a score between 0.0 and 1.0.\n"
+ "A score of 0.0 means it does not match at all, while a score of 1.0 means it matches perfectly.\n"
+ "A score of 0.3-0.7 means it matches partially, while a score of 0.7-1.0 means it matches well.\n"
+ "A score of 0.0-0.3 means that it does not match well at all. "
+ "You can return any value between 0.0 and 1.0.\n"
+ "You should also provide a rationale for your score.\n"
+ "Return your response as a JSON object with 'score' and 'rationale' fields.\n\n"
+ 'Example: {"score": 0.8, "rationale": "The response matches the acceptance statement well."}'
+ )
+
+ if (
+ agent_duration_ms is not None
+ and _acceptance_criteria_implies_duration_optimization(
+ {judge_key: optimization_judge}
+ )
+ ):
+ baseline_ms = (
+ self._history[0].duration_ms
+ if self._history and self._history[0].duration_ms is not None
+ else None
+ )
+ instructions += (
+ f"\n\nThe acceptance criteria for this judge includes a latency/duration goal. "
+ f"The agent's response took {agent_duration_ms:.0f}ms to generate. "
+ )
+ if baseline_ms is not None:
+ delta_ms = agent_duration_ms - baseline_ms
+ direction = "faster" if delta_ms < 0 else "slower"
+ instructions += (
+ f"The baseline duration (first iteration) was {baseline_ms:.0f}ms. "
+ f"This response was {abs(delta_ms):.0f}ms {direction} than the baseline. "
+ )
+ instructions += (
+ "Please mention the duration and any change from baseline in your rationale."
+ )
+
+ if resolved_variables:
+ instructions += f"\n\nThe following variables were available to the agent: {json.dumps(resolved_variables)}"
+
+ if resolved_agent_tools:
+ tool_names = [t.name for t in resolved_agent_tools]
+ instructions += (
+ "\n\nThe following tools were available to the agent and "
+ f"may be called by you to verify the response: {json.dumps(tool_names)}."
+ "\nIf verifying the response requires looking up external information, "
+ "call the appropriate tool before scoring. "
+ "You should only call the tools for the most recent response, "
+ "and should only call the tools if necessary. "
+ "Assume that previous feedback will have addressed bad tool call results from prior iterations."
+ )
+
+ # Agent tools are passed through so the judge can invoke them for verification
+ tools: List[ToolDefinition] = list(resolved_agent_tools)
+
+ judge_user_input = f"Here is the response to evaluate: {completion_response}"
+ if expected_response is not None:
+ judge_user_input += (
+ f"\n\nHere is the expected response: {expected_response}"
+ "\n\nEvaluate the actual response against both the acceptance statement AND "
+ "how closely it matches the expected response. Factor both into your score."
+ )
+
+ tool_params = {"tools": [t.to_dict() for t in tools]} if tools else {}
+ judge_call_config = AIJudgeCallConfig(
+ key=judge_key,
+ model=ModelConfig(
+ name=self._options.judge_model,
+ parameters=tool_params,
+ ),
+ instructions=instructions,
+ messages=[
+ LDMessage(role="system", content=instructions),
+ LDMessage(role="user", content=judge_user_input),
+ ],
+ )
+
+ judge_ctx = OptimizationJudgeContext(
+ user_input=judge_user_input,
+ current_variables=resolved_variables,
+ )
+
+ _judge_start = time.monotonic()
+ result = self._judge_call(
+ judge_key, judge_call_config, judge_ctx, True
+ )
+ judge_response: OptimizationResponse = await await_if_needed(result)
+ judge_duration_ms = (time.monotonic() - _judge_start) * 1000
+ judge_response_str = judge_response.output
+
+ logger.debug(
+ "[Iteration %d] -> Judge response (%s): %s",
+ iteration,
+ judge_key,
+ judge_response_str,
+ )
+
+ # Parse judge response — expect structured JSON output with score and rationale
+ judge_result = self._parse_judge_response(
+ judge_response_str, judge_key, judge_key, iteration, clamp_score=True
+ )
+ return dataclasses.replace(judge_result, duration_ms=judge_duration_ms, usage=judge_response.usage)
+
+ async def _get_agent_config(
+ self, agent_key: str, context: Context
+ ) -> AIAgentConfig:
+ """
+ Fetch the agent configuration, replacing the instructions with the raw variation
+ template so that {{placeholder}} tokens are preserved for client-side interpolation.
+
+ agent_config() is called normally so we get a fully populated AIAgentConfig
+ (including the tracker). We then call variation() separately to retrieve the
+ unrendered instruction template and swap it in, keeping everything else intact.
+
+ :param agent_key: The key for the agent to get the configuration for
+ :param context: The evaluation context
+ :return: AIAgentConfig with raw {{placeholder}} instruction templates intact
+ """
+ try:
+ agent_config = self._ldClient.agent_config(agent_key, context)
+
+ # variation() returns the raw JSON before chevron.render(), so instructions
+ # still contain {{placeholder}} tokens rather than empty strings.
+ raw_variation = self._ldClient._client.variation(agent_key, context, {})
+ raw_instructions = raw_variation.get(
+ "instructions", agent_config.instructions
+ )
+ if not raw_instructions:
+ raise ValueError(
+ f"Agent '{agent_key}' has no instructions configured. "
+ "Ensure the agent flag has instructions set before running an optimization."
+ )
+ self._initial_instructions = raw_instructions
+
+ raw_tools = raw_variation.get("tools", [])
+ self._initial_tool_keys = [
+ t["key"]
+ for t in raw_tools
+ if isinstance(t, dict) and "key" in t
+ ]
+
+ agent_config = dataclasses.replace(
+ agent_config, instructions=raw_instructions
+ )
+ self._initialize_class_members_from_config(agent_config)
+ return agent_config
+ except Exception:
+ logger.exception("[Optimization] -> Failed to get agent configuration")
+ raise
+
+ async def optimize_from_options(
+ self, agent_key: str, options: OptimizationOptions
+ ) -> Any:
+ """Execute an optimization on the given agent with the given options.
+
+ :param agent_key: Identifier of the agent to optimize.
+ :param options: Optimization options.
+ :return: Optimization result.
+ """
+ if options.auto_commit:
+ if not self._has_api_key:
+ raise ValueError(
+ "auto_commit requires LAUNCHDARKLY_API_KEY to be set"
+ )
+ if not options.project_key:
+ raise ValueError(
+ "auto_commit requires project_key to be set on OptimizationOptions"
+ )
+ self._agent_key = agent_key
+ context = random.choice(options.context_choices)
+ agent_config = await self._get_agent_config(agent_key, context)
+ result = await self._run_optimization(agent_config, options)
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key, # type: ignore[arg-type]
+ ai_config_key=agent_key,
+ output_key=options.output_key,
+ base_url=options.base_url,
+ )
+ return result
+
+ async def optimize_from_ground_truth_options(
+ self, agent_key: str, options: GroundTruthOptimizationOptions
+ ) -> List[OptimizationContext]:
+ """Execute a ground truth optimization on the given agent.
+
+ Unlike optimize_from_options (which tests random choices until one passes),
+ this path evaluates all N ground truth samples in each attempt and only
+ succeeds when every sample passes its judges. A new variation is generated
+ whenever any sample fails, and all N samples are re-evaluated from scratch
+ with the updated configuration, up to max_attempts.
+
+ :param agent_key: Identifier of the agent to optimize.
+ :param options: Ground truth optimization options including the ordered sample list.
+ :return: List of OptimizationContexts from the final attempt (one per sample).
+ """
+ if options.auto_commit:
+ if not self._has_api_key:
+ raise ValueError(
+ "auto_commit requires LAUNCHDARKLY_API_KEY to be set"
+ )
+ if not options.project_key:
+ raise ValueError(
+ "auto_commit requires project_key to be set on GroundTruthOptimizationOptions"
+ )
+ self._agent_key = agent_key
+ context = random.choice(options.context_choices)
+ agent_config = await self._get_agent_config(agent_key, context)
+ result = await self._run_ground_truth_optimization(agent_config, options)
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key, # type: ignore[arg-type]
+ ai_config_key=agent_key,
+ output_key=options.output_key,
+ base_url=options.base_url,
+ )
+ return result
+
+ async def _run_ground_truth_optimization(
+ self,
+ agent_config: AIAgentConfig,
+ gt_options: GroundTruthOptimizationOptions,
+ ) -> List[OptimizationContext]:
+ """Run the ground truth optimization loop.
+
+ Uses the "bridge" pattern to reuse existing internal methods (judge evaluation,
+ variation generation, status callbacks) for the ground truth optimization.
+
+ :param agent_config: Agent configuration from LaunchDarkly.
+ :param gt_options: Ground truth options supplied by the caller.
+ :return: List of OptimizationContexts from the final attempt (one per sample).
+ """
+ bridge = OptimizationOptions(
+ context_choices=gt_options.context_choices,
+ max_attempts=gt_options.max_attempts,
+ model_choices=gt_options.model_choices,
+ judge_model=gt_options.judge_model,
+ variable_choices=[s.variables for s in gt_options.ground_truth_responses],
+ handle_agent_call=gt_options.handle_agent_call,
+ handle_judge_call=gt_options.handle_judge_call,
+ judges=gt_options.judges,
+ on_turn=gt_options.on_turn,
+ on_passing_result=gt_options.on_passing_result,
+ on_failing_result=gt_options.on_failing_result,
+ on_status_update=gt_options.on_status_update,
+ token_limit=gt_options.token_limit,
+ )
+ self._options = bridge
+ self._agent_config = agent_config
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._last_optimization_result_id = None
+ self._total_token_usage = 0
+ self._initialize_class_members_from_config(agent_config)
+
+ # Seed from the first model choice on the first iteration
+ # so agent calls never receive an empty model string.
+ if not self._current_model and bridge.model_choices:
+ self._current_model = bridge.model_choices[0]
+ logger.debug(
+ "[GT] -> No model in agent config; defaulting to first model choice: %s",
+ self._current_model,
+ )
+
+ samples = gt_options.ground_truth_responses
+ n = len(samples)
+
+ initial_context = self._create_optimization_context(
+ iteration=0,
+ variables=samples[0].variables,
+ )
+ self._safe_status_update("init", initial_context, 0)
+
+ # Attempt tracks the current "batch" loop that runs
+ # through all N samples. Iteration in this context refers to the
+ # total number of batch runs so far.
+ attempt = 0
+ while True:
+ attempt += 1
+ logger.info(
+ "[GT Attempt %d/%d] -> Starting ground truth run (%d samples, model=%s)",
+ attempt,
+ gt_options.max_attempts,
+ n,
+ self._current_model,
+ )
+
+ attempt_results: List[OptimizationContext] = []
+ all_passed = True
+ failed_count = 0
+
+ # Now iterate through each individual sample in the batch,
+ # creating a new context for each sample + running judges etc.
+ for i, sample in enumerate(samples):
+ linear_iter = (attempt - 1) * n + i + 1
+ truncated = len(sample.user_input) > 100
+ logger.info(
+ "[GT Attempt %d] -> Sample %d/%d (user_input=%.100s%s)",
+ attempt,
+ i + 1,
+ n,
+ sample.user_input,
+ "..." if truncated else "",
+ )
+
+ optimize_context = self._create_optimization_context(
+ iteration=linear_iter,
+ user_input=sample.user_input,
+ variables=sample.variables,
+ )
+
+ self._safe_status_update("generating", optimize_context, linear_iter)
+ optimize_context = await self._execute_agent_turn(
+ optimize_context,
+ linear_iter,
+ expected_response=sample.expected_response,
+ )
+ self._accumulate_tokens(optimize_context)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[GT Attempt %d] -> Token limit exceeded on sample %d (total=%d)",
+ attempt,
+ i + 1,
+ self._total_token_usage,
+ )
+ attempt_results.append(optimize_context)
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", optimize_context, linear_iter)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # Per-sample pass/fail check
+ if self._options.on_turn is not None:
+ try:
+ sample_passed = self._options.on_turn(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> Sample %d on_turn evaluation failed",
+ attempt,
+ i + 1,
+ )
+ sample_passed = False
+ else:
+ sample_passed = self._evaluate_response(optimize_context)
+
+ if sample_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ sample_passed = self._evaluate_duration(optimize_context)
+
+ if not sample_passed:
+ logger.info(
+ "[GT Attempt %d] -> Sample %d/%d FAILED",
+ attempt,
+ i + 1,
+ n,
+ )
+ all_passed = False
+ failed_count += 1
+ else:
+ logger.debug(
+ "[GT Attempt %d] -> Sample %d/%d passed",
+ attempt,
+ i + 1,
+ n,
+ )
+
+ attempt_results.append(optimize_context)
+
+ if gt_options.on_sample_result is not None:
+ try:
+ gt_options.on_sample_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_sample_result callback failed for sample %d",
+ attempt,
+ i + 1,
+ )
+
+ last_ctx = attempt_results[-1]
+
+ if all_passed:
+ logger.info(
+ "[GT Attempt %d] -> All %d samples passed — optimization succeeded",
+ attempt,
+ n,
+ )
+ self._last_run_succeeded = True
+ self._last_succeeded_context = last_ctx
+ self._safe_status_update("success", last_ctx, last_ctx.iteration)
+ if self._options.on_passing_result:
+ try:
+ self._options.on_passing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_passing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # We've hit max attempts for the batches, bail at this point
+ if attempt >= gt_options.max_attempts:
+ logger.warning(
+ "[GT Optimization] -> Failed after %d attempt(s) — not all samples passed",
+ attempt,
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", last_ctx, last_ctx.iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ # Append all N results to history so the variation generator has full context
+ # from all of the previous samples
+ self._history.extend(attempt_results)
+
+ logger.info(
+ "[GT Attempt %d] -> %d/%d samples failed — generating new variation",
+ attempt,
+ failed_count,
+ n,
+ )
+ try:
+ await self._generate_new_variation(last_ctx.iteration, last_ctx.current_variables)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> Variation generation failed", attempt
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", last_ctx, last_ctx.iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(last_ctx)
+ except Exception:
+ logger.exception(
+ "[GT Attempt %d] -> on_failing_result callback failed", attempt
+ )
+ return attempt_results
+
+ self._safe_status_update("turn completed", last_ctx, last_ctx.iteration)
+
+ # Every branch inside the while True loop returns explicitly (success, max-attempts
+ # exhaustion, or variation-generation failure). This line is structurally unreachable,
+ # but without it type checkers infer the return type as List[OptimizationContext] | None
+ # because they don't always treat `while True` as exhaustive. The RuntimeError makes
+ # the intent unambiguous and causes a loud failure if that invariant is ever broken.
+ raise RuntimeError("unreachable: ground truth loop exited without returning")
+
+ def _apply_new_variation_response(
+ self,
+ response_data: Dict[str, Any],
+ variation_ctx: OptimizationContext,
+ response_str: str,
+ iteration: int,
+ ) -> OptimizationContext:
+ """
+ Validate the parsed variation response, mutate instance state, and return
+ an updated OptimizationContext reflecting the new configuration.
+
+ Updates self._current_instructions, self._current_parameters, and
+ self._current_model in place so subsequent turns use the new configuration.
+
+ :param response_data: Parsed JSON dict from the LLM variation response
+ :param variation_ctx: The context that was sent to the LLM (used to carry history/iteration)
+ :param response_str: The raw response string (stored as completion_response)
+ :param iteration: Current iteration number for logging
+ :return: A new OptimizationContext populated with the updated configuration
+ """
+ validation_errors = validate_variation_response(response_data)
+ if validation_errors:
+ logger.debug(
+ "[Iteration %d] -> Variation response failed validation: %s. "
+ "Received fields: %s. Full response_data: %s",
+ iteration,
+ "; ".join(validation_errors),
+ list(response_data.keys()),
+ json.dumps(response_data, indent=2),
+ )
+ raise ValueError(
+ f"Variation response failed validation: {'; '.join(validation_errors)}. "
+ f"Received fields: {list(response_data.keys())}"
+ )
+
+ self._current_instructions = response_data["current_instructions"]
+
+ # Post-process: replace any leaked variable values back to {{key}} form.
+ # This is a deterministic safety net for when the LLM ignores the prompt
+ # instructions and hardcodes a concrete value (e.g. "user-123") instead
+ # of the placeholder ("{{user_id}}").
+ self._current_instructions, placeholder_warnings = restore_variable_placeholders(
+ self._current_instructions,
+ self._options.variable_choices,
+ )
+ for msg in placeholder_warnings:
+ logger.warning("[Iteration %d] -> %s", iteration, msg)
+
+ self._current_parameters = response_data["current_parameters"]
+
+ # Update model — it should always be provided since it's required in the schema
+ model_value = (
+ response_data.get("model", "").strip()
+ if isinstance(response_data.get("model"), str)
+ else response_data.get("model")
+ )
+ if not model_value:
+ logger.warning(
+ "[Iteration %d] -> Model field is empty or None in response, keeping current model %s",
+ iteration,
+ self._current_model,
+ )
+ elif model_value not in self._options.model_choices:
+ logger.warning(
+ "[Iteration %d] -> Model '%s' not in model_choices %s, keeping current model %s",
+ iteration,
+ model_value,
+ self._options.model_choices,
+ self._current_model,
+ )
+ else:
+ old_model = self._current_model
+ self._current_model = model_value
+
+ # Log regardless of whether we change the model so that logs
+ # are consistently structured
+ if old_model != self._current_model:
+ logger.info(
+ "[Iteration %d] -> Model updated from '%s' to '%s'",
+ iteration,
+ old_model,
+ self._current_model,
+ )
+ else:
+ logger.debug(
+ "[Iteration %d] -> Keeping model '%s'",
+ iteration,
+ self._current_model,
+ )
+
+ logger.debug(
+ "[Iteration %d] -> New variation generated: instructions='%s', model=%s, parameters=%s",
+ iteration,
+ self._current_instructions,
+ self._current_model,
+ self._current_parameters,
+ )
+
+ # Create a new context with the updated values for return
+ return OptimizationContext(
+ scores={},
+ completion_response=response_str,
+ current_instructions=self._current_instructions,
+ current_parameters=self._current_parameters.copy(),
+ current_variables=variation_ctx.current_variables,
+ current_model=self._current_model,
+ user_input=None,
+ history=variation_ctx.history,
+ iteration=variation_ctx.iteration,
+ )
+
+ async def _generate_new_variation(
+ self, iteration: int, variables: Dict[str, Any]
+ ) -> OptimizationContext:
+ """
+ Generate new variation for next iteration (auto-path).
+
+ Calls handle_agent_call to generate a new variation and updates current_instructions
+ and current_parameters based on the returned OptimizeContext.
+
+ :param iteration: The current iteration number for logging
+ :param variables: The variable set for this iteration, chosen once by the caller
+ """
+ logger.info("[Iteration %d] -> Generating new variation...", iteration)
+
+ # Create a context for status update before generating the variation
+ status_ctx = self._create_optimization_context(
+ iteration=iteration,
+ variables=variables,
+ )
+ self._safe_status_update("generating variation", status_ctx, iteration)
+
+ optimize_for_duration = _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ )
+ instructions = build_new_variation_prompt(
+ self._history,
+ self._options.judges,
+ self._current_model,
+ self._current_instructions,
+ self._current_parameters,
+ self._options.model_choices,
+ self._options.variable_choices,
+ self._initial_instructions,
+ optimize_for_duration=optimize_for_duration,
+ )
+
+ # Create a flat history list (without nested history) to avoid exponential growth
+ flat_history = [prev_ctx.copy_without_history() for prev_ctx in self._history]
+
+ # Create context for variation generation — low temperature for deterministic output.
+ variation_ctx = OptimizationContext(
+ scores={},
+ completion_response="",
+ current_instructions=instructions,
+ current_parameters={
+ "temperature": 0.1,
+ },
+ current_variables=variables,
+ current_model=self._current_model,
+ user_input=None,
+ history=tuple(flat_history),
+ iteration=len(self._history) + 1,
+ )
+
+ # Call handle_agent_call to generate new variation; expects a JSON string
+ # matching the structured output schema (current_instructions, current_parameters, model).
+ # Retry up to _MAX_VARIATION_RETRIES times to handle transient empty or unparseable
+ # responses (e.g. when the agent SDK returns the LLM's post-tool-call empty text
+ # instead of the tool result).
+ agent_config = self._build_agent_config_for_context(variation_ctx, skip_interpolation=True)
+ response_data = None
+ response_str = ""
+ for attempt in range(1, _MAX_VARIATION_RETRIES + 1):
+ result = self._options.handle_agent_call(
+ self._agent_key,
+ agent_config,
+ variation_ctx,
+ False,
+ )
+ variation_response: OptimizationResponse = await await_if_needed(result)
+ response_str = variation_response.output
+ try:
+ response_data = extract_json_from_response(response_str)
+ break
+ except ValueError:
+ if attempt == _MAX_VARIATION_RETRIES:
+ raise
+ logger.warning(
+ "[Iteration %d] -> Variation response empty or unparseable "
+ "(attempt %d/%d), retrying...",
+ iteration,
+ attempt,
+ _MAX_VARIATION_RETRIES,
+ )
+
+ assert response_data is not None # loop always raises or breaks with data
+ return self._apply_new_variation_response(
+ response_data, variation_ctx, response_str, iteration
+ )
+
+ async def optimize_from_config(
+ self, optimization_config_key: str, options: OptimizationFromConfigOptions
+ ) -> Any:
+ """Optimize an agent using a configuration fetched from the LaunchDarkly API.
+
+ The agent key, judge configuration, model choices, and other optimization
+ parameters are all sourced from the remote agent optimization config. The
+ caller only needs to provide the execution callbacks and evaluation contexts.
+
+ Iteration results are automatically persisted to the LaunchDarkly API so
+ the UI can display live run progress.
+
+ :param optimization_config_key: Key of the agent optimization config to fetch.
+ :param options: User-provided callbacks and evaluation contexts.
+ :return: Optimization result (OptimizationContext from the final iteration).
+ """
+ if not self._has_api_key:
+ raise ValueError(
+ "LAUNCHDARKLY_API_KEY is not set, so optimize_from_config is not available"
+ )
+
+ assert self._api_key is not None
+ api_client = LDApiClient(
+ self._api_key,
+ **({"base_url": options.base_url} if options.base_url else {}),
+ )
+ config = api_client.get_agent_optimization(options.project_key, optimization_config_key)
+
+ self._agent_key = config["aiConfigKey"]
+ optimization_key: str = config["key"]
+ run_id = str(uuid.uuid4())
+
+ model_configs: List[Dict[str, Any]] = []
+ try:
+ model_configs = api_client.get_model_configs(options.project_key)
+ except Exception as exc:
+ logger.debug("Could not pre-fetch model configs: %s", exc)
+
+ context = random.choice(options.context_choices)
+ # _get_agent_config calls _initialize_class_members_from_config internally;
+ # _run_optimization calls it again to reset history before the loop starts.
+ agent_config = await self._get_agent_config(self._agent_key, context)
+
+ optimization_options = self._build_options_from_config(
+ config, options, api_client, optimization_key, run_id, model_configs
+ )
+ if isinstance(optimization_options, GroundTruthOptimizationOptions):
+ result = await self._run_ground_truth_optimization(agent_config, optimization_options)
+ else:
+ result = await self._run_optimization(agent_config, optimization_options)
+
+ if options.auto_commit and self._last_run_succeeded and self._last_succeeded_context:
+ created_key = self._commit_variation(
+ self._last_succeeded_context,
+ project_key=options.project_key,
+ ai_config_key=config["aiConfigKey"],
+ output_key=options.output_key,
+ api_client=api_client,
+ model_configs=model_configs,
+ )
+ if created_key and self._last_optimization_result_id:
+ api_client.patch_agent_optimization_result(
+ options.project_key,
+ optimization_key,
+ self._last_optimization_result_id,
+ {"createdVariationKey": created_key},
+ )
+ return result
+
+ def _build_options_from_config(
+ self,
+ config: AgentOptimizationConfig,
+ options: OptimizationFromConfigOptions,
+ api_client: LDApiClient,
+ optimization_key: str,
+ run_id: str,
+ model_configs: Optional[List[Dict[str, Any]]] = None,
+ ) -> "Union[OptimizationOptions, GroundTruthOptimizationOptions]":
+ """Map a fetched AgentOptimization config + user options into the appropriate options type.
+
+ When the config contains groundTruthResponses, the three lists (groundTruthResponses,
+ userInputOptions, variableChoices) are zipped by index into GroundTruthSample objects
+ and a GroundTruthOptimizationOptions is returned. Otherwise a standard OptimizationOptions
+ is returned.
+
+ Acceptance statements and judge configs from the API are merged into a single
+ judges dict. An on_status_update closure is injected to persist each iteration
+ result to the LaunchDarkly API; any user-supplied on_status_update is chained
+ after the persistence call.
+
+ :param config: Validated AgentOptimizationConfig from the API.
+ :param options: User-provided options from optimize_from_config.
+ :param api_client: Initialised LDApiClient for result persistence.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param run_id: UUID that groups all result records for this run.
+ :param model_configs: Pre-fetched list of model config dicts for resolving modelConfigKey.
+ :return: OptimizationOptions or GroundTruthOptimizationOptions.
+ """
+ judges: Dict[str, OptimizationJudge] = {}
+
+ for i, stmt in enumerate(config["acceptanceStatements"]):
+ key = f"acceptance-statement-{i}"
+ judges[key] = OptimizationJudge(
+ threshold=float(stmt.get("threshold", 0.95)),
+ acceptance_statement=stmt["statement"],
+ )
+
+ for judge in config["judges"]:
+ judges[judge["key"]] = OptimizationJudge(
+ threshold=float(judge.get("threshold", 0.95)),
+ judge_key=judge["key"],
+ )
+
+ raw_ground_truth: List[str] = config.get("groundTruthResponses") or []
+ has_ground_truth = bool(raw_ground_truth)
+ if not judges and options.on_turn is None:
+ raise ValueError(
+ "The optimization config has no acceptance statements or judges, and no on_turn "
+ "callback was provided. At least one is required to evaluate optimization results."
+ )
+
+ project_key = options.project_key
+ config_version: int = config["version"]
+ _cached_model_configs: List[Dict[str, Any]] = list(model_configs or [])
+
+ # Maps logical iteration number → result record id. Each new main-loop
+ # iteration (plus the init iteration 0) POSTs a fresh record; subsequent
+ # status events for that same iteration PATCH the existing record.
+ _iteration_result_ids: Dict[int, str] = {}
+
+ # Validation phase tracking. When a candidate passes initial checks the
+ # SDK fires validation sub-iterations (val_iter = main_iter + 1, +2, …).
+ # These are internal cross-checks and should NOT create separate records;
+ # instead they are folded back into the parent main-loop iteration's record.
+ _in_validation_phase: bool = False
+ _validation_parent_iteration: int = -1
+
+ # Tracks the most recently opened (POSTed) iteration so we can close it
+ # with a RUNNING:COMPLETED patch when the next iteration begins. Without
+ # this, iterations that don't naturally receive a terminal event (e.g. the
+ # init iteration 0, or non-final GT samples) are left in a stale state.
+ _last_open_iteration: int = -1
+
+ def _resolve_model_config_key(model_name: str) -> str:
+ if not model_name:
+ return ""
+ match = _find_model_config(model_name, _cached_model_configs)
+ return match["key"] if match else model_name
+
+ def _persist_and_forward(
+ status: Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+ ],
+ ctx: OptimizationContext,
+ ) -> None:
+ nonlocal _in_validation_phase, _validation_parent_iteration, _last_open_iteration
+ # _safe_status_update (the caller) already wraps this entire function in
+ # a try/except, so errors here are caught and logged without aborting the run.
+ mapped = _OPTIMIZATION_STATUS_MAP.get(
+ status, {"status": "RUNNING", "activity": "PENDING"}
+ )
+ snapshot = ctx.copy_without_history()
+
+ # "validating" fires with the parent main-loop iteration's context, so
+ # we capture that number as the anchor for all subsequent validation events.
+ if status == "validating":
+ _in_validation_phase = True
+ _validation_parent_iteration = snapshot.iteration
+
+ # Any event whose ctx.iteration differs from the validation anchor is a
+ # validation sub-iteration; fold it back to the parent's record.
+ if _in_validation_phase and snapshot.iteration != _validation_parent_iteration:
+ logical_iteration = _validation_parent_iteration
+ else:
+ logical_iteration = snapshot.iteration
+
+ # When a new iteration begins (generating), close out whatever iteration
+ # was last open so it doesn't remain in a non-terminal state. This covers
+ # the init iteration (0 → 1) and GT batches where non-final samples never
+ # receive an explicit terminal event.
+ if (
+ status == "generating"
+ and _last_open_iteration >= 0
+ and logical_iteration != _last_open_iteration
+ ):
+ prev_result_id = _iteration_result_ids.get(_last_open_iteration)
+ if prev_result_id:
+ api_client.patch_agent_optimization_result(
+ project_key,
+ optimization_key,
+ prev_result_id,
+ {"status": "RUNNING", "activity": "COMPLETED"},
+ )
+ _last_open_iteration = -1
+
+ # Phase 1: POST to create the record on first encounter of each logical iteration.
+ if logical_iteration not in _iteration_result_ids:
+ post_payload: AgentOptimizationResultPost = {
+ "runId": run_id,
+ "agentOptimizationVersion": config_version,
+ "iteration": logical_iteration,
+ "instructions": snapshot.current_instructions,
+ }
+ if snapshot.current_parameters:
+ post_payload["parameters"] = snapshot.current_parameters
+ if snapshot.user_input:
+ post_payload["userInput"] = snapshot.user_input
+ result_id = api_client.post_agent_optimization_result(
+ project_key, optimization_key, post_payload
+ )
+ if result_id:
+ _iteration_result_ids[logical_iteration] = result_id
+ self._last_optimization_result_id = result_id
+ _last_open_iteration = logical_iteration
+
+ # Phase 2: PATCH the record with current status and available telemetry.
+ result_id = _iteration_result_ids.get(logical_iteration)
+ if result_id:
+ patch: AgentOptimizationResultPatch = {
+ "status": mapped["status"],
+ "activity": mapped["activity"],
+ }
+ if snapshot.completion_response:
+ patch["completionResponse"] = snapshot.completion_response
+ if snapshot.scores:
+ patch["scores"] = {
+ k: {
+ **v.to_json(),
+ **({"threshold": judges[k].threshold} if k in judges else {}),
+ }
+ for k, v in snapshot.scores.items()
+ }
+ if snapshot.duration_ms is not None:
+ patch["generationLatency"] = int(snapshot.duration_ms)
+ if snapshot.usage is not None:
+ patch["generationTokens"] = {
+ "total": snapshot.usage.total,
+ "input": snapshot.usage.input,
+ "output": snapshot.usage.output,
+ }
+ eval_latencies = {
+ k: v.duration_ms
+ for k, v in snapshot.scores.items()
+ if v.duration_ms is not None
+ }
+ if eval_latencies:
+ patch["evaluationLatencies"] = eval_latencies
+ eval_tokens = {
+ k: {"total": v.usage.total, "input": v.usage.input, "output": v.usage.output}
+ for k, v in snapshot.scores.items()
+ if v.usage is not None
+ }
+ if eval_tokens:
+ patch["evaluationTokens"] = eval_tokens
+ patch["variation"] = {
+ "instructions": snapshot.current_instructions,
+ "parameters": snapshot.current_parameters,
+ "modelConfigKey": _resolve_model_config_key(snapshot.current_model or ""),
+ }
+ api_client.patch_agent_optimization_result(
+ project_key, optimization_key, result_id, patch
+ )
+
+ # Reset tracking state after terminal events so the next main-loop
+ # attempt starts fresh.
+ if status in ("turn completed", "success", "failure"):
+ _in_validation_phase = False
+ _validation_parent_iteration = -1
+ _last_open_iteration = -1
+
+ if options.on_status_update:
+ try:
+ options.on_status_update(status, ctx)
+ except Exception:
+ logger.exception("User on_status_update callback failed for status=%s", status)
+
+ # If we have ground truth responses, we provide a different
+ # configuration options type that contains the bundled GroundTruthSamples
+ # so that the ultimate output is correctly formatted.
+ if has_ground_truth:
+ user_inputs: List[str] = config["userInputOptions"] or []
+ variable_choices_raw: List[Dict[str, Any]] = config["variableChoices"] or []
+
+ if len(raw_ground_truth) != len(user_inputs) or len(raw_ground_truth) != len(variable_choices_raw):
+ raise ValueError(
+ f"groundTruthResponses ({len(raw_ground_truth)}), userInputOptions "
+ f"({len(user_inputs)}), and variableChoices ({len(variable_choices_raw)}) "
+ "must all have the same length when groundTruthResponses is provided."
+ )
+
+ gt_samples = [
+ GroundTruthSample(
+ user_input=user_inputs[idx],
+ expected_response=raw_ground_truth[idx],
+ variables=variable_choices_raw[idx],
+ )
+ for idx in range(len(raw_ground_truth))
+ ]
+
+ return GroundTruthOptimizationOptions(
+ context_choices=options.context_choices,
+ ground_truth_responses=gt_samples,
+ max_attempts=config["maxAttempts"],
+ model_choices=[_strip_provider_prefix(m) for m in config["modelChoices"]],
+ judge_model=_strip_provider_prefix(config["judgeModel"]),
+ handle_agent_call=options.handle_agent_call,
+ handle_judge_call=options.handle_judge_call,
+ judges=judges or None,
+ on_turn=options.on_turn,
+ on_sample_result=options.on_sample_result,
+ on_passing_result=options.on_passing_result,
+ on_failing_result=options.on_failing_result,
+ on_status_update=_persist_and_forward,
+ token_limit=config.get("tokenLimit"),
+ )
+
+ variable_choices: List[Dict[str, Any]] = config["variableChoices"] or [{}]
+ user_input_options: Optional[List[str]] = config["userInputOptions"] or None
+
+ return OptimizationOptions(
+ context_choices=options.context_choices,
+ max_attempts=config["maxAttempts"],
+ model_choices=[_strip_provider_prefix(m) for m in config["modelChoices"]],
+ judge_model=_strip_provider_prefix(config["judgeModel"]),
+ variable_choices=variable_choices,
+ handle_agent_call=options.handle_agent_call,
+ handle_judge_call=options.handle_judge_call,
+ judges=judges or None,
+ user_input_options=user_input_options,
+ on_turn=options.on_turn,
+ on_passing_result=options.on_passing_result,
+ on_failing_result=options.on_failing_result,
+ on_status_update=_persist_and_forward,
+ token_limit=config.get("tokenLimit"),
+ )
+
+ async def _execute_agent_turn(
+ self,
+ optimize_context: OptimizationContext,
+ iteration: int,
+ expected_response: Optional[str] = None,
+ ) -> OptimizationContext:
+ """
+ Run the agent call and judge scoring for one optimization turn.
+
+ Returns a new OptimizationContext with completion_response and scores
+ populated, leaving the input context unchanged. Variables are read from
+ optimize_context.current_variables and interpolated into the agent's
+ instructions at call time so the stored template is never mutated.
+
+ :param optimize_context: The context for this turn (instructions, model, history, etc.)
+ :param iteration: Current iteration number for logging and status callbacks
+ :param expected_response: Optional ground truth expected response. When provided,
+ injected into judge context so judges can score actual vs. expected.
+ :return: Updated context with completion_response and scores filled in
+ """
+ logger.info(
+ "[Iteration %d] -> Calling agent (model=%s)...",
+ iteration,
+ optimize_context.current_model,
+ )
+ try:
+ _agent_start = time.monotonic()
+ result = self._options.handle_agent_call(
+ self._agent_key,
+ self._build_agent_config_for_context(optimize_context),
+ optimize_context,
+ False,
+ )
+ agent_response: OptimizationResponse = await await_if_needed(result)
+ agent_duration_ms = (time.monotonic() - _agent_start) * 1000
+ completion_response = agent_response.output
+ logger.debug(
+ "[Iteration %d] -> Agent response: %.300s%s",
+ iteration,
+ completion_response,
+ "..." if len(completion_response) > 300 else "",
+ )
+ except Exception:
+ logger.exception("[Iteration %d] -> Agent call failed", iteration)
+ if self._options.on_failing_result:
+ # Guard the user callback so a callback error can't mask the original agent failure.
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_failing_result callback failed", iteration
+ )
+ raise
+
+ scores: Dict[str, JudgeResult] = {}
+ if self._options.judges:
+ agent_tools = self._extract_agent_tools(optimize_context.current_parameters)
+ scores = await self._call_judges(
+ completion_response,
+ iteration,
+ user_input=optimize_context.user_input or "",
+ variables=optimize_context.current_variables,
+ agent_tools=agent_tools,
+ expected_response=expected_response,
+ agent_duration_ms=agent_duration_ms,
+ )
+
+ # Build the fully-populated result context before firing the evaluating event so
+ # the PATCH includes scores, generationLatency, and completionResponse. This is
+ # particularly important for non-final GT samples which receive no further status
+ # events — without this, those fields would never be written to their API records.
+ result_ctx = dataclasses.replace(
+ optimize_context,
+ completion_response=completion_response,
+ scores=scores,
+ duration_ms=agent_duration_ms,
+ usage=agent_response.usage,
+ )
+
+ if self._options.judges:
+ self._safe_status_update("evaluating", result_ctx, iteration)
+
+ return result_ctx
+
+ def _accumulate_tokens(self, optimize_context: OptimizationContext) -> None:
+ """Add token usage from a completed turn to the running total.
+
+ Sums the agent's token usage and each judge's token usage from the given
+ context and adds them to ``_total_token_usage``.
+
+ :param optimize_context: The completed turn context containing usage data.
+ """
+ if optimize_context.usage is not None:
+ self._total_token_usage += optimize_context.usage.total or 0
+ for judge_result in optimize_context.scores.values():
+ if judge_result.usage is not None:
+ self._total_token_usage += judge_result.usage.total or 0
+
+ def _is_token_limit_exceeded(self) -> bool:
+ """Return True if the accumulated token usage has met or exceeded the configured limit.
+
+ Returns False when no token limit is set so callers can use this as a
+ simple guard without needing to check for ``None`` themselves.
+
+ :return: True if token limit is set and ``_total_token_usage >= token_limit``.
+ """
+ limit: Optional[int] = getattr(self._options, "token_limit", None)
+ return limit is not None and self._total_token_usage >= limit
+
+ def _evaluate_response(self, optimize_context: OptimizationContext) -> bool:
+ """
+ Determine whether the current iteration's scores meet all judge thresholds.
+
+ A judge without an explicit threshold is treated as requiring a perfect
+ score of 1.0. Returns True immediately when no judges are configured.
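+
+ For example, a judge configured with threshold ``0.9`` fails the turn when its
+ reported score is ``0.85``.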
+
+ :param optimize_context: The completed turn context containing scores
+ :return: True if all judges passed, False if any judge failed or is missing
+ """
+ if not self._options.judges:
+ return True
+
+ for judge_key, optimization_judge in self._options.judges.items():
+ result = optimize_context.scores.get(judge_key)
+ if result is None:
+ return False
+ threshold = (
+ optimization_judge.threshold
+ if optimization_judge.threshold is not None
+ else 1.0
+ )
+ if result.score < threshold:
+ return False
+
+ return True
+
+ def _evaluate_duration(self, optimize_context: OptimizationContext) -> bool:
+ """
+ Check whether the candidate's duration meets the improvement target vs. the baseline.
+
+ The baseline is history[0].duration_ms — the very first completed iteration,
+ representing the original unoptimized configuration's latency. The candidate
+ must be at least _DURATION_TOLERANCE faster (default: 20% improvement).
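+ For example, with the default 20% improvement target, a 1,000 ms baseline means
+ the candidate must complete in under 800 ms to pass.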
+
+ Returns True without blocking when no baseline is available (empty history or
+ history[0].duration_ms is None), or when the candidate's duration_ms was not
+ captured. This avoids penalising configurations when timing data is missing.
+
+ :param optimize_context: The completed turn context containing duration_ms
+ :return: True if the duration requirement is met or cannot be checked
+ """
+ if not self._history or self._history[0].duration_ms is None:
+ return True
+ if optimize_context.duration_ms is None:
+ return True
+ baseline = self._history[0].duration_ms
+ passed = optimize_context.duration_ms < baseline * _DURATION_TOLERANCE
+ if not passed:
+ logger.warning(
+ "[Iteration %d] -> Duration check failed: %.0fms >= baseline %.0fms * %.0f%% (%.0fms)",
+ optimize_context.iteration,
+ optimize_context.duration_ms,
+ baseline,
+ _DURATION_TOLERANCE * 100,
+ baseline * _DURATION_TOLERANCE,
+ )
+ return passed
+
+ def _handle_success(
+ self, optimize_context: OptimizationContext, iteration: int
+ ) -> Any:
+ """
+ Handle a successful optimization result.
+
+ Fires the "success" status update, invokes on_passing_result if set,
+ and returns the winning OptimizationContext.
+
+ :param optimize_context: The context from the passing iteration
+ :param iteration: Current iteration number for logging
+ :return: The passing OptimizationContext
+ """
+ logger.info("[Iteration %d] -> Optimization succeeded", iteration)
+ self._last_run_succeeded = True
+ self._last_succeeded_context = optimize_context
+ self._safe_status_update("success", optimize_context, iteration)
+ if self._options.on_passing_result:
+ try:
+ self._options.on_passing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_passing_result callback failed", iteration
+ )
+ return optimize_context
+
+ def _handle_failure(
+ self, optimize_context: OptimizationContext, iteration: int
+ ) -> Any:
+ """
+ Handle a failed optimization result (max attempts reached).
+
+ Fires the "failure" status update, invokes on_failing_result if set,
+ and returns the last OptimizationContext.
+
+ :param optimize_context: The context from the final iteration
+ :param iteration: Current iteration number for logging
+ :return: The last OptimizationContext
+ """
+ logger.warning(
+ "[Optimization] -> Optimization failed after %d attempt(s)", iteration
+ )
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._safe_status_update("failure", optimize_context, iteration)
+ if self._options.on_failing_result:
+ try:
+ self._options.on_failing_result(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_failing_result callback failed", iteration
+ )
+ return optimize_context
+
+ def _commit_variation(
+ self,
+ optimize_context: OptimizationContext,
+ project_key: str,
+ ai_config_key: str,
+ output_key: Optional[str],
+ api_client: Optional[LDApiClient] = None,
+ base_url: Optional[str] = None,
+ model_configs: Optional[List[Dict[str, Any]]] = None,
+ ) -> str:
+ """Commit the winning optimization context as a new AI Config variation.
+
+ Determines a unique variation key (from output_key or an auto-generated
+ adjective-noun slug), checks for collisions against existing variation keys,
+ appends a random hex suffix if the key is taken, then POSTs the new variation
+ with up to 2 retries before raising on persistent failure.
+
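+ For example (hypothetical key), an ``output_key`` of ``"support-bot"`` that already
+ exists on the AI Config would instead be committed as ``"support-bot-3fa2"``.
+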
+ :param optimize_context: The winning OptimizationContext.
+ :param project_key: LaunchDarkly project key.
+ :param ai_config_key: The AI Config key to add the variation to.
+ :param output_key: Desired variation key/name; auto-generated if None.
+ :param api_client: Optional pre-built LDApiClient to reuse (e.g. from optimize_from_config).
+ :param base_url: Optional base URL override forwarded to a newly created LDApiClient.
+ :param model_configs: Optional pre-fetched model config dicts used to resolve the modelConfigKey.
+ :return: The created variation key.
+ :raises LDApiError: If the variation cannot be created after retries.
+ """
+ if api_client is None:
+ assert self._api_key is not None
+ api_client = LDApiClient(
+ self._api_key,
+ **({"base_url": base_url} if base_url else {}),
+ )
+
+ candidate = output_key if output_key else generate_slug()
+
+ try:
+ ai_config = api_client.get_ai_config(project_key, ai_config_key)
+ existing_keys = {v["key"] for v in ai_config.get("variations", [])}
+ except Exception:
+ logger.warning(
+ "Could not fetch AI Config to check variation key collisions; proceeding with candidate key."
+ )
+ existing_keys = set()
+
+ if candidate in existing_keys:
+ suffix = "%04x" % random.randint(0, 0xFFFF)
+ candidate = f"{candidate}-{suffix}"
+ logger.info("Variation key collision detected; using '%s' instead.", candidate)
+
+ model_name = optimize_context.current_model or ""
+ model_config_key = model_name # fallback if lookup fails
+ try:
+ configs_to_search = (
+ model_configs if model_configs is not None else api_client.get_model_configs(project_key)
+ )
+ match = _find_model_config(model_name, configs_to_search)
+ if match:
+ model_config_key = match["key"]
+ else:
+ logger.debug(
+ "No model config found for model id '%s'; using model name as key.", model_name
+ )
+ except Exception as exc:
+ logger.debug("Could not fetch model configs to resolve modelConfigKey: %s", exc)
+
+ payload: Dict[str, Any] = {
+ "key": candidate,
+ "name": candidate,
+ "mode": "agent",
+ "instructions": optimize_context.current_instructions,
+ "modelConfigKey": model_config_key,
+ }
+ if self._initial_tool_keys:
+ payload["toolKeys"] = list(self._initial_tool_keys)
+
+ last_exc: Optional[Exception] = None
+ for attempt in range(1, 4):
+ try:
+ api_client.create_ai_config_variation(project_key, ai_config_key, payload)
+ logger.info(
+ "Auto-committed variation '%s' to AI Config '%s'.", candidate, ai_config_key
+ )
+ return candidate
+ except Exception as exc:
+ last_exc = exc
+ if attempt < 3:
+ logger.warning(
+ "Failed to create variation (attempt %d/3): %s. Retrying...", attempt, exc
+ )
+
+ raise last_exc # type: ignore[misc]
+
+ async def _run_validation_phase(
+ self,
+ passing_context: OptimizationContext,
+ iteration: int,
+ ) -> "tuple[bool, OptimizationContext]":
+ """Run additional evaluations against distinct random samples to confirm a passing candidate.
+
+ Mirrors the sampling logic of _run_optimization: each validation turn selects
+ a user_input from user_input_options (when provided) AND a variables dict from
+ variable_choices independently. The validation count and distinctness guarantee
+ are driven by whichever pool is larger — user_input_options when present,
+ otherwise variable_choices — ensuring validation turns use inputs the passing
+ turn did not.
+
+ If all samples pass, the caller should proceed to _handle_success. If any
+ sample fails, the caller should treat the result as a normal failed attempt
+ and generate a new variation.
+
+ Validation turns are numbered sequentially in logs (iteration + 1, + 2, …)
+ for readability, but this numbering is internal only — the caller's iteration
+ counter is never advanced by this method so validation samples do not consume
+ the attempt budget.
+
+ :param passing_context: The OptimizationContext from the turn that just passed.
+ :param iteration: The iteration number of the passing turn; used as the
+ base for validation log line numbering only.
+ :return: Tuple of (all_passed, last_context).
+ """
+ options = self._options
+
+ # Determine the primary axis of distinctness and the pool size.
+ # user_input_options drives the count when present; otherwise variable_choices does.
+ # In either case, both user_input and variables are selected per-sample just as
+ # they are in the main optimization loop.
+ if options.user_input_options:
+ primary_pool: List[str] = options.user_input_options
+ passing_input: Optional[str] = passing_context.user_input
+ remaining_inputs: List[str] = [
+ inp for inp in primary_pool if inp != passing_input
+ ]
+ pool_size = len(primary_pool)
+ else:
+ var_pool: List[Dict[str, Any]] = options.variable_choices
+ passing_vars: Dict[str, Any] = passing_context.current_variables
+ remaining_vars: List[Dict[str, Any]] = [
+ v for v in var_pool if v != passing_vars
+ ]
+ pool_size = len(var_pool)
+
+ validation_count = _compute_validation_count(pool_size)
+ # Cap to the number of distinct remaining items, but never below 1.
+ # When no distinct items remain (e.g. only one variable choice), draw a
+ # single sample from the full pool, repeating the passing item, so at
+ # least one validation run always executes.
+ if options.user_input_options:
+ available = len(remaining_inputs)
+ else:
+ available = len(remaining_vars)
+
+ allow_repeats = available == 0
+ if allow_repeats:
+ validation_count = 1
+ else:
+ validation_count = min(validation_count, available)
+
+ logger.info(
+ "[Iteration %d] -> Candidate passed — entering validation phase (%d sample(s)%s)",
+ iteration,
+ validation_count,
+ ", repeated draw" if allow_repeats else "",
+ )
+ self._safe_status_update("validating", passing_context, iteration)
+
+ # Sample primary items, falling back to the full pool when no distinct
+ # items remain so the minimum-1 floor is always satisfied.
+ if options.user_input_options:
+ source_inputs = primary_pool if allow_repeats else remaining_inputs
+ sampled_inputs: List[str] = random.sample(source_inputs, validation_count)
+ else:
+ source_vars = var_pool if allow_repeats else remaining_vars
+ sampled_vars: List[Dict[str, Any]] = random.sample(source_vars, validation_count)
+
+ last_ctx = passing_context
+ for i in range(validation_count):
+ val_iter = iteration + i + 1
+ if options.user_input_options:
+ user_input: Optional[str] = sampled_inputs[i]
+ variables: Dict[str, Any] = random.choice(options.variable_choices)
+ else:
+ user_input = None
+ variables = sampled_vars[i]
+
+ logger.info(
+ "[Validation %d/%d] -> Running sample (iteration=%d)",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+
+ val_ctx = self._create_optimization_context(
+ iteration=val_iter,
+ user_input=user_input,
+ variables=variables,
+ )
+ self._safe_status_update("generating", val_ctx, val_iter)
+ val_ctx = await self._execute_agent_turn(val_ctx, val_iter)
+ self._accumulate_tokens(val_ctx)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[Validation %d/%d] -> Token limit exceeded (total=%d)",
+ i + 1,
+ validation_count,
+ self._total_token_usage,
+ )
+ return False, val_ctx
+
+ if options.on_turn is not None:
+ try:
+ sample_passed = options.on_turn(val_ctx)
+ except Exception:
+ logger.exception(
+ "[Validation %d/%d] -> on_turn evaluation failed", i + 1, validation_count
+ )
+ sample_passed = False
+ else:
+ sample_passed = self._evaluate_response(val_ctx)
+
+ if sample_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ sample_passed = self._evaluate_duration(val_ctx)
+
+ last_ctx = val_ctx
+
+ if not sample_passed:
+ logger.info(
+ "[Validation %d/%d] -> FAILED (iteration=%d) — candidate rejected",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+ return False, last_ctx
+
+ logger.debug(
+ "[Validation %d/%d] -> passed (iteration=%d)",
+ i + 1,
+ validation_count,
+ val_iter,
+ )
+
+ logger.info(
+ "[Iteration %d] -> All %d validation sample(s) passed — candidate confirmed",
+ iteration,
+ validation_count,
+ )
+ return True, last_ctx
+
+ async def _run_optimization(
+ self, agent_config: AIAgentConfig, options: OptimizationOptions
+ ) -> Any:
+ """Run an optimization on the given agent with the given options.
+
+ :param agent_config: Agent configuration from LaunchDarkly.
+ :param options: Optimization options.
+ :return: Optimization result.
+ """
+ self._options = options
+ self._agent_config = agent_config
+ self._last_run_succeeded = False
+ self._last_succeeded_context = None
+ self._last_optimization_result_id = None
+ self._total_token_usage = 0
+ self._initialize_class_members_from_config(agent_config)
+
+ # If the LD flag doesn't carry a model name, seed from the first model choice
+ # so agent calls never receive an empty model string.
+ if not self._current_model and options.model_choices:
+ self._current_model = options.model_choices[0]
+ logger.debug(
+ "[Optimization] -> No model in agent config; defaulting to first model choice: %s",
+ self._current_model,
+ )
+
+ initial_context = self._create_optimization_context(
+ iteration=0,
+ variables=random.choice(options.variable_choices),
+ )
+
+ self._safe_status_update("init", initial_context, 0)
+
+ iteration = 0
+ while True:
+ iteration += 1
+ logger.info(
+ "[Iteration %d] -> Starting (attempt %d/%d, model=%s)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ self._current_model,
+ )
+ user_input = None
+ if self._options.user_input_options:
+ user_input = random.choice(self._options.user_input_options)
+ if user_input:
+ logger.debug("[Iteration %d] -> User input: %s", iteration, user_input)
+
+ optimize_context = self._create_optimization_context(
+ iteration=iteration,
+ user_input=user_input,
+ # Pick a fresh variable set each turn for call-time interpolation
+ variables=random.choice(self._options.variable_choices),
+ )
+
+ self._safe_status_update("generating", optimize_context, iteration)
+ optimize_context = await self._execute_agent_turn(
+ optimize_context, iteration
+ )
+ self._accumulate_tokens(optimize_context)
+ if self._is_token_limit_exceeded():
+ logger.error(
+ "[Iteration %d] -> Token limit exceeded (total=%d)",
+ iteration,
+ self._total_token_usage,
+ )
+ return self._handle_failure(optimize_context, iteration)
+
+ # Manual path: on_turn callback gives caller full control over pass/fail
+ if self._options.on_turn is not None:
+ try:
+ on_turn_result = self._options.on_turn(optimize_context)
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> on_turn evaluation failed", iteration
+ )
+ on_turn_result = False
+
+ initial_passed = on_turn_result
+ if initial_passed:
+ logger.info(
+ "[Iteration %d] -> on_turn returned True — turn passed",
+ iteration,
+ )
+ else:
+ # Auto-path: judge scores determine pass/fail via _evaluate_response
+ initial_passed = self._evaluate_response(optimize_context)
+ if initial_passed:
+ logger.info(
+ "[Iteration %d] -> All judges passed — turn succeeded",
+ iteration,
+ )
+
+ if initial_passed and _acceptance_criteria_implies_duration_optimization(
+ self._options.judges
+ ):
+ initial_passed = self._evaluate_duration(optimize_context)
+
+ if initial_passed:
+ all_valid, last_ctx = await self._run_validation_phase(
+ optimize_context, iteration
+ )
+ if all_valid:
+ return self._handle_success(optimize_context, iteration)
+ if self._is_token_limit_exceeded():
+ return self._handle_failure(last_ctx, iteration)
+ # Validation failed — treat as a normal failed attempt.
+ # Use optimize_context (the main iteration) for terminal API events so
+ # the persisted record's completionResponse and userInput stay aligned.
+ # last_ctx (the failing validation run) goes into history so the
+ # variation generator can see what went wrong.
+ logger.info(
+ "[Iteration %d] -> Validation failed — generating new variation (attempt %d/%d)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ if iteration >= self._options.max_attempts:
+ return self._handle_failure(optimize_context, iteration)
+ self._history.append(last_ctx)
+ try:
+ await self._generate_new_variation(
+ iteration, last_ctx.current_variables
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> variation generation failed", iteration
+ )
+ return self._handle_failure(optimize_context, iteration)
+ self._safe_status_update("turn completed", optimize_context, iteration)
+ continue
+
+ # Initial turn failed
+ if self._options.on_turn is not None:
+ logger.info(
+ "[Iteration %d] -> on_turn returned False — turn failed (attempt %d/%d)",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ else:
+ logger.info(
+ "[Iteration %d] -> One or more judges failed (attempt %d/%d) — generating new variation",
+ iteration,
+ iteration,
+ self._options.max_attempts,
+ )
+ if iteration >= self._options.max_attempts:
+ return self._handle_failure(optimize_context, iteration)
+ self._history.append(optimize_context)
+ try:
+ await self._generate_new_variation(
+ iteration, optimize_context.current_variables
+ )
+ except Exception:
+ logger.exception(
+ "[Iteration %d] -> variation generation failed", iteration
+ )
+ return self._handle_failure(optimize_context, iteration)
+ self._safe_status_update("turn completed", optimize_context, iteration)
+ continue
diff --git a/packages/optimization/src/ldai_optimizer/dataclasses.py b/packages/optimization/src/ldai_optimizer/dataclasses.py
new file mode 100644
index 00000000..9e52e046
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/dataclasses.py
@@ -0,0 +1,488 @@
+"""Dataclasses for the LaunchDarkly AI optimization package."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import (
+ Any,
+ Awaitable,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Sequence,
+ Union,
+)
+
+from ldai import AIAgentConfig
+from ldai.models import LDMessage, ModelConfig
+from ldai.tracker import TokenUsage
+from ldclient import Context
+from typing_extensions import Protocol
+
+
+@dataclass
+class OptimizationResponse:
+ """The return value for both ``handle_agent_call`` and ``handle_judge_call`` callbacks.
+
+ :param output: The text output produced by the LLM.
+ :param usage: Optional token usage for this call. Set fields to 0 or omit entirely
+ if token tracking is not available for the framework being used.
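+
+ A minimal handler return (``completion_text`` is illustrative)::
+
+ return OptimizationResponse(output=completion_text, usage=None)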
+ """
+
+ output: str
+ usage: Optional[TokenUsage] = None
+
+
+@dataclass
+class JudgeResult:
+ """Result from a judge evaluation."""
+
+ score: float
+ rationale: Optional[str] = None
+ duration_ms: Optional[float] = None
+ usage: Optional[TokenUsage] = None
+
+ def to_json(self) -> Dict[str, Any]:
+ """
+ Convert the judge result to a JSON-serializable dictionary.
+
+ :return: Dictionary representation of the judge result that can be serialized with json.dumps()
+ """
+ result: Dict[str, Any] = {
+ "score": self.score,
+ "rationale": self.rationale,
+ "duration_ms": self.duration_ms,
+ }
+ if self.usage is not None:
+ result["usage"] = {
+ "total": self.usage.total,
+ "input": self.usage.input,
+ "output": self.usage.output,
+ }
+ return result
+
+
+@dataclass
+class ToolDefinition:
+ """
+ Generic tool definition for enforcing structured output from LLM responses.
+
+ This tool can be used with any LLM provider to ensure responses conform to
+ a specific JSON schema. The tool takes the LLM's response and returns
+ parsed and validated data according to the input_schema.
+ """
+
+ name: str
+ description: str
+ input_schema: Dict[str, Any] # JSON schema defining the expected output structure
+ type: Literal["function"] = "function"
+
+ def to_dict(self) -> Dict[str, Any]:
+ """
+ Convert the tool definition to a dictionary format compatible with LLM APIs.
+
+ :return: Dictionary representation of the tool
+ """
+ return {
+ "name": self.name,
+ "description": self.description,
+ "input_schema": self.input_schema,
+ "type": self.type,
+ }
+
+ @classmethod
+ def from_dict(cls, data: Dict[str, Any]) -> "ToolDefinition":
+ """
+ Construct a ToolDefinition from a plain dictionary.
+
+ :param data: Dictionary with at least a ``name`` key; ``description`` and
+ ``input_schema`` default to empty values when absent.
+ :return: A new ToolDefinition instance
+ """
+ return cls(
+ name=data.get("name", ""),
+ description=data.get("description", ""),
+ input_schema=data.get("input_schema", {}),
+ type=data.get("type", "function"),
+ )
+
+
+class LLMCallConfig(Protocol):
+ """Structural protocol satisfied by both ``AIAgentConfig`` and ``AIJudgeCallConfig``.
+
+ Use this as the config parameter type when you want a single handler function
+ that can be passed to both ``handle_agent_call`` and ``handle_judge_call``::
+
+ async def handle_llm_call(
+ key: str,
+ config: LLMCallConfig,
+ context: LLMCallContext,
+ is_evaluation: bool,
+ ) -> OptimizationResponse:
+ model_name = config.model.name if config.model else "gpt-4o"
+ instructions = config.instructions or ""
+ tools = config.model.get_parameter("tools") if config.model else []
+ ...
+
+ OptimizationOptions(
+ handle_agent_call=handle_llm_call,
+ handle_judge_call=handle_llm_call,
+ ...
+ )
+ """
+
+ @property
+ def key(self) -> str: ...
+ @property
+ def model(self) -> Optional[ModelConfig]: ...
+ @property
+ def instructions(self) -> Optional[str]: ...
+
+
+class LLMCallContext(Protocol):
+ """Structural protocol satisfied by both ``OptimizationContext`` and ``OptimizationJudgeContext``.
+
+ Use alongside ``LLMCallConfig`` when writing a single handler for both
+ ``handle_agent_call`` and ``handle_judge_call``.
+ """
+
+ @property
+ def user_input(self) -> Optional[str]: ...
+ @property
+ def current_variables(self) -> Dict[str, Any]: ...
+
+
+@dataclass
+class AIJudgeCallConfig:
+ """
+ Configuration passed to ``handle_judge_call``.
+
+ Carries everything needed to run a judge in either paradigm:
+
+ * **Completions path** — pass ``messages`` directly to ``chat.completions.create``.
+ The full system + user turn sequence is already assembled and interpolated.
+ * **Agents path** — use ``instructions`` as the system prompt and
+ ``OptimizationJudgeContext.user_input`` as the ``Runner.run`` input.
+
+ Both fields are always populated, regardless of whether the judge comes from a
+ LaunchDarkly flag (config judge) or an inline acceptance statement.
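+
+ A completions-path sketch (assumes an OpenAI-style client; names are illustrative)::
+
+ response = openai_client.chat.completions.create(
+ model=config.model.name,
+ messages=[{"role": m.role, "content": m.content} for m in config.messages],
+ )
+ return OptimizationResponse(output=response.choices[0].message.content or "")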
+ """
+
+ key: str
+ model: ModelConfig
+ instructions: str
+ messages: List[LDMessage]
+
+
+@dataclass
+class Message:
+ """A message in a conversation."""
+
+ role: Literal["system", "user", "assistant"]
+ content: str
+
+ def to_dict(self) -> Dict[str, str]:
+ """Convert message to dictionary format."""
+ return {
+ "role": self.role,
+ "content": self.content,
+ }
+
+
+@dataclass
+class OptimizationJudge:
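+ """A single pass/fail criterion applied to each agent response.
+
+ Set ``judge_key`` to reference a LaunchDarkly config judge, or ``acceptance_statement``
+ to supply an inline natural-language criterion. ``threshold`` is the minimum score a
+ response must reach for this judge to pass.
+ """
+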
+ threshold: float
+ judge_key: Optional[str] = None
+ acceptance_statement: Optional[str] = None
+
+
+@dataclass
+class OptimizationContext:
+ """Context for a single optimization iteration."""
+
+ scores: Dict[str, JudgeResult] # the scores and rationales from the judges, if configured
+ completion_response: str
+ current_instructions: str
+ current_parameters: Dict[str, Any]
+ # variable set chosen for this iteration; interpolated into instructions at call time
+ current_variables: Dict[str, Any]
+ current_model: Optional[str] = None # the current model being used
+ user_input: Optional[str] = None # the user input message for this iteration
+ history: Sequence[OptimizationContext] = field(
+ default_factory=list
+ ) # previous context items
+ iteration: int = 0 # current iteration number
+ duration_ms: Optional[float] = None # wall-clock time for the agent call in milliseconds
+ usage: Optional[TokenUsage] = None # token usage reported by the agent for this iteration
+
+ def copy_without_history(self) -> OptimizationContext:
+ """
+ Create a copy of this context without the history field (for flattening).
+
+ :return: A new OptimizationContext with the same data but empty history
+ """
+ return OptimizationContext(
+ scores=self.scores,
+ completion_response=self.completion_response,
+ current_instructions=self.current_instructions,
+ current_parameters=self.current_parameters,
+ current_variables=self.current_variables,
+ current_model=self.current_model,
+ user_input=self.user_input,
+ history=(), # Empty history to keep it flat
+ iteration=self.iteration,
+ duration_ms=self.duration_ms,
+ usage=self.usage,
+ )
+
+ def to_json(self) -> Dict[str, Any]:
+ """
+ Convert the optimization context to a JSON-serializable dictionary.
+
+ :return: Dictionary representation of the context that can be serialized with json.dumps()
+ """
+ scores_dict = {}
+ for judge_key, judge_result in self.scores.items():
+ scores_dict[judge_key] = judge_result.to_json()
+
+ history_list = [ctx.to_json() for ctx in self.history]
+
+ result: Dict[str, Any] = {
+ "scores": scores_dict,
+ "completion_response": self.completion_response,
+ "current_instructions": self.current_instructions,
+ "current_parameters": self.current_parameters,
+ "current_model": self.current_model,
+ "user_input": self.user_input,
+ "current_variables": self.current_variables,
+ "history": history_list,
+ "iteration": self.iteration,
+ "duration_ms": self.duration_ms,
+ }
+ if self.usage is not None:
+ result["usage"] = {
+ "total": self.usage.total,
+ "input": self.usage.input,
+ "output": self.usage.output,
+ }
+ return result
+
+
+@dataclass
+class OptimizationJudgeContext:
+ """Context for a single judge evaluation turn."""
+
+ user_input: str # the agent response being evaluated
+ current_variables: Dict[str, Any] = field(default_factory=dict) # variable set used during agent generation
+
+
+# Shared callback type aliases used by both OptimizationOptions and
+# OptimizationFromConfigOptions to avoid duplicating the full signatures.
+# Placed here so all referenced types (OptimizationContext, AIJudgeCallConfig,
+# OptimizationJudgeContext) are already defined above.
+#
+# Both aliases use the LLMCallConfig / LLMCallContext Protocols so callers can
+# write a single handler for both agent and judge calls. Handlers typed with
+# the concrete types (AIAgentConfig / AIJudgeCallConfig) continue to work
+# because those types structurally satisfy the Protocols.
+HandleAgentCall = Union[
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], OptimizationResponse],
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], Awaitable[OptimizationResponse]],
+]
+HandleJudgeCall = Union[
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], OptimizationResponse],
+ Callable[[str, LLMCallConfig, LLMCallContext, bool], Awaitable[OptimizationResponse]],
+]
+
+_StatusLiteral = Literal[
+ "init",
+ "generating",
+ "evaluating",
+ "generating variation",
+ "validating",
+ "turn completed",
+ "success",
+ "failure",
+]
+
+
+@dataclass
+class OptimizationOptions:
+ """Options for agent optimization."""
+
+ # Configuration - Required
+ max_attempts: int
+ model_choices: List[str] # model ids the LLM can choose from, 1 min required
+ judge_model: str # which model to use as judge; this should remain consistent
+ variable_choices: List[
+ Dict[str, Any]
+ ] # choices of interpolated variables to be chosen at random per turn, 1 min required
+ # Actual agent/completion (judge) calls - Required
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ # Criteria for pass/fail - Optional
+ user_input_options: Optional[List[str]] = (
+ None # optional list of user input messages to randomly select from
+ )
+ judges: Optional[Dict[str, OptimizationJudge]] = (
+ None # auto-judges for this model that the LLM will use
+ )
+ on_turn: Optional[Callable[[OptimizationContext], bool]] = (
+ None # if you want manual control of pass/fail
+ )
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ # Auto-commit - Optional
+ auto_commit: bool = False
+ project_key: Optional[str] = None # required when auto_commit=True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
+ base_url: Optional[str] = None # override to target a non-default LD instance
+ on_passing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_failing_result: Optional[Callable[[OptimizationContext], None]] = None
+ # called to provide status updates during the optimization flow
+ on_status_update: Optional[Callable[[_StatusLiteral, OptimizationContext], None]] = None
+ token_limit: Optional[int] = None # stop the run when total token usage reaches this value
+
+ def __post_init__(self):
+ """Validate required options."""
+ if len(self.model_choices) < 1:
+ raise ValueError("model_choices must have at least 1 model")
+ if self.judges is None and self.on_turn is None:
+ raise ValueError("Either judges or on_turn must be provided")
+ if self.judge_model is None:
+ raise ValueError("judge_model must be provided")
+
+
+@dataclass
+class GroundTruthSample:
+ """A single ground truth evaluation sample for use with optimize_from_ground_truth_options.
+
+ Each sample ties together the user input, expected response, and variable set for one
+ evaluation. Samples are evaluated in order; the optimization only passes if all samples
+ pass their judges in the same attempt.
+
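+ An illustrative sample (values are examples only)::
+
+ GroundTruthSample(
+ user_input="What is your refund policy?",
+ expected_response="Refunds are available within 30 days of purchase.",
+ variables={"product": "Pro Plan"},
+ )
+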
+ :param user_input: The user message to send to the agent for this evaluation.
+ :param expected_response: The ideal response the agent should produce. Injected into
+ judge context so judges can score actual vs. expected.
+ :param variables: Variable set interpolated into the agent instructions for this sample.
+ Defaults to an empty dict if no placeholders are used.
+ """
+
+ user_input: str
+ expected_response: str
+ variables: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class GroundTruthOptimizationOptions:
+ """Options for optimize_from_ground_truth_options.
+
+ Mirrors OptimizationOptions but replaces variable_choices / user_input_options with
+ ground_truth_responses. Each GroundTruthSample bundles the user input, expected
+ response, and variable set for one evaluation. All N samples must pass their judges
+ in the same attempt for the optimization to succeed.
+
+ :param context_choices: One or more LD evaluation contexts to use.
+ :param ground_truth_responses: Ordered list of ground truth samples to evaluate.
+ At least 1 required. All samples share the same instructions and model being optimized.
+ :param max_attempts: Maximum number of variation attempts before the run is marked failed.
+ :param model_choices: Model IDs the variation generator may select from. At least 1 required.
+ :param judge_model: Model used for judge evaluation. Should remain consistent across attempts.
+ :param handle_agent_call: Callback that invokes the agent and returns its response.
+ :param handle_judge_call: Callback that invokes a judge LLM and returns its response.
+ :param judges: Auto-judges (config judges and/or acceptance statements) to score each response.
+ :param on_turn: Optional manual pass/fail callback applied per sample; skips judge scoring when provided.
+ :param on_sample_result: Called with each sample's OptimizationContext as results arrive,
+ before the overall pass/fail decision is made for the attempt.
+ :param on_passing_result: Called once with the last context when all N samples pass.
+ :param on_failing_result: Called once with the last context when max attempts are exhausted.
+ :param on_status_update: Called on each status transition during the run.
+ """
+
+ ground_truth_responses: List[GroundTruthSample]
+ max_attempts: int
+ model_choices: List[str]
+ judge_model: str
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ judges: Optional[Dict[str, OptimizationJudge]] = None
+ on_turn: Optional[Callable[[OptimizationContext], bool]] = None
+ on_sample_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_passing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_failing_result: Optional[Callable[[OptimizationContext], None]] = None
+ on_status_update: Optional[
+ Callable[
+ [
+ _StatusLiteral,
+ OptimizationContext,
+ ],
+ None,
+ ]
+ ] = None
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ # Auto-commit - Optional
+ auto_commit: bool = False
+ project_key: Optional[str] = None # required when auto_commit=True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
+ base_url: Optional[str] = None # override to target a non-default LD instance
+ token_limit: Optional[int] = None # stop the run when total token usage reaches this value
+
+ def __post_init__(self):
+ """Validate required options."""
+ if len(self.model_choices) < 1:
+ raise ValueError("model_choices must have at least 1 model")
+ if len(self.ground_truth_responses) < 1:
+ raise ValueError("ground_truth_responses must have at least 1 sample")
+ if self.judges is None and self.on_turn is None:
+ raise ValueError("Either judges or on_turn must be provided")
+
+
+@dataclass
+class OptimizationFromConfigOptions:
+ """User-provided options for optimize_from_config.
+
+ Fields that come from the LaunchDarkly API (max_attempts, model_choices,
+ judge_model, variable_choices, user_input_options, judges) are omitted here
+ and sourced from the fetched agent optimization config instead.
+
+ :param project_key: LaunchDarkly project key used to build API paths.
+ :param context_choices: One or more LD evaluation contexts to use.
+ :param handle_agent_call: Callback that invokes the agent and returns its response.
+ :param handle_judge_call: Callback that invokes a judge and returns its response.
+ :param on_turn: Optional manual pass/fail callback; when provided, judge scoring is skipped.
+ :param on_sample_result: Ground truth path only. Called with each sample's
+ OptimizationContext as results arrive during a ground truth run.
+ :param on_passing_result: Called with the winning OptimizationContext on success.
+ :param on_failing_result: Called with the final OptimizationContext on failure.
+ :param on_status_update: Called on each status transition; chained after the
+ automatic result-persistence POST so it always runs after the record is saved.
+ :param base_url: Base URL of the LaunchDarkly instance. Defaults to
+ https://app.launchdarkly.com. Override to target a staging instance.
+ """
+
+ project_key: str
+ handle_agent_call: HandleAgentCall
+ # Optional; falls back to handle_agent_call when omitted (both share the same signature)
+ handle_judge_call: Optional[HandleJudgeCall] = None
+ on_turn: Optional[Callable[["OptimizationContext"], bool]] = None
+ on_sample_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_passing_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_failing_result: Optional[Callable[["OptimizationContext"], None]] = None
+ on_status_update: Optional[Callable[[_StatusLiteral, "OptimizationContext"], None]] = None
+ # Context - Optional; defaults to a single anonymous context
+ context_choices: List[Context] = field(
+ default_factory=lambda: [Context.builder("anonymous").anonymous(True).build()]
+ )
+ base_url: Optional[str] = None
+ # Auto-commit defaults to True for config-driven runs; set False to disable
+ auto_commit: bool = True
+ output_key: Optional[str] = None # variation key/name; auto-generated if omitted
diff --git a/packages/optimization/src/ldai_optimizer/ld_api_client.py b/packages/optimization/src/ldai_optimizer/ld_api_client.py
new file mode 100644
index 00000000..3efa725d
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/ld_api_client.py
@@ -0,0 +1,380 @@
+"""Internal LaunchDarkly REST API client for the optimization package."""
+
+import json
+import logging
+import time
+import urllib.error
+import urllib.request
+from typing import Any, Dict, List, Optional, TypedDict
+
+from ldai_optimizer.util import RedactionFilter
+
+logger = logging.getLogger(__name__)
+logger.addFilter(RedactionFilter())
+
+_BASE_URL = "https://app.launchdarkly.com"
+
+_MAX_RETRIES = 3
+_INITIAL_BACKOFF = 1.0 # seconds; doubles on each attempt (1s, 2s, 4s)
+
+# Status codes that warrant a retry. Everything else (including 400, 401, 403,
+# 404) is a permanent or auth failure — retrying would not help and could lead
+# to corrupted optimization results if some requests succeed and others fail.
+_RETRYABLE_STATUS_CODES = frozenset({429, 500, 502, 503, 504})
+
+
+class LDApiError(Exception):
+ """Raised when the LaunchDarkly REST API returns an error or is unreachable.
+
+ Attributes:
+ status_code: HTTP status code, or None for network-level failures.
+ path: The API path that was requested.
+ """
+
+ def __init__(self, message: str, status_code: Optional[int] = None, path: str = "") -> None:
+ super().__init__(message)
+ self.status_code = status_code
+ self.path = path
+
+
+_HTTP_ERROR_HINTS: Dict[int, str] = {
+ 401: "Authentication failed — check that LAUNCHDARKLY_API_KEY is set correctly.",
+ 403: "Authorization failed — check that your API key has the required permissions.",
+ 404: "Resource not found — check that the project key and optimization config key are correct.",
+ 429: "Rate limit exceeded — too many requests to the LaunchDarkly API.",
+}
+
+_REQUIRED_STRING_FIELDS = ("id", "key", "aiConfigKey", "judgeModel")
+_REQUIRED_INT_FIELDS = ("maxAttempts", "version", "createdAt")
+_REQUIRED_LIST_FIELDS = (
+ "modelChoices",
+ "variableChoices",
+ "acceptanceStatements",
+ "judges",
+ "userInputOptions",
+)
+
+
+# ---------------------------------------------------------------------------
+# API response shapes
+# ---------------------------------------------------------------------------
+
+class _AcceptanceStatement(TypedDict):
+ statement: str
+ threshold: float
+
+
+class _AgentOptimizationJudge(TypedDict):
+ key: str
+ threshold: float
+
+
+class _AgentOptimizationConfigRequired(TypedDict):
+ id: str
+ key: str
+ aiConfigKey: str
+ maxAttempts: int
+ modelChoices: List[str]
+ judgeModel: str
+ variableChoices: List[Dict[str, Any]]
+ acceptanceStatements: List[_AcceptanceStatement]
+ judges: List[_AgentOptimizationJudge]
+ userInputOptions: List[str]
+ version: int
+ createdAt: int
+
+
+class AgentOptimizationConfig(_AgentOptimizationConfigRequired, total=False):
+ """Typed representation of the AgentOptimization API response."""
+
+ groundTruthResponses: List[str]
+ metricKey: str
+ tokenLimit: int
+
+
+# ---------------------------------------------------------------------------
+# Result payload shapes
+# ---------------------------------------------------------------------------
+
+class _AgentOptimizationResultPostRequired(TypedDict):
+ runId: str
+ agentOptimizationVersion: int
+ iteration: int
+ instructions: str
+
+
+class AgentOptimizationResultPost(_AgentOptimizationResultPostRequired, total=False):
+ """Payload for POST /agent-optimizations/{key}/results — creates a new result record."""
+
+ userInput: str
+ parameters: Dict[str, Any]
+
+
+class AgentOptimizationResultPatch(TypedDict, total=False):
+ """Payload for PATCH /agent-optimizations/{key}/results/{id} — updates a result record."""
+
+ status: str
+ activity: str
+ completionResponse: str
+ scores: Dict[str, Any]
+ generationLatency: int
+ generationTokens: Dict[str, int]
+ evaluationLatencies: Dict[str, float]
+ evaluationTokens: Dict[str, Dict[str, int]]
+ variation: Dict[str, Any]
+ createdVariationKey: str
+
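+
+# Typical lifecycle (illustrative; the variable names are placeholders): each
+# iteration creates one result record via POST and then updates it in place:
+#
+#     result_id = client.post_agent_optimization_result(project_key, opt_key, post_payload)
+#     if result_id is not None:
+#         client.patch_agent_optimization_result(project_key, opt_key, result_id, patch_payload)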
+
+# ---------------------------------------------------------------------------
+# Validation
+# ---------------------------------------------------------------------------
+
+def _parse_agent_optimization(data: Any) -> AgentOptimizationConfig:
+ """Validate and cast a raw API response dict to AgentOptimizationConfig.
+
+ :param data: Parsed JSON response from the GET endpoint.
+ :return: The same dict narrowed to AgentOptimizationConfig.
+ :raises ValueError: If required fields are missing or have wrong types.
+ """
+ if not isinstance(data, dict):
+ raise ValueError(
+ f"Expected a JSON object from AgentOptimization API, got {type(data).__name__}"
+ )
+
+ errors: List[str] = []
+
+ for field in _REQUIRED_STRING_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], str):
+ errors.append(
+ f"field '{field}' must be a string, got {type(data[field]).__name__}"
+ )
+
+ for field in _REQUIRED_INT_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], int):
+ errors.append(
+ f"field '{field}' must be an integer, got {type(data[field]).__name__}"
+ )
+
+ for field in _REQUIRED_LIST_FIELDS:
+ if field not in data:
+ errors.append(f"missing required field '{field}'")
+ elif not isinstance(data[field], list):
+ errors.append(
+ f"field '{field}' must be a list, got {type(data[field]).__name__}"
+ )
+
+ if not errors and "modelChoices" in data and isinstance(data["modelChoices"], list):
+ if len(data["modelChoices"]) < 1:
+ errors.append("field 'modelChoices' must have at least 1 entry")
+
+ if errors:
+ raise ValueError(
+ f"Invalid AgentOptimization response: {'; '.join(errors)}"
+ )
+
+ return data # type: ignore[return-value]
+
+
+# ---------------------------------------------------------------------------
+# Client
+# ---------------------------------------------------------------------------
+
+class LDApiClient:
+ """Thin wrapper around the LaunchDarkly REST API for agent-optimization endpoints."""
+
+ def __init__(self, api_key: str, base_url: str = _BASE_URL) -> None:
+ self._api_key = api_key
+ self._base_url = base_url.rstrip("/")
+
+ def __repr__(self) -> str:
+ return f"LDApiClient(base_url={self._base_url!r})"
+
+ def _auth_headers(self) -> Dict[str, str]:
+ return {"Authorization": self._api_key}
+
+ def _request(
+ self,
+ method: str,
+ path: str,
+ body: Any = None,
+ extra_headers: Optional[Dict[str, str]] = None,
+ ) -> Any:
+ """Execute an HTTP request with automatic retry and exponential backoff.
+
+ Retries up to ``_MAX_RETRIES`` times for transient errors (429, 5xx,
+ network failures) with exponential backoff starting at ``_INITIAL_BACKOFF``
+ seconds. Non-retryable status codes (400, 401, 403, 404, …) are raised
+ immediately without retrying.
+
+ :param method: HTTP method (GET, POST, PATCH, …).
+ :param path: API path, appended to ``self._base_url``.
+ :param body: Optional request body; serialised to JSON.
+ :param extra_headers: Additional headers merged with the auth header.
+ :raises LDApiError: After all retry attempts are exhausted, or immediately
+ for non-retryable status codes.
+ """
+ url = f"{self._base_url}{path}"
+ headers = {**self._auth_headers(), **(extra_headers or {})}
+ data = json.dumps(body).encode() if body is not None else None
+ if data is not None:
+ headers["Content-Type"] = "application/json"
+
+ last_exc: Optional[LDApiError] = None
+ for attempt in range(_MAX_RETRIES + 1):
+ req = urllib.request.Request(url, data=data, headers=headers, method=method)
+ try:
+ with urllib.request.urlopen(req) as resp:
+ raw = resp.read()
+ return json.loads(raw) if raw else None
+ except urllib.error.HTTPError as exc:
+ body_excerpt = exc.read(500).decode(errors="replace")
+ hint = _HTTP_ERROR_HINTS.get(exc.code, "")
+ detail = f"{hint} (API response: {body_excerpt})" if hint else f"API response: {body_excerpt}"
+ api_error = LDApiError(
+ f"LaunchDarkly API error {exc.code} {exc.msg} for {method} {path}. {detail}",
+ status_code=exc.code,
+ path=path,
+ )
+ if exc.code not in _RETRYABLE_STATUS_CODES:
+ raise api_error from exc
+ last_exc = api_error
+ except urllib.error.URLError as exc:
+ last_exc = LDApiError(
+ f"Could not reach LaunchDarkly API at {url}: {exc.reason}. "
+ "Check your network connection and the base_url setting.",
+ path=path,
+ )
+
+ if attempt < _MAX_RETRIES:
+ delay = _INITIAL_BACKOFF * (2 ** attempt)
+ logger.warning(
+ "LaunchDarkly API request failed (attempt %d/%d, path=%s), "
+ "retrying in %.1fs: %s",
+ attempt + 1,
+ _MAX_RETRIES + 1,
+ path,
+ delay,
+ last_exc,
+ )
+ time.sleep(delay)
+
+ assert last_exc is not None
+ raise last_exc
+
+ def get_model_configs(self, project_key: str) -> List[Dict[str, Any]]:
+ """Fetch all AI model configs for a project.
+
+ :param project_key: LaunchDarkly project key.
+ :return: List of model config dicts (each has at minimum ``id`` and ``key``).
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/model-configs"
+ result = self._request("GET", path, extra_headers={"LD-API-Version": "beta"})
+ return result if isinstance(result, list) else []
+
+ def get_ai_config(self, project_key: str, config_key: str) -> Any:
+ """Fetch a single AI Config by key, including its variations.
+
+ :param project_key: LaunchDarkly project key.
+ :param config_key: Key of the AI Config (aiConfigKey).
+ :return: Raw AI Config dict with a ``variations`` list.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/{config_key}"
+ return self._request("GET", path, extra_headers={"LD-API-Version": "beta"})
+
+ def create_ai_config_variation(
+ self, project_key: str, config_key: str, payload: Dict[str, Any]
+ ) -> Any:
+ """Create a new variation on an AI Config.
+
+ :param project_key: LaunchDarkly project key.
+ :param config_key: Key of the AI Config.
+ :param payload: Variation payload (key, name, mode, instructions, model).
+ :return: Created AIConfigVariation dict.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ """
+ path = f"/api/v2/projects/{project_key}/ai-configs/{config_key}/variations"
+ return self._request("POST", path, body=payload, extra_headers={"LD-API-Version": "beta"})
+
+ def get_agent_optimization(
+ self, project_key: str, optimization_key: str
+ ) -> AgentOptimizationConfig:
+ """Fetch and validate a single agent optimization config by key.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: Key of the agent optimization config.
+ :return: Validated AgentOptimizationConfig.
+ :raises LDApiError: On non-200 HTTP responses or network errors.
+ :raises ValueError: If the response is missing required fields.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}"
+ raw = self._request("GET", path)
+ return _parse_agent_optimization(raw)
+
+ def post_agent_optimization_result(
+ self, project_key: str, optimization_key: str, payload: AgentOptimizationResultPost
+ ) -> Optional[str]:
+ """Create an iteration result record for the given optimization run.
+
+ Errors are caught and logged rather than raised so that persistence
+ failures never abort an in-progress optimization run.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param payload: POST payload for this iteration.
+ :return: The ``id`` of the newly created result record, or None on failure.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}/results"
+ try:
+ result = self._request("POST", path, body=payload)
+ return result.get("id") if isinstance(result, dict) else None
+ except LDApiError as exc:
+ logger.debug(
+ "Failed to persist optimization result (optimization_key=%s, iteration=%s): %s",
+ optimization_key,
+ payload.get("iteration"),
+ exc,
+ )
+ return None
+ except Exception as exc:
+ logger.debug(
+ "Unexpected error persisting optimization result (optimization_key=%s, iteration=%s): %s",
+ optimization_key,
+ payload.get("iteration"),
+ exc,
+ )
+ return None
+
+ def patch_agent_optimization_result(
+ self, project_key: str, optimization_key: str, result_id: str, payload: AgentOptimizationResultPatch
+ ) -> None:
+ """Update an existing iteration result record.
+
+ Errors are caught and logged rather than raised so that persistence
+ failures never abort an in-progress optimization run.
+
+ :param project_key: LaunchDarkly project key.
+ :param optimization_key: String key of the parent agent_optimization record.
+ :param result_id: ID of the result record to update.
+ :param payload: PATCH payload with fields to update.
+ """
+ path = f"/api/v2/projects/{project_key}/agent-optimizations/{optimization_key}/results/{result_id}"
+ try:
+ self._request("PATCH", path, body=payload)
+ except LDApiError as exc:
+ logger.debug(
+ "Failed to update optimization result (result_id=%s): %s",
+ result_id,
+ exc,
+ )
+ except Exception as exc:
+ logger.debug(
+ "Unexpected error updating optimization result (result_id=%s): %s",
+ result_id,
+ exc,
+ )
diff --git a/packages/optimization/src/ldai_optimizer/prompts.py b/packages/optimization/src/ldai_optimizer/prompts.py
new file mode 100644
index 00000000..c699cb19
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/prompts.py
@@ -0,0 +1,557 @@
+"""Prompt-building functions for LaunchDarkly AI optimization."""
+
+import re
+from typing import Any, Dict, List, Optional
+
+from ldai_optimizer.dataclasses import (
+ OptimizationContext,
+ OptimizationJudge,
+)
+
+_DURATION_KEYWORDS = re.compile(
+ r"\b(fast|faster|quickly|quick|latency|low-latency|duration|response\s+time|"
+ r"time\s+to\s+respond|milliseconds|performant|snappy|efficient|seconds)\b|"
+ r"(? bool:
+ """Return True if any judge acceptance statement implies a latency optimization goal.
+
+ Scans each judge's acceptance_statement for latency-related keywords. The
+ check is case-insensitive. Returns False when judges is None or no judge
+ carries an acceptance statement.
+
+ :param judges: Judge configuration dict from OptimizationOptions, or None.
+ :return: True if duration optimization should be applied.
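+
+ Example (illustrative)::
+
+ judges = {"latency": OptimizationJudge(threshold=0.8,
+ acceptance_statement="Responses must be fast and accurate.")}
+ _acceptance_criteria_implies_duration_optimization(judges)  # True ("fast")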
+ """
+ if not judges:
+ return False
+ for judge in judges.values():
+ if judge.acceptance_statement and _DURATION_KEYWORDS.search(
+ judge.acceptance_statement
+ ):
+ return True
+ return False
+
+
+def build_message_history_text(
+ history: List[OptimizationContext],
+ input_text: str,
+ reasoning_history: str,
+ current_user_input: str,
+) -> str:
+ """
+ Build a formatted message-history string for use as a judge template variable.
+
+ Combines the current instructions (system text), the conversation turns
+ recorded in history, the current turn's user question, and the accumulated
+ reasoning/score history.
+
+ :param history: All previous OptimizationContexts, oldest first
+ :param input_text: Current system instructions (may be empty string)
+ :param reasoning_history: Pre-formatted string from build_reasoning_history
+ :param current_user_input: The user question for the turn being evaluated.
+ Must be passed explicitly because the current turn is not yet in
+ history when the judge runs.
+ :return: Combined string to substitute into the judge's message_history variable
+ """
+ turn_messages = []
+ for ctx in history:
+ if ctx.user_input:
+ turn_messages.append(f"User: {ctx.user_input}")
+ if ctx.completion_response:
+ turn_messages.append(f"Assistant: {ctx.completion_response}")
+
+ # Include the current turn's question so judges see what was actually asked
+ turn_messages.append(f"User: {current_user_input}")
+
+ parts = []
+ if input_text:
+ parts.append(f"System: {input_text}")
+ if turn_messages:
+ parts.append("\n".join(turn_messages))
+ if reasoning_history:
+ parts.append(f"Evaluation history:\n{reasoning_history}")
+
+ return "\n\n".join(parts)
+
+
+def build_reasoning_history(history: List[OptimizationContext]) -> str:
+ """
+ Build a formatted string of reasoning from previous iterations.
+
+ :param history: All previous OptimizationContexts, oldest first
+ :return: Formatted string containing reasoning history
+ """
+ if not history:
+ return ""
+
+ reasoning_parts = []
+ for i, prev_ctx in enumerate(history, 1):
+ if prev_ctx.scores:
+ reasoning_parts.append(f"## Iteration {i} Judge Evaluations:")
+ for judge_key, result in prev_ctx.scores.items():
+ reasoning_parts.append(f"- {judge_key}: Score {result.score}")
+ if result.rationale:
+ reasoning_parts.append(f" Reasoning: {result.rationale}")
+ reasoning_parts.append("")
+
+ return "\n".join(reasoning_parts)
+
+
+def build_new_variation_prompt(
+ history: List[OptimizationContext],
+ judges: Optional[Dict[str, OptimizationJudge]],
+ current_model: Optional[str],
+ current_instructions: str,
+ current_parameters: Dict[str, Any],
+ model_choices: List[str],
+ variable_choices: List[Dict[str, Any]],
+ initial_instructions: str,
+ optimize_for_duration: bool = False,
+) -> str:
+ """
+ Build the LLM prompt for generating an improved agent configuration.
+
+ Constructs a detailed instruction string based on the full optimization
+ history, including all previous configurations, completion results, and
+ judge scores. When history is empty (first variation attempt), asks the
+ LLM to improve the current config without evaluation feedback.
+
+ :param history: All previous OptimizationContexts, oldest first. Empty on the first attempt.
+ :param judges: Judge configuration dict from OptimizationOptions
+ :param current_model: The model currently in use
+ :param current_instructions: The current agent instructions template
+ :param current_parameters: The current model parameters dict
+ :param model_choices: List of model IDs the LLM may select from
+ :param variable_choices: List of variable dicts (used to derive placeholder names)
+ :param initial_instructions: The original unmodified instructions template
+ :param optimize_for_duration: When True, appends a duration optimization section
+ instructing the LLM to prefer faster models and simpler instructions.
+ :return: The assembled prompt string
+ """
+ sections = [
+ variation_prompt_preamble(),
+ variation_prompt_acceptance_criteria(judges),
+ variation_prompt_configuration(
+ history, current_model, current_instructions, current_parameters
+ ),
+ variation_prompt_feedback(history, judges),
+ variation_prompt_overfit_warning(history),
+ variation_prompt_improvement_instructions(
+ history, model_choices, variable_choices, initial_instructions
+ ),
+ variation_prompt_duration_optimization(model_choices) if optimize_for_duration else "",
+ ]
+
+ return "\n\n".join(s for s in sections if s)
+
+
+def variation_prompt_preamble() -> str:
+ """Static opening section for the variation generation prompt."""
+ return "\n".join(
+ [
+ "You are an assistant that helps improve agent configurations through iterative optimization.",
+ "",
+ "Your task is to generate improved agent instructions and parameters based on the feedback provided.",
+ "The feedback you provide should guide the LLM to improve the agent instructions "
+ "for all possible use cases, not one concrete case.",
+ "For example, if the feedback is that the agent is not returning the correct records, "
+ "you should improve the agent instructions to return the correct records for all possible use cases. "
+ "Not just the one concrete case that was provided in the feedback.",
+ "When changing the instructions, keep the original intent in mind "
+ "when it comes to things like the use of variables and placeholders.",
+ "If the original instructions were to use a placeholder like {{id}}, "
+ "you should keep the placeholder in the new instructions, not replace it with the actual value. "
+ "This is the case for all parameterized values (all parameters should appear in each new variation).",
+ "IMPORTANT: placeholder names are fixed identifiers (e.g. {{user_id}}, {{trip_purpose}}) — "
+ "never substitute the runtime value of a variable in place of its name. "
+ "For example, if the variable key is 'user_id' and its current value is 'user-125', "
+ "the placeholder MUST be written as {{user_id}}, NOT {{user-125}}.",
+ "Pay particular attention to the instructions regarding tools and the rules for variables.",
+ ]
+ )
+
+
+def variation_prompt_acceptance_criteria(
+ judges: Optional[Dict[str, OptimizationJudge]],
+) -> str:
+ """
+ Acceptance criteria section of the variation prompt.
+
+ Collects every acceptance statement defined across all judges and renders
+ them as an emphatic block so the LLM understands exactly what the improved
+ configuration must achieve. Returns an empty string when no judges carry
+ acceptance statements (e.g. all judges are config-key-only judges).
+ """
+ if not judges:
+ return ""
+
+ statements = [
+ (key, judge.acceptance_statement)
+ for key, judge in judges.items()
+ if judge.acceptance_statement
+ ]
+
+ if not statements:
+ return ""
+
+ lines = [
+ "## *** ACCEPTANCE CRITERIA (MUST BE MET) ***",
+ "The improved configuration MUST produce responses that satisfy ALL of the following criteria.",
+ "These criteria are non-negotiable — every generated variation will be evaluated against them.",
+ "All variables must be used in the new instructions.",
+ "",
+ ]
+ for key, statement in statements:
+ lines.append(f"- [{key}] {statement}")
+
+ lines += [
+ "",
+ "When writing new instructions, explicitly address each criterion above.",
+ "Do not sacrifice any criterion in favour of another.",
+ ]
+
+ return "\n".join(lines)
+
+
+def variation_prompt_configuration(
+ history: List[OptimizationContext],
+ current_model: Optional[str],
+ current_instructions: str,
+ current_parameters: Dict[str, Any],
+) -> str:
+ """
+ Configuration section of the variation prompt.
+
+ Shows the most recent iteration's model, instructions, parameters,
+ user input, and completion response when history is available, or the
+ current state on the first attempt.
+ """
+ if history:
+ previous_ctx = history[-1]
+ lines = [
+ "## Most Recent Configuration:",
+ f"Model: {previous_ctx.current_model}",
+ f"Instructions: {previous_ctx.current_instructions}",
+ f"Parameters: {previous_ctx.current_parameters}",
+ "",
+ "## Most Recent Result:",
+ ]
+ if previous_ctx.user_input:
+ lines.append(f"User question: {previous_ctx.user_input}")
+ lines.append(f"Agent response: {previous_ctx.completion_response}")
+ if previous_ctx.duration_ms is not None:
+ lines.append(f"Agent duration: {previous_ctx.duration_ms:.0f}ms")
+ return "\n".join(lines)
+ else:
+ return "\n".join(
+ [
+ "## Current Configuration:",
+ f"Model: {current_model}",
+ f"Instructions: {current_instructions}",
+ f"Parameters: {current_parameters}",
+ ]
+ )
+
+
+def variation_prompt_feedback(
+ history: List[OptimizationContext],
+ judges: Optional[Dict[str, OptimizationJudge]],
+) -> str:
+ """
+ Evaluation feedback section of the variation prompt.
+
+ Renders all previous iterations' scores in chronological order so the
+ LLM can observe trends across the full optimization run. Returns an
+ empty string when no history exists or no iteration has scores, so it
+ is filtered out of the assembled prompt entirely.
+ """
+ iterations_with_scores = [ctx for ctx in history if ctx.scores]
+ if not iterations_with_scores:
+ return ""
+
+ lines = ["## Evaluation History:"]
+ for ctx in iterations_with_scores:
+ lines.append(f"\n### Iteration {ctx.iteration}:")
+ if ctx.user_input:
+ lines.append(f"User question: {ctx.user_input}")
+ for judge_key, result in ctx.scores.items():
+ optimization_judge = judges.get(judge_key) if judges else None
+ if optimization_judge:
+ score = result.score
+ if optimization_judge.threshold is not None:
+ passed = score >= optimization_judge.threshold
+ status = "PASSED" if passed else "FAILED"
+ feedback_line = (
+ f"- {judge_key}: Score {score:.3f}"
+ f" (threshold: {optimization_judge.threshold}) - {status}"
+ )
+ else:
+ passed = score >= 1.0
+ status = "PASSED" if passed else "FAILED"
+ feedback_line = f"- {judge_key}: {status}"
+ if result.rationale:
+ feedback_line += f"\n Reasoning: {result.rationale}"
+ lines.append(feedback_line)
+ if ctx.duration_ms is not None:
+ lines.append(f"Agent duration: {ctx.duration_ms:.0f}ms")
+ return "\n".join(lines)
+
+
+def variation_prompt_overfit_warning(history: List[OptimizationContext]) -> str:
+ """
+ Overfitting warning section of the variation prompt.
+
+ Combines a general reminder to write generalizable instructions with
+ specific values from the most recent iteration so the LLM knows exactly
+ what concrete values to avoid embedding literally. Returns an empty string
+ when there is no history (first attempt, no feedback to overfit to).
+
+ :param history: All previous OptimizationContexts, oldest first.
+ :return: Overfitting warning block, or empty string if history is empty.
+ """
+ if not history:
+ return ""
+
+ recent = history[-1]
+
+ lines = [
+ "## *** OVERFITTING WARNING ***",
+ "Do NOT hardcode specific values from the evaluation feedback into the instructions.",
+ "The configuration must generalise to all possible inputs, not just the ones seen so far.",
+ "Write instructions that treat the values below as examples of a broader class of inputs,",
+ "not as literals to match.",
+ "",
+ "The following specific values appeared in the most recent iteration "
+ "— do not embed them literally:",
+ ]
+
+ if recent.user_input:
+ lines.append(f'- User input: "{recent.user_input}"')
+
+ if recent.current_variables:
+ for k, v in recent.current_variables.items():
+ lines.append(f' - placeholder {{{{{k}}}}}, current value: "{v}"')
+ lines.append(
+ " (These are the placeholder NAMES mapped to their current VALUES"
+ " — never use a value as a placeholder name)"
+ )
+
+ lines += [
+ "",
+ "If you find yourself writing instructions that only work for the exact values above,",
+ "step back and generalise: what rule, pattern, or intent do those values represent?",
+ "Write instructions that satisfy that rule for any valid input.",
+ ]
+
+ return "\n".join(lines)
+
+
+def variation_prompt_improvement_instructions(
+ history: List[OptimizationContext],
+ model_choices: List[str],
+ variable_choices: List[Dict[str, Any]],
+ initial_instructions: str,
+) -> str:
+ """
+ Improvement instructions section of the variation prompt.
+
+ Includes model-choice guidance, prompt variable rules, and the required
+ output format schema. When history is non-empty, adds feedback-driven
+ improvement directives.
+ """
+ model_instructions = "\n".join(
+ [
+ "You may also choose to change the model if you believe that the current model is "
+ "not performing well or a different model would be better suited for the task. "
+ f"Here are the models you may choose from: {model_choices}. "
+ "You must always return a model property, even if it's the same as the current model.",
+ "When suggesting a new model, you should provide a rationale for why you believe "
+ "the new model would be better suited for the task.",
+ ]
+ )
+
+ # Build a per-variable table: key → sorted list of unique example values
+ # collected across all variable_choices entries.
+ examples: Dict[str, List[str]] = {}
+ for choice in variable_choices:
+ for k, v in choice.items():
+ examples.setdefault(k, [])
+ sv = str(v)
+ if sv not in examples[k]:
+ examples[k].append(sv)
+
+ table_lines = [
+ "## Prompt Variables:",
+ "These are the ONLY valid placeholder names. "
+ "Use them exactly as shown (case-sensitive) with {{...}} syntax:",
+ "",
+ ]
+ for k in sorted(examples.keys()):
+ vals = ", ".join(f'"{v}"' for v in examples[k])
+ table_lines.append(f" - {{{{{k}}}}} (example values: {vals})")
+
+ # Build concrete bad/good counterexamples using the actual keys and values
+ # so the LLM cannot mistake a runtime value for a placeholder name.
+ first_key = sorted(examples.keys())[0] if examples else "variable_name"
+ first_val = examples[first_key][0] if examples.get(first_key) else "some-value"
+ table_lines += [
+ "",
+ "IMPORTANT: The names above are the KEYS — they are the placeholder names.",
+ "The values listed are only runtime examples that will be substituted at call time.",
+ "NEVER use a runtime value as a placeholder name.",
+ f'BAD: "...{{{{...{first_val}...}}}}..." '
+ f'— "{first_val}" is a runtime value, not a placeholder name',
+ f'GOOD: "...{{{{{first_key}}}}}..." '
+ f'— "{first_key}" is the correct placeholder name',
+ ]
+
+ variable_instructions = "\n".join(
+ table_lines
+ + [
+ "",
+ "If a placeholder is not present in the current instructions, "
+ "include it where logically appropriate.",
+ "Here are the original instructions so that you can see how the "
+ "placeholders are used and which are available:",
+ "\nSTART:" "\n" + initial_instructions + "\n",
+ "\nEND OF ORIGINAL INSTRUCTIONS\n",
+ ]
+ )
+
+ tool_instructions = "\n".join(
+ [
+ "## Tool Format:",
+ 'If the current configuration includes tools, you MUST return them '
+ 'unchanged in current_parameters["tools"].',
+ "Do NOT include internal framework tools such as the evaluation tool or structured output tool.",
+ "Each tool must follow this exact format:",
+ "{",
+ ' "name": "tool-name",',
+ ' "type": "function",',
+ ' "description": "What the tool does",',
+ ' "parameters": {',
+ ' "type": "object",',
+ ' "properties": {',
+ ' "param_name": {',
+ ' "type": "type of the input parameter",',
+ ' "description": "Description of the parameter"',
+ " }",
+ " },",
+ ' "required": ["param_name"],',
+ ' "additionalProperties": false',
+ " }",
+ "}",
+ "Example:",
+ "{",
+ ' "name": "user-preferences-lookup",',
+ ' "type": "function",',
+ ' "description": "Looks up user preferences by ID",',
+ ' "parameters": {',
+ ' "type": "object",',
+ ' "properties": {',
+ ' "user_id": {',
+ ' "type": "string",',
+ ' "description": "The user id"',
+ " }",
+ " },",
+ ' "required": ["user_id"],',
+ ' "additionalProperties": false',
+ " }",
+ "}",
+ "",
+ ]
+ )
+
+ parameters_instructions = "\n".join(
+ [
+ "Return these values in a JSON object with the following keys: "
+ "current_instructions, current_parameters, and model.",
+ "Example:",
+ "{",
+ ' "current_instructions": "...',
+ ' "current_parameters": {',
+ ' "...": "..."',
+ " },",
+ ' "model": "gpt-4o"',
+ "}",
+ "Parameters should only be things that are directly parseable by an LLM call, "
+ "for example, temperature, max_tokens, etc.",
+ "Do not include any other parameters that are not directly parseable by an LLM call. "
+ "If you want to provide instruction for tone or other attributes, "
+ "provide them directly in the instructions.",
+ ]
+ )
+
+ if history:
+ return "\n".join(
+ [
+ "## Improvement Instructions:",
+ "Based on the evaluation history above, generate improved agent instructions and parameters.",
+ "Focus on addressing the areas where the evaluation failed or scored below threshold.",
+ "The new configuration should aim to improve the agent's performance on the evaluation criteria.",
+ model_instructions,
+ "",
+ variable_instructions,
+ "",
+ tool_instructions,
+ "",
+ "Return the improved configuration in a structured format that can be parsed to update:",
+ "1. The agent instructions (current_instructions)",
+ "2. The agent parameters (current_parameters)",
+ "3. The model (model) - you must always return a model, "
+ "even if it's the same as the current model.",
+ "4. You should return the tools the user has defined, as-is, on the new parameters. "
+ "Do not modify them, but make sure you do not include internal tools like "
+ "the evaluation tool or structured output tool.",
+ parameters_instructions,
+ ]
+ )
+ else:
+ return "\n".join(
+ [
+ "Generate an improved version of this configuration.",
+ model_instructions,
+ "",
+ variable_instructions,
+ "",
+ tool_instructions,
+ "",
+ parameters_instructions,
+ ]
+ )
+
+
+def variation_prompt_duration_optimization(model_choices: List[str]) -> str:
+ """
+ Duration optimization section of the variation prompt.
+
+ Included when acceptance criteria imply a latency reduction goal. Instructs
+ the LLM to treat response speed as a secondary objective — quality criteria
+ must still be met first — and provides concrete guidance on how to reduce
+ latency through model selection and instruction simplification.
+
+ :param model_choices: List of model IDs the LLM may select from, so it can
+ apply its own knowledge of which models tend to be faster.
+ :return: The duration optimization prompt block.
+ """
+ return "\n".join(
+ [
+ "## Duration Optimization:",
+ "The acceptance criteria for this optimization implies that response latency should be reduced.",
+ "In addition to improving quality, generate a variation that aims to reduce the agent's response time.",
+ "You may:",
+ "- Select a faster model from the available choices if quality requirements can still be met.",
+ f" Available models: {model_choices}",
+ " Use your knowledge of these models to prefer those that are known to respond more quickly.",
+ "- Simplify or shorten the instructions where this does not compromise the acceptance criteria.",
+ " Shorter prompts reduce input token counts and typically yield faster responses.",
+ "- Avoid increasing max_tokens or other parameters that extend generation time.",
+ "Quality criteria remain the primary objective — do not sacrifice passing scores to achieve lower latency.",
+ ]
+ )
diff --git a/packages/optimization/src/ldai_optimizer/util.py b/packages/optimization/src/ldai_optimizer/util.py
new file mode 100644
index 00000000..6f757602
--- /dev/null
+++ b/packages/optimization/src/ldai_optimizer/util.py
@@ -0,0 +1,305 @@
+"""Utility functions for the LaunchDarkly AI optimization package."""
+
+import inspect
+import json
+import logging
+import random
+import re
+from typing import Any, Awaitable, Dict, List, Optional, Tuple, TypeVar, Union
+
+from ldai_optimizer._slug_words import _ADJECTIVES, _NOUNS
+
+logger = logging.getLogger(__name__)
+
+# Matches LaunchDarkly API key, SDK key, and CLI token formats, e.g.:
+#   api-xxxxxxxxxxxxxxxx
+#   sdk-xxxxxxxxxxxxxxxx
+#   cli-xxxxxxxxxxxxxxxx
+_KEY_PATTERN = re.compile(r"\b(api|sdk|cli)-[A-Za-z0-9_\-]{16,}\b")
+
+
+class RedactionFilter(logging.Filter):
+ """Logging filter that redacts strings resembling LaunchDarkly API keys.
+
+ Scrubs both the format string (``record.msg``) and each positional argument
+ (``record.args``) before the handler formats the final log line, so raw key
+ values are never written to any log destination.
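+
+ Example (illustrative key value)::
+
+ log = logging.getLogger("my_app")
+ log.addFilter(RedactionFilter())
+ log.warning("using key %s", "api-0123456789abcdef0123")
+ # emitted as: "using key [REDACTED]"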
+ """
+
+ def filter(self, record: logging.LogRecord) -> bool:
+ record.msg = _KEY_PATTERN.sub("[REDACTED]", str(record.msg))
+ if record.args:
+ record.args = tuple(
+ _KEY_PATTERN.sub("[REDACTED]", str(a)) if isinstance(a, str) else a
+ for a in (record.args if isinstance(record.args, tuple) else (record.args,))
+ )
+ return True
+
+
+logger.addFilter(RedactionFilter())
+
+
+def generate_slug() -> str:
+ """Generate a random ``adjective-noun`` slug (e.g. ``blazing-lobster``).
+
+ Produces the same format as ``coolname.generate_slug(2)`` using an
+ internal word list, removing the external dependency.
+
+ :return: A hyphen-joined two-word lowercase string.
+ """
+ return f"{random.choice(_ADJECTIVES)}-{random.choice(_NOUNS)}"
+
+
+def interpolate_variables(text: str, variables: Dict[str, Any]) -> str:
+ """
+ Interpolate ``{{variable}}`` placeholders in text using the provided variables.
+
+ Matches LaunchDarkly's Mustache-style template format so that manually
+ generated variation instructions use the same syntax as LD-fetched templates.
+ Unrecognised placeholders are left unchanged.
+
+ :param text: Template string potentially containing ``{{key}}`` placeholders
+ :param variables: Mapping of variable names to their replacement values
+ :return: Text with all recognised placeholders replaced
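+
+ Example::
+
+ interpolate_variables("Answer in {{language}}.", {"language": "French"})
+ # -> "Answer in French."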
+ """
+ def replace(match: re.Match) -> str:
+ key = match.group(1).strip()
+ return str(variables[key]) if key in variables else match.group(0)
+
+ return re.sub(r"\{\{([\w-]+)\}\}", replace, text)
+
+
+def restore_variable_placeholders(
+ text: str,
+ variable_choices: List[Dict[str, Any]],
+ min_value_length: int = 3,
+) -> Tuple[str, List[str]]:
+ """
+ Scan ``text`` for leaked variable values and restore them to ``{{key}}`` form.
+
+ This is the deterministic inverse of :func:`interpolate_variables`. It acts
+ as a post-processing safety net after variation generation: when the LLM
+ hardcodes a concrete variable value (e.g. ``user-123``) instead of writing
+ the placeholder (``{{user_id}}``), this function replaces the value back so
+ subsequent iterations receive correctly templated instructions.
+
+ Values are matched with boundary guards so that a value like ``user-123``
+ inside a longer token like ``user-1234`` is not substituted. Multi-line
+ values are handled identically to single-line ones — ``re.escape`` produces
+ a literal pattern and the lookbehind/lookahead only inspect the character
+ immediately adjacent to the match boundary.
+
+ Values shorter than ``min_value_length`` characters are skipped because
+ short strings (e.g. ``"en"``, ``"US"``) are too likely to appear
+ coincidentally in unrelated prose.
+
+ :param text: The generated instruction string to clean.
+ :param variable_choices: All possible variable dicts, used to build the
+ reverse value→key map. When the same value appears under multiple keys
+ the first key encountered wins.
+ :param min_value_length: Minimum character length a value must have before
+ it is considered for replacement. Defaults to 3.
+ :return: A tuple of ``(cleaned_text, warnings)`` where ``warnings`` is a
+ list of human-readable strings describing each replacement made.
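+
+ Example (illustrative)::
+
+ text, warnings = restore_variable_placeholders(
+ "Always greet user-123 by name.",
+ [{"user_id": "user-123"}],
+ )
+ # text == "Always greet {{user_id}} by name."; warnings has one entry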
+ """
+ # Build reverse map: string(value) → key. Longest values first so that
+ # a longer value like "user-123-admin" is replaced before the shorter
+ # "user-123" substring, preventing partial-match corruption.
+ value_to_key: Dict[str, str] = {}
+ for choice in variable_choices:
+ for key, value in choice.items():
+ str_value = str(value)
+ if str_value not in value_to_key:
+ value_to_key[str_value] = key
+
+ sorted_entries = sorted(value_to_key.items(), key=lambda kv: len(kv[0]), reverse=True)
+
+ warnings: List[str] = []
+ for value, key in sorted_entries:
+ if len(value) < min_value_length:
+ continue
+ placeholder = f"{{{{{key}}}}}"
+ # Nothing to fix when the placeholder is already present and the raw value no longer appears.
+ if placeholder in text and value not in text:
+ continue
+
+ total_count = 0
+
+ # Pass 1: replace {{value}} forms — the LLM used the runtime value as
+ # if it were a placeholder key (e.g. {{user-125}} instead of {{user_id}}).
+ # This must run before the boundary-guarded pass so that the bare value
+ # inside the braces is consumed here rather than matched by pass 2,
+ # which would otherwise leave the surrounding braces and produce
+ # {{{{user_id}}}}.
+ brace_pattern = r'\{\{' + re.escape(value) + r'\}\}'
+ new_text, brace_count = re.subn(brace_pattern, placeholder, text, flags=re.DOTALL)
+ if brace_count:
+ text = new_text
+ total_count += brace_count
+
+ # Pass 2: replace bare value occurrences with a boundary guard so that
+ # "user-123" inside "user-1234" is not substituted.
+ pattern = r'(?<![\w-])' + re.escape(value) + r'(?![\w-])'
+ new_text, bare_count = re.subn(pattern, placeholder, text)
+ if bare_count:
+ text = new_text
+ total_count += bare_count
+
+ if total_count:
+ warnings.append(
+ f'Replaced value "{value}" with placeholder {placeholder} '
+ f"({total_count} occurrence(s))"
+ )
+
+ return text, warnings
+
+
+_T = TypeVar("_T")
+
+
+async def resolve_awaitable(result: Union[_T, Awaitable[_T]]) -> _T:
+ """
+ Handle both sync and async callable results.
+
+ :param result: Either a value or an awaitable that returns a value
+ :return: The resolved value
+ """
+ if inspect.isawaitable(result):
+ return await result # type: ignore[return-value]
+ return result # type: ignore[return-value]
+
+
+def validate_variation_response(response_data: Dict[str, Any]) -> List[str]:
+ """Validate the shape of a parsed LLM variation response.
+
+ Checks that the three required fields are present and have the expected
+ types. An empty ``current_parameters`` dict is acceptable; an empty
+ ``current_instructions`` or ``model`` string is flagged as an error
+ because downstream code cannot meaningfully use a blank value.
+
+ :param response_data: Parsed dict from the LLM (output of extract_json_from_response).
+ :return: List of human-readable error strings. Empty list means the response is valid.
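+
+ Example::
+
+ validate_variation_response({"current_instructions": "Be brief.",
+ "current_parameters": {}, "model": "gpt-4o"})
+ # -> [] (all required fields present and correctly typed)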
+ """
+ errors: List[str] = []
+
+ if "current_instructions" not in response_data:
+ errors.append("missing required field 'current_instructions'")
+ elif not isinstance(response_data["current_instructions"], str):
+ errors.append(
+ f"'current_instructions' must be a string, "
+ f"got {type(response_data['current_instructions']).__name__}"
+ )
+ elif not response_data["current_instructions"].strip():
+ errors.append("'current_instructions' must not be empty")
+
+ if "current_parameters" not in response_data:
+ errors.append("missing required field 'current_parameters'")
+ elif not isinstance(response_data["current_parameters"], dict):
+ errors.append(
+ f"'current_parameters' must be a dict, "
+ f"got {type(response_data['current_parameters']).__name__}"
+ )
+
+ if "model" not in response_data:
+ errors.append("missing required field 'model'")
+ elif not isinstance(response_data["model"], str):
+ errors.append(
+ f"'model' must be a string, got {type(response_data['model']).__name__}"
+ )
+
+ return errors
+
+
+def extract_json_from_response(response_str: str) -> Dict[str, Any]:
+ """
+ Parse a JSON object from an LLM response string.
+
+ Attempts direct JSON parsing first, then progressively falls back to
+ extracting JSON from markdown code blocks and balanced-brace scanning.
+
+ :param response_str: Raw string response from an LLM
+ :return: Parsed dictionary
+ :raises ValueError: If no valid JSON object can be extracted
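+
+ Example (handled by the balanced-brace scanner because of the leading prose)::
+
+ extract_json_from_response(
+ 'Here you go: {"current_instructions": "Be brief.", '
+ '"current_parameters": {}, "model": "gpt-4o"}'
+ )
+ # -> {'current_instructions': 'Be brief.', 'current_parameters': {}, 'model': 'gpt-4o'}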
+ """
+ # Try direct parse first
+ try:
+ return json.loads(response_str)
+ except json.JSONDecodeError:
+ pass
+
+ response_data: Optional[Dict[str, Any]] = None
+
+ # Try to extract JSON from markdown code blocks
+ code_block_match = re.search(
+ r'```(?:json)?\s*(\{.*?\})\s*```',
+ response_str,
+ re.DOTALL,
+ )
+ if code_block_match:
+ try:
+ response_data = json.loads(code_block_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # Try balanced-brace scanning
+ if response_data is None:
+ start_idx = response_str.find('{')
+ if start_idx != -1:
+ logger.warning(
+ "Direct JSON parse and code-block extraction failed; "
+ "falling back to balanced-brace scanner. "
+ "Response may be malformed JSON (length: %d).",
+ len(response_str),
+ )
+ while start_idx != -1 and response_data is None:
+ brace_count = 0
+ i = start_idx
+ while i < len(response_str):
+ if response_str[i] == '{':
+ brace_count += 1
+ elif response_str[i] == '}':
+ brace_count -= 1
+ if brace_count == 0:
+ json_str = response_str[start_idx:i + 1]
+ try:
+ response_data = json.loads(json_str)
+ except json.JSONDecodeError:
+ start_idx = response_str.find('{', start_idx + 1)
+ break
+ i += 1
+ else:
+ # Exhausted the string without closing the object
+ break
+
+ # Legacy regex fallback
+ if response_data is None:
+ json_match = re.search(
+ r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*"current_instructions"[^{}]*(?:\{[^{}]*\}[^{}]*)*\}',
+ response_str,
+ re.DOTALL,
+ )
+ if json_match:
+ try:
+ response_data = json.loads(json_match.group())
+ except json.JSONDecodeError:
+ logger.debug(
+ "Extracted JSON string failed to parse: %s",
+ json_match.group()[:200],
+ )
+ raise ValueError(
+ "Failed to parse extracted JSON from variation generation response"
+ )
+
+ if response_data is None:
+ logger.debug(
+ "Failed to extract JSON from response. "
+ "Response length: %d",
+ len(response_str),
+ )
+ raise ValueError(
+ "Failed to parse structured output from variation generation. "
+ "Expected JSON object with 'current_instructions', 'current_parameters', and 'model' fields. "
+ f"Response length: {len(response_str)}"
+ )
+
+ return response_data
diff --git a/packages/optimization/tests/test_client.py b/packages/optimization/tests/test_client.py
new file mode 100644
index 00000000..c441eedc
--- /dev/null
+++ b/packages/optimization/tests/test_client.py
@@ -0,0 +1,4406 @@
+"""Tests for OptimizationClient."""
+
+import json
+from typing import Any, Dict, Optional
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from ldai import AIAgentConfig, AIJudgeConfig, AIJudgeConfigDefault, LDAIClient
+from ldai.models import LDMessage, ModelConfig
+from ldai.tracker import TokenUsage
+from ldclient import Context
+
+from ldai_optimizer.client import OptimizationClient, _compute_validation_count, _find_model_config
+from ldai_optimizer.dataclasses import (
+ AIJudgeCallConfig,
+ GroundTruthOptimizationOptions,
+ GroundTruthSample,
+ JudgeResult,
+ OptimizationContext,
+ OptimizationFromConfigOptions,
+ OptimizationJudge,
+ OptimizationJudgeContext,
+ OptimizationOptions,
+ OptimizationResponse,
+ ToolDefinition,
+)
+from ldai_optimizer.prompts import (
+ _acceptance_criteria_implies_duration_optimization,
+ build_new_variation_prompt,
+ variation_prompt_acceptance_criteria,
+ variation_prompt_improvement_instructions,
+ variation_prompt_overfit_warning,
+ variation_prompt_preamble,
+)
+from ldai_optimizer.util import (
+ interpolate_variables,
+ restore_variable_placeholders,
+)
+
+# ---------------------------------------------------------------------------
+# Shared helpers / fixtures
+# ---------------------------------------------------------------------------
+
+LD_CONTEXT = Context.create("test-user")
+
+AGENT_INSTRUCTIONS = "You are a helpful assistant. Answer using {{language}}."
+VARIATION_RESPONSE = json.dumps({
+ "current_instructions": "You are an improved assistant.",
+ "current_parameters": {"temperature": 0.5},
+ "model": "gpt-4o",
+})
+JUDGE_PASS_RESPONSE = json.dumps({"score": 1.0, "rationale": "Perfect answer."})
+JUDGE_FAIL_RESPONSE = json.dumps({"score": 0.2, "rationale": "Off topic."})
+
+
+def _make_agent_config(
+ instructions: str = AGENT_INSTRUCTIONS,
+ model_name: str = "gpt-4o",
+ parameters: Optional[Dict[str, Any]] = None,
+) -> AIAgentConfig:
+ return AIAgentConfig(
+ key="test-agent",
+ enabled=True,
+ create_tracker=MagicMock,
+ model=ModelConfig(name=model_name, parameters=parameters or {}),
+ instructions=instructions,
+ )
+
+
+def _make_ldai_client(agent_config: Optional[AIAgentConfig] = None) -> MagicMock:
+ mock = MagicMock(spec=LDAIClient)
+ mock.agent_config.return_value = agent_config or _make_agent_config()
+ mock._client = MagicMock()
+ mock._client.variation.return_value = {"instructions": AGENT_INSTRUCTIONS}
+ return mock
+
+
+def _make_options(
+ *,
+ handle_agent_call=None,
+ handle_judge_call=None,
+ judges=None,
+ max_attempts: int = 3,
+ variable_choices=None,
+ **extra,
+) -> OptimizationOptions:
+ if handle_agent_call is None:
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="The capital of France is Paris."))
+ if handle_judge_call is None:
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ if judges is None:
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and concise.",
+ )
+ }
+ return OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=max_attempts,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ judge_model="gpt-4o",
+ variable_choices=variable_choices or [{"language": "English"}],
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges=judges,
+ **extra,
+ )
+
+
+def _make_client(ldai: Optional[MagicMock] = None) -> OptimizationClient:
+ client = OptimizationClient(ldai or _make_ldai_client())
+ return client
+
+
+# ---------------------------------------------------------------------------
+# Util functions
+# ---------------------------------------------------------------------------
+
+
+# ---------------------------------------------------------------------------
+# _find_model_config
+# ---------------------------------------------------------------------------
+
+
+class TestFindModelConfig:
+ def test_returns_none_when_no_configs(self):
+ assert _find_model_config("gpt-4o", []) is None
+
+ def test_returns_none_when_no_id_match(self):
+ configs = [{"id": "claude-3", "key": "Anthropic.claude-3", "global": True}]
+ assert _find_model_config("gpt-4o", configs) is None
+
+ def test_returns_single_match(self):
+ configs = [{"id": "gpt-4o", "key": "OpenAI.gpt-4o", "global": False}]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "OpenAI.gpt-4o"
+
+ def test_prefers_global_match_over_non_global(self):
+ configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "global.gpt-4o"
+
+ def test_prefers_global_match_regardless_of_list_order(self):
+ configs = [
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result["key"] == "global.gpt-4o"
+
+ def test_falls_back_to_non_global_when_no_global_exists(self):
+ configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result is not None
+ assert result["key"] == "project.gpt-4o"
+
+ def test_treats_missing_global_field_as_non_global(self):
+ configs = [
+ {"id": "gpt-4o", "key": "no-global-field.gpt-4o"},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = _find_model_config("gpt-4o", configs)
+ assert result["key"] == "global.gpt-4o"
+
+
+# ---------------------------------------------------------------------------
+# _extract_agent_tools
+# ---------------------------------------------------------------------------
+
+
+class TestExtractAgentTools:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "test-agent"
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ def test_returns_empty_list_when_no_tools(self):
+ result = self.client._extract_agent_tools({})
+ assert result == []
+
+ def test_returns_empty_list_when_tools_key_is_empty(self):
+ result = self.client._extract_agent_tools({"tools": []})
+ assert result == []
+
+ def test_returns_structured_output_tool_from_dict(self):
+ tool_dict = {
+ "name": "lookup",
+ "description": "Looks up data",
+ "input_schema": {"type": "object", "properties": {}},
+ }
+ result = self.client._extract_agent_tools({"tools": [tool_dict]})
+ assert len(result) == 1
+ assert isinstance(result[0], ToolDefinition)
+ assert result[0].name == "lookup"
+
+ def test_passes_through_existing_structured_output_tool(self):
+ tool = ToolDefinition(
+ name="my-tool", description="desc", input_schema={}
+ )
+ result = self.client._extract_agent_tools({"tools": [tool]})
+ assert result == [tool]
+
+ def test_wraps_single_non_list_tool(self):
+ tool_dict = {"name": "single", "description": "x", "input_schema": {}}
+ result = self.client._extract_agent_tools({"tools": tool_dict})
+ assert len(result) == 1
+ assert result[0].name == "single"
+
+ def test_converts_object_with_to_dict(self):
+ mock_tool = MagicMock()
+ mock_tool.to_dict.return_value = {
+ "name": "converted",
+ "description": "via to_dict",
+ "input_schema": {},
+ }
+ result = self.client._extract_agent_tools({"tools": [mock_tool]})
+ assert len(result) == 1
+ assert result[0].name == "converted"
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_response
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateResponse:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._options = _make_options()
+
+ def _ctx_with_scores(self, scores: Dict[str, JudgeResult]) -> OptimizationContext:
+ return OptimizationContext(
+ scores=scores,
+ completion_response="Some response.",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+
+ def test_passes_when_all_judges_meet_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.9)})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_fails_when_judge_below_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.5)})
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_fails_when_judge_result_missing(self):
+ ctx = self._ctx_with_scores({})
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_passes_at_exact_threshold(self):
+ ctx = self._ctx_with_scores({"accuracy": JudgeResult(score=0.8)})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_no_judges_always_passes(self):
+ # _evaluate_response treats a missing judges config as an automatic pass,
+ # so only the judges attribute needs to be stubbed.
+ options_no_judges = MagicMock()
+ options_no_judges.judges = None
+ self.client._options = options_no_judges
+ ctx = self._ctx_with_scores({})
+ assert self.client._evaluate_response(ctx) is True
+
+ def test_multiple_judges_all_must_pass(self):
+ self.client._options = _make_options(
+ judges={
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"),
+ "b": OptimizationJudge(threshold=0.9, acceptance_statement="B"),
+ }
+ )
+ ctx = self._ctx_with_scores({
+ "a": JudgeResult(score=0.9),
+ "b": JudgeResult(score=0.7), # fails
+ })
+ assert self.client._evaluate_response(ctx) is False
+
+ def test_multiple_judges_all_passing(self):
+ self.client._options = _make_options(
+ judges={
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="A"),
+ "b": OptimizationJudge(threshold=0.8, acceptance_statement="B"),
+ }
+ )
+ ctx = self._ctx_with_scores({
+ "a": JudgeResult(score=0.9),
+ "b": JudgeResult(score=1.0),
+ })
+ assert self.client._evaluate_response(ctx) is True
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_acceptance_judge
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateAcceptanceJudge:
+ def setup_method(self):
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
+
+ async def test_returns_parsed_score_and_rationale(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must be concise."
+ )
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="conciseness",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is the capital of France?",
+ )
+ assert result.score == 1.0
+ assert result.rationale == "Perfect answer."
+
+ async def test_handle_judge_call_receives_correct_key_and_config(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must answer the question."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="relevance",
+ optimization_judge=judge,
+ completion_response="Some answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What time is it?",
+ )
+ call_args = self.handle_judge_call.call_args
+ key, config, ctx, _ = call_args.args
+ assert key == "relevance"
+ assert isinstance(config, AIJudgeCallConfig)
+ assert isinstance(ctx, OptimizationJudgeContext)
+
+ async def test_messages_has_system_and_user_turns(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Must be factual."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="facts",
+ optimization_judge=judge,
+ completion_response="The sky is blue.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What colour is the sky?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ roles = [m.role for m in config.messages]
+ assert roles == ["system", "user"]
+
+ async def test_messages_system_content_matches_instructions(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Be concise."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="brevity",
+ optimization_judge=judge,
+ completion_response="Yes.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Is Paris in France?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ system_msg = next(m for m in config.messages if m.role == "system")
+ assert system_msg.content == config.instructions
+
+ async def test_messages_user_content_matches_context_user_input(self):
+ judge = OptimizationJudge(
+ threshold=0.8, acceptance_statement="Answer directly."
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="directness",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital of France?",
+ )
+ _, config, ctx, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert user_msg.content == ctx.user_input
+
+ async def test_acceptance_statement_in_instructions(self):
+ statement = "Response must mention the Eiffel Tower."
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement=statement)
+ await self.client._evaluate_acceptance_judge(
+ judge_key="tower",
+ optimization_judge=judge,
+ completion_response="Paris has the Eiffel Tower.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Tell me about Paris.",
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ assert statement in config.instructions
+
+ async def test_no_structured_output_tool_in_judge_config(self):
+ """Structured output tool must not be injected — judges return plain JSON."""
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be brief.")
+ await self.client._evaluate_acceptance_judge(
+ judge_key="brevity",
+ optimization_judge=judge,
+ completion_response="Yes.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Is Paris in France?",
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ tools = config.model.get_parameter("tools") or []
+ assert tools == []
+
+ async def test_agent_tools_included_in_config_tools(self):
+ agent_tool = ToolDefinition(
+ name="lookup", description="Lookup data", input_schema={}
+ )
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Use tool.")
+ await self.client._evaluate_acceptance_judge(
+ judge_key="tool-use",
+ optimization_judge=judge,
+ completion_response="I looked it up.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Find me something.",
+ agent_tools=[agent_tool],
+ )
+ call_args = self.handle_judge_call.call_args
+ _, config, _, _ = call_args.args
+ tools = config.model.get_parameter("tools") or []
+ tool_names = [t["name"] for t in tools]
+ assert tool_names == ["lookup"]
+
+ async def test_variables_in_context(self):
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ variables = {"language": "French", "topic": "geography"}
+ await self.client._evaluate_acceptance_judge(
+ judge_key="accuracy",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital?",
+ variables=variables,
+ )
+ call_args = self.handle_judge_call.call_args
+ _, _, ctx, _ = call_args.args
+ assert ctx.current_variables == variables
+
+ async def test_duration_context_added_to_instructions_when_latency_keyword_present(self):
+ """When acceptance statement has a latency keyword and agent_duration_ms is provided,
+ the instructions mention the duration."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1500.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "1500ms" in config.instructions
+ assert "mention the duration" in config.instructions
+
+ async def test_duration_context_includes_baseline_comparison_when_history_present(self):
+ """When history[0] has a duration, the judge instructions include a baseline comparison."""
+ self.client._history = [
+ OptimizationContext(
+ scores={},
+ completion_response="old response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ duration_ms=2000.0,
+ )
+ ]
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should have low latency.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="latency",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1500.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "1500ms" in config.instructions
+ assert "2000ms" in config.instructions
+ assert "faster" in config.instructions
+
+ async def test_duration_context_says_slower_when_candidate_is_slower(self):
+ """When the candidate is slower than baseline, the instructions say 'slower'."""
+ self.client._history = [
+ OptimizationContext(
+ scores={},
+ completion_response="old response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ duration_ms=1000.0,
+ )
+ ]
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=2,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=1800.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "slower" in config.instructions
+
+ async def test_duration_context_not_added_when_no_latency_keyword(self):
+ """When acceptance statement has no latency keyword, duration is not injected."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="accuracy",
+ optimization_judge=judge,
+ completion_response="Paris.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Capital of France?",
+ agent_duration_ms=2000.0,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "2000ms" not in config.instructions
+ assert "duration" not in config.instructions.lower() or "acceptance" in config.instructions.lower()
+
+ async def test_duration_context_not_added_when_agent_duration_ms_is_none(self):
+ """When agent_duration_ms is None, no duration block is added even if keyword matches."""
+ judge = OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ await self.client._evaluate_acceptance_judge(
+ judge_key="speed",
+ optimization_judge=judge,
+ completion_response="Here is the answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Tell me something.",
+ agent_duration_ms=None,
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ assert "mention the duration" not in config.instructions
+
+ async def test_returns_zero_score_on_missing_acceptance_statement(self):
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement=None)
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="broken",
+ optimization_judge=judge,
+ completion_response="Anything.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Hello?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_returns_zero_score_on_parse_failure(self):
+ self.handle_judge_call.return_value = OptimizationResponse(output="not json at all")
+ judge = OptimizationJudge(threshold=0.8, acceptance_statement="Be clear.")
+ result = await self.client._evaluate_acceptance_judge(
+ judge_key="clarity",
+ optimization_judge=judge,
+ completion_response="Clear answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Explain X.",
+ )
+ assert result.score == 0.0
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_config_judge
+# ---------------------------------------------------------------------------
+
+
+class TestEvaluateConfigJudge:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+ self.client = _make_client(self.mock_ldai)
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client._options = _make_options(handle_judge_call=self.handle_judge_call)
+
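+    # Minimal judge config, standing in for what LDAIClient.judge_config returns in these tests.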
+ def _make_judge_config(self, enabled: bool = True) -> AIJudgeConfig:
+ return AIJudgeConfig(
+ key="ld-judge-key",
+ enabled=enabled,
+ create_tracker=MagicMock,
+ model=ModelConfig(name="gpt-4o", parameters={}),
+ messages=[
+ LDMessage(role="system", content="You are an evaluator."),
+ LDMessage(role="user", content="Evaluate this response."),
+ ],
+ )
+
+ async def test_calls_handle_judge_call_with_correct_config_type(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ call_args = self.handle_judge_call.call_args
+ key, config, ctx, _ = call_args.args
+ assert key == "quality"
+ assert isinstance(config, AIJudgeCallConfig)
+ assert "You are an evaluator." in config.instructions
+ assert isinstance(ctx, OptimizationJudgeContext)
+
+ async def test_messages_has_system_and_user_turns(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ roles = [m.role for m in config.messages]
+ assert roles == ["system", "user"]
+
+ async def test_messages_system_content_matches_instructions(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ system_msg = next(m for m in config.messages if m.role == "system")
+ assert system_msg.content == config.instructions
+
+ async def test_messages_user_content_matches_context_user_input(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, ctx, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert user_msg.content == ctx.user_input
+
+ async def test_messages_user_content_contains_ld_user_message(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Good answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What is X?",
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ user_msg = next(m for m in config.messages if m.role == "user")
+ assert "Evaluate this response." in user_msg.content
+
+ async def test_returns_zero_score_when_judge_disabled(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config(enabled=False)
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ result = await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Some answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="What?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_returns_zero_score_when_judge_has_no_messages(self):
+ judge_config = AIJudgeConfig(
+ key="ld-judge-key",
+ enabled=True,
+ create_tracker=MagicMock,
+ model=ModelConfig(name="gpt-4o", parameters={}),
+ messages=None,
+ )
+ self.mock_ldai.judge_config.return_value = judge_config
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ result = await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Any.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Anything?",
+ )
+ assert result.score == 0.0
+ self.handle_judge_call.assert_not_called()
+
+ async def test_template_variables_merged_into_judge_config_call(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ variables = {"language": "Spanish"}
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Q?",
+ variables=variables,
+ )
+        call_args = self.mock_ldai.judge_config.call_args
+        passed_vars = call_args.args[3] if call_args.args else call_args.kwargs.get("variables", {})
+ assert passed_vars.get("language") == "Spanish"
+ assert "message_history" in passed_vars
+ assert "response_to_evaluate" in passed_vars
+
+ async def test_agent_tools_included_without_evaluation_tool(self):
+ self.mock_ldai.judge_config.return_value = self._make_judge_config()
+ agent_tool = ToolDefinition(name="search", description="Search", input_schema={})
+ judge = OptimizationJudge(threshold=0.8, judge_key="ld-judge-key")
+ await self.client._evaluate_config_judge(
+ judge_key="quality",
+ optimization_judge=judge,
+ completion_response="Answer.",
+ iteration=1,
+ reasoning_history="",
+ user_input="Q?",
+ agent_tools=[agent_tool],
+ )
+ _, config, _, _ = self.handle_judge_call.call_args.args
+ tools = config.model.get_parameter("tools") or []
+ names = [t["name"] for t in tools]
+ assert names == ["search"]
+
+
+# ---------------------------------------------------------------------------
+# _execute_agent_turn
+# ---------------------------------------------------------------------------
+
+
+class TestExecuteAgentTurn:
+ def setup_method(self):
+ self.agent_response = "Paris is the capital of France."
+ self.handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=self.agent_response))
+ self.handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initialize_class_members_from_config(agent_config)
+ self.client._options = _make_options(
+ handle_agent_call=self.handle_agent_call,
+ handle_judge_call=self.handle_judge_call,
+ )
+
+ def _make_context(self, user_input: str = "What is the capital of France?") -> OptimizationContext:
+ return OptimizationContext(
+ scores={},
+ completion_response="",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ current_variables={"language": "English"},
+ current_model="gpt-4o",
+ user_input=user_input,
+ iteration=1,
+ )
+
+ async def test_calls_handle_agent_call_with_config_and_context(self):
+ ctx = self._make_context()
+ await self.client._execute_agent_turn(ctx, iteration=1)
+ self.handle_agent_call.assert_called_once()
+ key, config, passed_ctx, _ = self.handle_agent_call.call_args.args
+ assert key == "test-agent"
+ assert isinstance(config, AIAgentConfig)
+ assert passed_ctx is ctx
+
+ async def test_completion_response_stored_in_returned_context(self):
+ ctx = self._make_context()
+ result = await self.client._execute_agent_turn(ctx, iteration=1)
+ assert result.completion_response == self.agent_response
+
+ async def test_judge_scores_stored_in_returned_context(self):
+ ctx = self._make_context()
+ result = await self.client._execute_agent_turn(ctx, iteration=1)
+ assert "accuracy" in result.scores
+ assert result.scores["accuracy"].score == 1.0
+
+ async def test_variables_interpolated_into_agent_config_instructions(self):
+ ctx = self._make_context()
+ await self.client._execute_agent_turn(ctx, iteration=1)
+ _, config, _, _ = self.handle_agent_call.call_args.args
+ assert "{{language}}" not in config.instructions
+ assert "English" in config.instructions
+
+ async def test_raises_on_agent_call_failure(self):
+ self.handle_agent_call.side_effect = RuntimeError("LLM unavailable")
+ ctx = self._make_context()
+ with pytest.raises(RuntimeError, match="LLM unavailable"):
+ await self.client._execute_agent_turn(ctx, iteration=1)
+
+
+# ---------------------------------------------------------------------------
+# _generate_new_variation
+# ---------------------------------------------------------------------------
+
+
+class TestGenerateNewVariation:
+ def setup_method(self):
+ self.handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ self.client = _make_client()
+ agent_config = _make_agent_config()
+ self.client._agent_key = "test-agent"
+ self.client._agent_config = agent_config
+ self.client._initial_instructions = AGENT_INSTRUCTIONS
+ self.client._initialize_class_members_from_config(agent_config)
+ self.client._options = _make_options(handle_agent_call=self.handle_agent_call)
+
+ async def test_updates_current_instructions(self):
+ await self.client._generate_new_variation(iteration=1, variables={"language": "English"})
+ assert self.client._current_instructions == "You are an improved assistant."
+
+ async def test_updates_current_parameters(self):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_parameters == {"temperature": 0.5}
+
+ async def test_updates_current_model(self):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_model == "gpt-4o"
+
+ async def test_no_structured_output_tool_in_variation_config(self):
+ """Variation turn must not inject the structured-output tool — prompts use plain JSON."""
+ await self.client._generate_new_variation(iteration=1, variables={})
+ _, config, _, _ = self.handle_agent_call.call_args.args
+ tools = config.model.get_parameter("tools") or []
+ assert tools == []
+
+    async def test_variation_call_uses_four_arg_signature(self):
+        """handle_agent_call receives exactly (key, config, context, is_evaluation) — no tools arg."""
+        await self.client._generate_new_variation(iteration=1, variables={})
+        assert len(self.handle_agent_call.call_args.args) == 4
+
+ async def test_model_not_updated_when_not_in_model_choices(self):
+ bad_response = json.dumps({
+ "current_instructions": "New instructions.",
+ "current_parameters": {},
+ "model": "some-unknown-model",
+ })
+ self.handle_agent_call.return_value = OptimizationResponse(output=bad_response)
+ original_model = self.client._current_model
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_model == original_model
+
+ async def test_retries_on_empty_response_and_succeeds(self):
+ """First attempt returns empty string; second returns valid JSON — succeeds."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output=""), # attempt 1: empty
+ OptimizationResponse(output=VARIATION_RESPONSE), # attempt 2: valid
+ ]
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_instructions == "You are an improved assistant."
+ assert self.handle_agent_call.call_count == 2
+
+ async def test_retries_on_unparseable_response_and_succeeds(self):
+ """First attempt returns non-JSON text; second returns valid JSON — succeeds."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output="Sorry, I cannot do that."), # attempt 1: not JSON
+ OptimizationResponse(output=VARIATION_RESPONSE), # attempt 2: valid
+ ]
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.client._current_instructions == "You are an improved assistant."
+ assert self.handle_agent_call.call_count == 2
+
+ async def test_raises_after_max_retries_exhausted(self):
+ """All three attempts return empty strings — ValueError is raised."""
+ self.handle_agent_call.side_effect = [
+ OptimizationResponse(output=""),
+ OptimizationResponse(output=""),
+ OptimizationResponse(output=""),
+ ]
+ with pytest.raises(ValueError, match="Failed to parse structured output"):
+ await self.client._generate_new_variation(iteration=1, variables={})
+ assert self.handle_agent_call.call_count == 3
+
+
+# ---------------------------------------------------------------------------
+# Full optimization loop
+# ---------------------------------------------------------------------------
+
+
+class TestRunOptimization:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ async def test_succeeds_on_first_attempt_when_judge_passes(self):
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="The capital of France is Paris."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 1.0
+ # 1 initial agent call + 1 validation sample (repeated draw — only 1 variable choice)
+ assert handle_agent_call.call_count == 2
+
+ async def test_generates_variation_when_judge_fails(self):
+ agent_responses = [
+ OptimizationResponse(output="Bad answer."),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="Better answer."),
+ OptimizationResponse(output="Better answer."), # 1 validation sample (repeated draw — only 1 variable choice)
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ judge_responses = [
+ OptimizationResponse(output=JUDGE_FAIL_RESPONSE),
+ OptimizationResponse(output=JUDGE_PASS_RESPONSE),
+ OptimizationResponse(output=JUDGE_PASS_RESPONSE),
+ ]
+ handle_judge_call = AsyncMock(side_effect=judge_responses)
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 1.0
+ # 1 agent + 1 variation + 1 agent + 1 validation sample
+ assert handle_agent_call.call_count == 4
+
+ async def test_returns_last_context_after_max_attempts(self):
+ # The max_attempts guard fires before variation on the final iteration,
+ # so only iterations 1 and 2 produce a variation call.
+ handle_agent_call = AsyncMock(side_effect=[
+ OptimizationResponse(output="Bad answer."), # iteration 1: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 1: variation
+ OptimizationResponse(output="Still bad."), # iteration 2: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 2: variation
+ OptimizationResponse(output="Still bad."), # iteration 3: agent (max_attempts reached — no variation)
+ ])
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.scores["accuracy"].score == 0.2
+
+ async def test_on_passing_result_called_on_success(self):
+ on_passing = MagicMock()
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Great answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ options.on_passing_result = on_passing
+ await client.optimize_from_options("test-agent", options)
+ on_passing.assert_called_once()
+
+ async def test_on_failing_result_called_on_max_attempts(self):
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(side_effect=[
+ OptimizationResponse(output="Bad."), # iteration 1: agent
+ OptimizationResponse(output=VARIATION_RESPONSE), # iteration 1: variation
+ OptimizationResponse(output="Still bad."), # iteration 2: agent (max_attempts reached — no variation)
+ ])
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ max_attempts=2,
+ )
+ options.on_failing_result = on_failing
+ await client.optimize_from_options("test-agent", options)
+ on_failing.assert_called_once()
+
+ async def test_on_turn_manual_path_success(self):
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=3,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ variable_choices=[{}],
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges={"j": OptimizationJudge(threshold=0.8, acceptance_statement="x")},
+ on_turn=lambda ctx: True,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.completion_response == "Answer."
+
+ async def test_success_result_carries_main_iteration_context_not_validation_context(self):
+ # The main iteration returns "Main answer." but the validation run returns
+ # "Validation answer.". The result should reflect the main iteration so that
+ # completion_response and user_input are consistent with what was POSTed to the API.
+ agent_responses = [
+ OptimizationResponse(output="Main answer."), # main iteration
+ OptimizationResponse(output="Validation answer."), # validation sample
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ result = await client.optimize_from_options("test-agent", options)
+ assert result.completion_response == "Main answer."
+
+ async def test_status_update_callback_called_at_each_stage(self):
+ statuses = []
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="Good answer."))
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ )
+ options.on_status_update = lambda status, ctx: statuses.append(status)
+ await client.optimize_from_options("test-agent", options)
+ assert "init" in statuses
+ assert "generating" in statuses
+ assert "evaluating" in statuses
+ assert "success" in statuses
+
+
+# ---------------------------------------------------------------------------
+# _compute_validation_count
+# ---------------------------------------------------------------------------
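+# The expected counts below are consistent with roughly pool_size // 4, clamped to the range [2, 5].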
+
+
+class TestComputeValidationCount:
+ def test_pool_of_10_returns_2(self):
+ assert _compute_validation_count(10) == 2
+
+ def test_pool_of_20_returns_5(self):
+ assert _compute_validation_count(20) == 5
+
+ def test_pool_of_16_returns_4(self):
+ assert _compute_validation_count(16) == 4
+
+ def test_small_pool_floors_at_2(self):
+ assert _compute_validation_count(1) == 2
+ assert _compute_validation_count(3) == 2
+
+ def test_large_pool_caps_at_5(self):
+ assert _compute_validation_count(100) == 5
+
+ def test_pool_of_8_returns_2(self):
+ assert _compute_validation_count(8) == 2
+
+
+# ---------------------------------------------------------------------------
+# Validation phase (chaos mode)
+# ---------------------------------------------------------------------------
+
+# Helper: build OptimizationOptions with multiple variable choices so the
+# validation phase has a non-empty distinct pool to sample from.
+def _make_multi_options(
+ *,
+ variable_count: int = 8,
+ user_input_options=None,
+ on_turn=None,
+ handle_agent_call=None,
+ handle_judge_call=None,
+ on_passing_result=None,
+ max_attempts: int = 5,
+) -> OptimizationOptions:
+ if handle_agent_call is None:
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output="answer"))
+ if handle_judge_call is None:
+ handle_judge_call = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ judges = None if on_turn is not None else {
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ }
+ return OptimizationOptions(
+ context_choices=[LD_CONTEXT],
+ max_attempts=max_attempts,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ variable_choices=[{"x": i} for i in range(variable_count)],
+ user_input_options=user_input_options,
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ judges=judges,
+ on_turn=on_turn,
+ on_passing_result=on_passing_result,
+ )
+
+
+class TestValidationPhase:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
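+    # Wraps the module-level _make_client helper with this class's mocked LDAI client.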
+ def _make_client(self) -> OptimizationClient:
+ return _make_client(self.mock_ldai)
+
+ async def test_on_passing_result_fires_only_after_all_validation_passes(self):
+ """on_passing_result must not fire until all validation samples pass."""
+ on_passing = MagicMock()
+ client = self._make_client()
+ # 8 variable_choices → validation_count = 2; all judges always pass
+ opts = _make_multi_options(on_passing_result=on_passing)
+ await client.optimize_from_options("test-agent", opts)
+ on_passing.assert_called_once()
+
+ async def test_validation_runs_additional_agent_calls(self):
+ """With 8 variable choices, validation runs 2 extra agent calls after the initial pass."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=counting_agent)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial pass + 2 validation samples
+ assert call_count[0] == 3
+
+ async def test_validation_failure_suppresses_on_passing_result_then_retries(self):
+ """When a validation sample fails, on_passing_result is not fired and the loop retries."""
+ turn_calls = [0]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # call 1: initial pass, call 2: first validation FAIL, everything else passes
+ return turn_calls[0] != 2
+
+ on_passing = MagicMock()
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ # 8 items → validation_count = 2
+ variable_count=8,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="iter1"), # initial turn (passes)
+ OptimizationResponse(output="val_iter2"), # validation sample 1 (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="iter3"), # new attempt initial (passes)
+ OptimizationResponse(output="val_iter4"), # new validation sample 1 (passes)
+ OptimizationResponse(output="val_iter5"), # new validation sample 2 (passes)
+ ]),
+ on_passing_result=on_passing,
+ max_attempts=3,
+ )
+ result = await client.optimize_from_options("test-agent", opts)
+ # Eventually succeeds after one failed validation cycle
+ on_passing.assert_called_once()
+ assert result is not None
+
+ async def test_validation_does_not_reuse_passing_turn_variable(self):
+ """The variable set used in the initial passing turn must not appear in validation."""
+ seen_variables = []
+
+ async def capture_agent(key, config, ctx, is_evaluation=False):
+ seen_variables.append(ctx.current_variables)
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=capture_agent, variable_count=8)
+ await client.optimize_from_options("test-agent", opts)
+
+ # First call is the initial passing turn
+ initial_vars = seen_variables[0]
+ # Remaining calls are validation samples — none should match the initial
+ for val_vars in seen_variables[1:]:
+ assert val_vars != initial_vars, (
+ f"Validation reused the passing turn's variables: {initial_vars}"
+ )
+
+ async def test_validation_uses_user_input_options_as_pool_when_provided(self):
+ """When user_input_options is provided, validation samples from that pool."""
+ seen_inputs = []
+
+ async def capture_agent(key, config, ctx, is_evaluation=False):
+ seen_inputs.append(ctx.user_input)
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ user_inputs = [f"question {i}" for i in range(8)]
+ opts = _make_multi_options(
+ handle_agent_call=capture_agent,
+ user_input_options=user_inputs,
+ )
+ await client.optimize_from_options("test-agent", opts)
+
+ # Initial input is at index 0; all validation inputs must be different
+ initial_input = seen_inputs[0]
+ for val_input in seen_inputs[1:]:
+ assert val_input != initial_input, (
+ f"Validation reused the passing turn's user_input: {initial_input}"
+ )
+
+ async def test_pool_exhaustion_caps_validation_at_available_distinct_items(self):
+ """When fewer distinct items remain than validation_count, all available ones are used."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ # 3 variable choices → _compute_validation_count(3) = 2, but only 2 remain after
+ # excluding the passing item, so validation_count is still 2 (min of 2 and 2)
+ opts = _make_multi_options(handle_agent_call=counting_agent, variable_count=3)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial + 2 validation (uses all remaining distinct items)
+ assert call_count[0] == 3
+
+ async def test_single_variable_choice_falls_back_to_repeated_draw(self):
+ """With only 1 variable choice validation still runs 1 sample (repeated draw)."""
+ call_count = [0]
+
+ async def counting_agent(key, config, ctx, is_evaluation=False):
+ call_count[0] += 1
+ return OptimizationResponse(output="answer")
+
+ client = self._make_client()
+ opts = _make_multi_options(handle_agent_call=counting_agent, variable_count=1)
+ await client.optimize_from_options("test-agent", opts)
+ # 1 initial pass + 1 validation sample (repeated draw from the only item)
+ assert call_count[0] == 2
+
+ async def test_validation_does_not_consume_attempt_budget(self):
+ """Validation samples must not count against max_attempts.
+
+ With max_attempts=2 and 8 variable choices (validation_count=2), a failed
+ validation on attempt 1 should still leave a full attempt 2 available.
+ Without the fix, iteration would be inflated to 3 after validation, which
+ exceeds max_attempts=2 and would trigger _handle_failure prematurely.
+ """
+ turn_calls = [0]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # attempt 1 passes initial, validation sample 1 fails
+ # attempt 2 passes initial and all validation
+ return turn_calls[0] != 2
+
+ on_passing = MagicMock()
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ variable_count=8,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="iter1"), # attempt 1 initial (passes)
+ OptimizationResponse(output="val_iter"), # validation sample 1 (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="iter2"), # attempt 2 initial (passes)
+ OptimizationResponse(output="val_iter3"), # validation sample 1 (passes)
+ OptimizationResponse(output="val_iter4"), # validation sample 2 (passes)
+ ]),
+ on_passing_result=on_passing,
+ max_attempts=2,
+ )
+ result = await client.optimize_from_options("test-agent", opts)
+ on_passing.assert_called_once()
+ assert result is not None
+
+ async def test_validating_status_emitted(self):
+ """The 'validating' status must be emitted when entering the validation phase."""
+ statuses = []
+ client = self._make_client()
+ opts = _make_multi_options()
+ opts.on_status_update = lambda s, ctx: statuses.append(s)
+ await client.optimize_from_options("test-agent", opts)
+ assert "validating" in statuses
+
+ async def test_turn_completed_after_validation_failure_uses_main_iteration_context(self):
+ """When validation fails, the 'turn completed' event must carry the MAIN iteration's
+ user_input and completion_response — not the failing validation sample's values.
+
+ Regression test for the mismatch where a record stored userInput='hostel near paris'
+        but completionResponse described 'airbnbs near tahoe' (from a validation run with a
+ different user_input that was folded back onto the main iteration's API record).
+ """
+ turn_calls = [0]
+ status_events: list = []
+
+ user_inputs = [f"query-{i}" for i in range(8)]
+
+ def on_turn(ctx):
+ turn_calls[0] += 1
+ # Call 1: main iteration passes. Call 2: first validation sample FAILS.
+ # Call 3+: everything passes (new attempt succeeds).
+ return turn_calls[0] != 2
+
+ def capture_status(status, ctx):
+ status_events.append((status, ctx.user_input, ctx.completion_response))
+
+ client = self._make_client()
+ opts = _make_multi_options(
+ on_turn=on_turn,
+ variable_count=8,
+ user_input_options=user_inputs,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="main-response"), # main turn (passes)
+ OptimizationResponse(output="val-response"), # validation sample (fails)
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="main-response-2"), # 2nd attempt main (passes)
+ OptimizationResponse(output="val-response-2"), # 2nd attempt validation (passes)
+ OptimizationResponse(output="val-response-3"), # 2nd attempt validation (passes)
+ ]),
+ max_attempts=3,
+ )
+ opts.on_status_update = capture_status
+ await client.optimize_from_options("test-agent", opts)
+
+ # The 'generating' event captures the main iteration's user_input.
+ # The validation run fires 'generating' as well, but with a different user_input.
+ # The first 'generating' is always the main iteration.
+ generating_events = [(u, r) for s, u, r in status_events if s == "generating"]
+ main_user_input = generating_events[0][0]
+
+ # Find the 'turn completed' event from the first attempt (after validation failure)
+ tc_events = [(u, r) for s, u, r in status_events if s == "turn completed"]
+ assert len(tc_events) >= 1, "Expected at least one 'turn completed' event"
+
+ tc_user_input, tc_completion = tc_events[0]
+ # turn completed must use the MAIN iteration's data, not the validation sample's.
+ # If the bug is present, tc_completion would be "val-response" and tc_user_input
+ # would be the validation sample's query (different from main_user_input).
+ assert tc_completion == "main-response", (
+ f"turn completed should carry the main iteration's completion_response "
+ f"('main-response'), not the validation run's (got: {tc_completion!r})"
+ )
+ assert tc_user_input == main_user_input, (
+ f"turn completed should carry the main iteration's user_input "
+ f"('{main_user_input}'), not the validation run's (got: {tc_user_input!r})"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — acceptance criteria section
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptAcceptanceCriteria:
+ def test_includes_acceptance_statement_in_section(self):
+ judges = {
+ "quality": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses must be concise and factual.",
+ )
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert "Responses must be concise and factual." in section
+ assert "quality" in section
+
+ def test_labels_all_judges(self):
+ judges = {
+ "a": OptimizationJudge(threshold=0.8, acceptance_statement="Must be brief."),
+ "b": OptimizationJudge(threshold=0.9, acceptance_statement="Must cite sources."),
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert "[a]" in section
+ assert "[b]" in section
+ assert "Must be brief." in section
+ assert "Must cite sources." in section
+
+ def test_returns_empty_string_when_no_acceptance_statements(self):
+ judges = {
+ "ld-judge": OptimizationJudge(threshold=0.8, judge_key="some-ld-key"),
+ }
+ section = variation_prompt_acceptance_criteria(judges)
+ assert section == ""
+
+ def test_returns_empty_string_with_no_judges(self):
+ section = variation_prompt_acceptance_criteria(None)
+ assert section == ""
+
+ def test_section_appears_in_full_prompt(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Facts only.",
+ )
+ }
+ options = _make_options(judges=judges)
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=judges,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=options.model_choices,
+ variable_choices=options.variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "Facts only." in prompt
+ assert "ACCEPTANCE CRITERIA" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — overfitting warning section
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptOverfitWarning:
+ def _make_ctx(self, user_input=None, variables=None, iteration=1):
+ return OptimizationContext(
+ iteration=iteration,
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ current_model="gpt-4o",
+ current_variables=variables or {},
+ user_input=user_input,
+ completion_response=None,
+ scores={},
+ )
+
+ def test_returns_empty_string_with_no_history(self):
+ assert variation_prompt_overfit_warning([]) == ""
+
+ def test_contains_general_overfitting_reminder(self):
+ ctx = self._make_ctx(user_input="What is 2+2?")
+ section = variation_prompt_overfit_warning([ctx])
+ assert "OVERFITTING" in section.upper()
+ assert "generalise" in section.lower() or "generalize" in section.lower() or "generaliz" in section.lower() or "general" in section.lower()
+
+ def test_includes_recent_user_input(self):
+ ctx = self._make_ctx(user_input="What is the capital of France?")
+ section = variation_prompt_overfit_warning([ctx])
+ assert "What is the capital of France?" in section
+
+ def test_includes_recent_variables_as_structured_breakdown(self):
+ ctx = self._make_ctx(variables={"language": "English", "tone": "formal"})
+ section = variation_prompt_overfit_warning([ctx])
+ # Keys (placeholder names) and values must both appear
+ assert "{{language}}" in section
+ assert '"English"' in section
+ assert "{{tone}}" in section
+ assert '"formal"' in section
+
+ def test_variables_section_labels_name_vs_value(self):
+ ctx = self._make_ctx(variables={"user_id": "user-125"})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "{{user_id}}" in section
+ assert '"user-125"' in section
+ assert "placeholder" in section.lower()
+ assert "value" in section.lower()
+ # Must NOT render as a raw Python dict
+ assert "{'user_id': 'user-125'}" not in section
+
+ def test_uses_most_recent_history_entry(self):
+ ctx_old = self._make_ctx(user_input="old question", iteration=1)
+ ctx_new = self._make_ctx(user_input="new question", iteration=2)
+ section = variation_prompt_overfit_warning([ctx_old, ctx_new])
+ assert "new question" in section
+ assert "old question" not in section
+
+ def test_omits_user_input_line_when_none(self):
+ ctx = self._make_ctx(user_input=None, variables={"lang": "en"})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "User input" not in section
+ assert "lang" in section
+
+ def test_omits_variables_line_when_empty(self):
+ ctx = self._make_ctx(user_input="hello", variables={})
+ section = variation_prompt_overfit_warning([ctx])
+ assert "Variables" not in section
+ assert "hello" in section
+
+ def test_warning_appears_in_full_prompt_when_history_present(self):
+ ctx = self._make_ctx(user_input="test question", variables={"k": "v"})
+ prompt = build_new_variation_prompt(
+ history=[ctx],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=[{"k": "v"}],
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "OVERFITTING" in prompt.upper()
+ assert "test question" in prompt
+
+ def test_warning_absent_from_full_prompt_when_no_history(self):
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=[{"k": "v"}],
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "OVERFITTING" not in prompt.upper()
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — preamble key-vs-value note
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptPreamble:
+ def test_contains_key_vs_value_important_note(self):
+ preamble = variation_prompt_preamble()
+ assert "IMPORTANT" in preamble
+ assert "placeholder" in preamble.lower()
+ assert "value" in preamble.lower()
+
+ def test_never_use_value_as_placeholder_name(self):
+ preamble = variation_prompt_preamble()
+ assert "never" in preamble.lower()
+
+
+# ---------------------------------------------------------------------------
+# Variation prompt — placeholder table
+# ---------------------------------------------------------------------------
+
+
+class TestVariationPromptPlaceholderTable:
+ _variable_choices = [
+ {"user_id": "user-123", "trip_purpose": "business"},
+ {"user_id": "user-125", "trip_purpose": "personal"},
+ ]
+
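+    # Renders just the improvement-instructions prompt section for the given variable choices and history.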
+ def _section(self, variable_choices=None, history=None):
+ return variation_prompt_improvement_instructions(
+ history=history or [],
+ model_choices=["gpt-4o"],
+ variable_choices=variable_choices or self._variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+
+ def test_placeholder_names_appear_in_table(self):
+ section = self._section()
+ assert "{{user_id}}" in section
+ assert "{{trip_purpose}}" in section
+
+ def test_example_values_appear_alongside_keys(self):
+ section = self._section()
+ assert 'user-123' in section or 'user-125' in section
+ assert 'business' in section or 'personal' in section
+
+ def test_keys_and_values_clearly_separated(self):
+ section = self._section()
+ assert "example values" in section.lower()
+
+ def test_bad_good_counterexamples_use_actual_values(self):
+ section = self._section()
+ # The bad example must reference a runtime value, good example the key
+ assert "BAD" in section
+ assert "GOOD" in section
+ # At least one of the real values should appear in the bad example
+ assert "user-123" in section or "user-125" in section \
+ or "business" in section or "personal" in section
+
+ def test_raw_placeholder_list_not_used(self):
+ # The old format was a comma-separated list like "{{trip_purpose}}, {{user_id}}"
+ # The new format is a structured table; confirm no bare comma-list
+ section = self._section()
+ assert "{{trip_purpose}}, {{user_id}}" not in section
+ assert "{{user_id}}, {{trip_purpose}}" not in section
+
+ def test_single_variable_choice(self):
+ section = self._section(variable_choices=[{"lang": "en"}])
+ assert "{{lang}}" in section
+ assert 'en' in section
+
+ def test_table_appears_in_full_prompt(self):
+ prompt = build_new_variation_prompt(
+ history=[],
+ judges=None,
+ current_model="gpt-4o",
+ current_instructions=AGENT_INSTRUCTIONS,
+ current_parameters={},
+ model_choices=["gpt-4o"],
+ variable_choices=self._variable_choices,
+ initial_instructions=AGENT_INSTRUCTIONS,
+ )
+ assert "{{user_id}}" in prompt
+ assert "{{trip_purpose}}" in prompt
+ assert "example values" in prompt.lower()
+
+
+# ---------------------------------------------------------------------------
+# interpolate_variables — hyphenated key support
+# ---------------------------------------------------------------------------
+
+
+class TestInterpolateVariables:
+ def test_substitutes_standard_underscore_key(self):
+ result = interpolate_variables("Hello {{user_id}}", {"user_id": "abc"})
+ assert result == "Hello abc"
+
+ def test_substitutes_hyphenated_key(self):
+ result = interpolate_variables("Hello {{user-id}}", {"user-id": "abc"})
+ assert result == "Hello abc"
+
+ def test_leaves_unknown_placeholder_unchanged(self):
+ result = interpolate_variables("Hello {{unknown}}", {"user_id": "abc"})
+ assert result == "Hello {{unknown}}"
+
+ def test_leaves_unknown_hyphenated_placeholder_unchanged(self):
+ result = interpolate_variables("Hello {{bad-125}}", {"user_id": "abc"})
+ assert result == "Hello {{bad-125}}"
+
+ def test_mixed_keys_in_same_string(self):
+ result = interpolate_variables(
+ "{{user-id}} and {{trip_purpose}}",
+ {"user-id": "u-1", "trip_purpose": "leisure"},
+ )
+ assert result == "u-1 and leisure"
+
+ def test_empty_variables_leaves_text_unchanged(self):
+ result = interpolate_variables("{{foo}} bar", {})
+ assert result == "{{foo}} bar"
+
+
+# ---------------------------------------------------------------------------
+# restore_variable_placeholders
+# ---------------------------------------------------------------------------
+
+
+class TestRestoreVariablePlaceholders:
+ _CHOICES = [{"user_id": "user-123", "trip_purpose": "business"}]
+
+ def test_replaces_hardcoded_id_value(self):
+ text = "Use the user ID user-123 to look up preferences."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "{{user_id}}" in result
+ assert "user-123" not in result
+ assert len(warnings) == 1
+ assert "user-123" in warnings[0]
+ assert "{{user_id}}" in warnings[0]
+
+ def test_replaces_multiline_value_verbatim(self):
+ multiline_value = "line one\nline two\nline three"
+ choices = [{"body_text": multiline_value}]
+ text = f"Instructions:\n{multiline_value}\nEnd."
+ result, warnings = restore_variable_placeholders(text, choices)
+ assert "{{body_text}}" in result
+ assert multiline_value not in result
+ assert len(warnings) == 1
+
+ def test_skips_value_shorter_than_min_length(self):
+ choices = [{"lang": "en"}] # "en" is only 2 chars
+ text = "Use language en for this request."
+ result, warnings = restore_variable_placeholders(text, choices, min_value_length=3)
+ assert result == text
+ assert warnings == []
+
+ def test_does_not_partially_match_longer_token(self):
+ """'user-123' must not be replaced inside 'user-1234'."""
+ text = "Contact user-1234 for help."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "user-1234" in result
+ assert warnings == []
+
+ def test_replaces_multiple_variables(self):
+ text = "User user-123 is on a business trip."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert "{{user_id}}" in result
+ assert "{{trip_purpose}}" in result
+ assert "user-123" not in result
+ assert "business" not in result
+ assert len(warnings) == 2
+
+ def test_leaves_correct_placeholder_unchanged(self):
+ text = "User {{user_id}} is on a {{trip_purpose}} trip."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == text
+ assert warnings == []
+
+ def test_replaces_multiple_occurrences_of_same_value(self):
+ text = "user-123 and user-123 are duplicates."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == "{{user_id}} and {{user_id}} are duplicates."
+ assert "2 occurrence(s)" in warnings[0]
+
+ def test_longer_value_replaced_before_shorter_substring(self):
+ """When one value is a prefix of another, the longer one is replaced first."""
+ choices = [{"full_id": "user-123-admin", "short_id": "user-123"}]
+ text = "Admin is user-123-admin, regular is user-123."
+ result, warnings = restore_variable_placeholders(text, choices)
+ assert "{{full_id}}" in result
+ assert "{{short_id}}" in result
+ assert "user-123-admin" not in result
+ # The shorter value should not have corrupted the longer replacement
+ assert result.count("{{full_id}}") == 1
+ assert result.count("{{short_id}}") == 1
+
+ def test_replaces_brace_wrapped_value_without_double_bracketing(self):
+ """{{user-125}} must become {{user_id}}, not {{{{user_id}}}}."""
+ text = "Fetch preferences for user {{user-123}}."
+ result, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert result == "Fetch preferences for user {{user_id}}."
+ assert len(warnings) == 1
+
+ def test_empty_variable_choices_returns_text_unchanged(self):
+ text = "Some instructions here."
+ result, warnings = restore_variable_placeholders(text, [])
+ assert result == text
+ assert warnings == []
+
+ def test_warning_message_format(self):
+ text = "Handle user user-123 carefully."
+ _, warnings = restore_variable_placeholders(text, self._CHOICES)
+ assert any("user-123" in w for w in warnings)
+ assert any("{{user_id}}" in w for w in warnings)
+
+ async def test_apply_variation_response_calls_restore_and_logs_warning(self):
+ """_apply_new_variation_response must restore leaked values and log warnings."""
+ leaked_instructions = "You serve user user-123 on a business trip."
+ variation_response = json.dumps({
+ "current_instructions": leaked_instructions,
+ "current_parameters": {},
+ "model": "gpt-4o",
+ })
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=variation_response))
+ client = _make_client()
+ agent_config = _make_agent_config()
+ client._agent_key = "test-agent"
+ client._agent_config = agent_config
+ client._initial_instructions = AGENT_INSTRUCTIONS
+ client._initialize_class_members_from_config(agent_config)
+ client._options = _make_options(
+ handle_agent_call=handle_agent_call,
+ variable_choices=[{"user_id": "user-123", "trip_purpose": "business"}],
+ )
+
+ with patch("ldai_optimizer.client.logger") as mock_logger:
+ await client._generate_new_variation(iteration=1, variables={})
+ warning_calls = [
+ call for call in mock_logger.warning.call_args_list
+ if "user-123" in str(call) or "business" in str(call)
+ ]
+ assert len(warning_calls) >= 1
+
+ assert "{{user_id}}" in client._current_instructions
+ assert "user-123" not in client._current_instructions
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config helpers
+# ---------------------------------------------------------------------------
+
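+# Representative optimization config payload, shaped like the camelCase JSON returned by the LaunchDarkly REST API.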
+_API_CONFIG: Dict[str, Any] = {
+ "id": "opt-uuid-123",
+ "key": "my-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o", "gpt-4o-mini"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"language": "English"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?"],
+ "version": 2,
+ "createdAt": 1700000000,
+}
+
+
+def _make_from_config_options(**overrides: Any) -> OptimizationFromConfigOptions:
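+    """Build OptimizationFromConfigOptions with passing agent/judge mocks; override fields per test."""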
+ defaults: Dict[str, Any] = dict(
+ project_key="my-project",
+ context_choices=[LD_CONTEXT],
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="The answer is 4.")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ )
+ defaults.update(overrides)
+ return OptimizationFromConfigOptions(**defaults)
+
+
+def _make_mock_api_client() -> MagicMock:
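+    """Mock LDApiClient whose POST returns a fixed result id ("result-uuid-789") for PATCH assertions."""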
+ mock = MagicMock()
+ mock.post_agent_optimization_result = MagicMock(return_value="result-uuid-789")
+ mock.patch_agent_optimization_result = MagicMock()
+ mock.get_model_configs = MagicMock(return_value=[])
+ return mock
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config
+# ---------------------------------------------------------------------------
+
+
+class TestBuildOptionsFromConfig:
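+    """_build_options_from_config maps the API config payload onto OptimizationOptions and wires result persistence."""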
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "my-agent"
+ self.client._initialize_class_members_from_config(_make_agent_config())
+ self.client._options = _make_options()
+ self.api_client = _make_mock_api_client()
+
+ def _build(self, config=None, options=None) -> OptimizationOptions:
+ return self.client._build_options_from_config(
+ config or dict(_API_CONFIG),
+ options or _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=[],
+ )
+
+ def test_acceptance_statements_mapped_to_judges(self):
+ result = self._build()
+ assert "acceptance-statement-0" in result.judges
+ judge = result.judges["acceptance-statement-0"]
+ assert judge.acceptance_statement == "Be accurate."
+ assert judge.threshold == 0.9
+
+ def test_multiple_acceptance_statements_get_indexed_keys(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[
+ {"statement": "First.", "threshold": 0.8},
+ {"statement": "Second.", "threshold": 0.7},
+ ])
+ result = self._build(config=config)
+ assert "acceptance-statement-0" in result.judges
+ assert "acceptance-statement-1" in result.judges
+ assert result.judges["acceptance-statement-0"].acceptance_statement == "First."
+ assert result.judges["acceptance-statement-1"].acceptance_statement == "Second."
+
+ def test_judges_mapped_by_key(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[
+ {"key": "accuracy", "threshold": 0.85},
+ ])
+ result = self._build(config=config)
+ assert "accuracy" in result.judges
+ judge = result.judges["accuracy"]
+ assert judge.judge_key == "accuracy"
+ assert judge.threshold == 0.85
+
+ def test_acceptance_statements_and_judges_merged(self):
+ config = dict(_API_CONFIG,
+ acceptanceStatements=[{"statement": "Be brief.", "threshold": 0.8}],
+ judges=[{"key": "accuracy", "threshold": 0.9}],
+ )
+ result = self._build(config=config)
+ assert "acceptance-statement-0" in result.judges
+ assert "accuracy" in result.judges
+
+ def test_raises_when_no_judges_no_ground_truth_no_on_turn(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[])
+ with pytest.raises(ValueError, match="no acceptance statements or judges"):
+ self._build(config=config)
+
+ def test_ground_truth_responses_alone_does_not_pass_no_criteria_check(self):
+ # groundTruthResponses is not yet implemented as standalone criteria;
+ # OptimizationOptions still requires judges or on_turn.
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[], groundTruthResponses=["4"])
+        with pytest.raises(Exception):
+ self._build(config=config)
+
+ def test_on_turn_satisfies_no_judges_requirement(self):
+ config = dict(_API_CONFIG, acceptanceStatements=[], judges=[])
+ options = _make_from_config_options(on_turn=lambda ctx: True)
+ result = self._build(config=config, options=options)
+ assert result.on_turn is not None
+
+ def test_empty_variable_choices_defaults_to_single_empty_dict(self):
+ config = dict(_API_CONFIG, variableChoices=[])
+ result = self._build(config=config)
+ assert result.variable_choices == [{}]
+
+ def test_non_empty_variable_choices_passed_through(self):
+ result = self._build()
+ assert result.variable_choices == [{"language": "English"}]
+
+ def test_empty_user_input_options_becomes_none(self):
+ config = dict(_API_CONFIG, userInputOptions=[])
+ result = self._build(config=config)
+ assert result.user_input_options is None
+
+ def test_non_empty_user_input_options_passed_through(self):
+ result = self._build()
+ assert result.user_input_options == ["What is 2+2?"]
+
+ def test_max_attempts_from_config(self):
+ result = self._build()
+ assert result.max_attempts == 3
+
+ def test_model_choices_provider_prefix_stripped(self):
+ config = dict(_API_CONFIG, modelChoices=["OpenAI.gpt-4o", "Anthropic.claude-opus-4-5"])
+ result = self._build(config=config)
+ assert result.model_choices == ["gpt-4o", "claude-opus-4-5"]
+
+ def test_judge_model_provider_prefix_stripped(self):
+ config = dict(_API_CONFIG, judgeModel="OpenAI.gpt-4o")
+ result = self._build(config=config)
+ assert result.judge_model == "gpt-4o"
+
+ def test_model_choices_without_prefix_unchanged(self):
+ result = self._build()
+ assert result.model_choices == ["gpt-4o", "gpt-4o-mini"]
+
+ def test_judge_model_without_prefix_unchanged(self):
+ result = self._build()
+ assert result.judge_model == "gpt-4o"
+
+ def test_model_with_multiple_dots_only_prefix_stripped(self):
+ config = dict(_API_CONFIG, judgeModel="Anthropic.claude-opus-4.6")
+ result = self._build(config=config)
+ assert result.judge_model == "claude-opus-4.6"
+
+ def test_callbacks_forwarded_from_options(self):
+ handle_agent = AsyncMock(return_value=OptimizationResponse(output="ok"))
+ handle_judge = AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE))
+ options = _make_from_config_options(
+ handle_agent_call=handle_agent,
+ handle_judge_call=handle_judge,
+ on_passing_result=MagicMock(),
+ on_failing_result=MagicMock(),
+ )
+ result = self._build(options=options)
+ assert result.handle_agent_call is handle_agent
+ assert result.handle_judge_call is handle_judge
+ assert result.on_passing_result is options.on_passing_result
+ assert result.on_failing_result is options.on_failing_result
+
+ def test_persist_and_forward_posts_result_on_status_update(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={},
+ completion_response="The answer is 4.",
+ current_instructions="Be helpful.",
+ current_parameters={"temperature": 0.7},
+ current_variables={"language": "English"},
+ current_model="gpt-4o",
+ user_input="What is 2+2?",
+ iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ self.api_client.post_agent_optimization_result.assert_called_once()
+ call_args = self.api_client.post_agent_optimization_result.call_args
+ assert call_args[0][0] == "my-project"
+ assert call_args[0][1] == "opt-key-123"
+
+ def test_persist_and_forward_payload_has_correct_field_names(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"j": JudgeResult(score=0.9, rationale="Good.")},
+ completion_response="Paris.",
+ current_instructions="Be helpful.",
+ current_parameters={"temperature": 0.5},
+ current_variables={},
+ current_model="gpt-4o",
+ user_input="Capital of France?",
+ iteration=2,
+ )
+ result.on_status_update("evaluating", ctx)
+ # POST payload contains the camelCase iteration-level fields
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["instructions"] == "Be helpful."
+ assert post_payload["parameters"] == {"temperature": 0.5}
+ assert post_payload["userInput"] == "Capital of France?"
+ assert post_payload["iteration"] == 2
+ # Telemetry and scores are in the PATCH payload
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["completionResponse"] == "Paris."
+ assert "j" in patch_payload["scores"]
+
+ def test_persist_and_forward_scores_include_threshold_for_known_judges(self):
+ # Build with a config that has a known acceptance-statement judge (threshold=0.9)
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"acceptance-statement-0": JudgeResult(score=0.85, rationale="Close.")},
+ completion_response="An answer.",
+ current_instructions="Be helpful.",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+ result.on_status_update("evaluating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ score_entry = patch_payload["scores"]["acceptance-statement-0"]
+ assert score_entry["score"] == 0.85
+ assert score_entry["rationale"] == "Close."
+ assert score_entry["threshold"] == 0.9
+
+ def test_persist_and_forward_scores_omit_threshold_for_unknown_judge_key(self):
+ # A score whose key doesn't match any configured judge should not include threshold
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={"unknown-judge": JudgeResult(score=0.5, rationale="Unknown.")},
+ completion_response="Answer.",
+ current_instructions="",
+ current_parameters={},
+ current_variables={},
+ iteration=1,
+ )
+ result.on_status_update("evaluating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ score_entry = patch_payload["scores"]["unknown-judge"]
+ assert score_entry["score"] == 0.5
+ assert "threshold" not in score_entry
+
+ def test_persist_and_forward_includes_run_id_and_version(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["runId"] == "run-uuid-456"
+ assert post_payload["agentOptimizationVersion"] == 2
+
+ def test_second_call_same_iteration_does_not_post_again(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ result.on_status_update("evaluating", ctx)
+ # POST is called only once (first encounter of iteration 1)
+ assert self.api_client.post_agent_optimization_result.call_count == 1
+ # PATCH is called twice
+ assert self.api_client.patch_agent_optimization_result.call_count == 2
+
+ def test_each_new_iteration_posts_a_new_record(self):
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1)
+ result.on_status_update("generating", ctx2)
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+
+ @pytest.mark.parametrize("sdk_status,expected_status,expected_activity", [
+ ("init", "RUNNING", "PENDING"),
+ ("generating", "RUNNING", "GENERATING"),
+ ("evaluating", "RUNNING", "EVALUATING"),
+ ("generating variation", "RUNNING", "GENERATING_VARIATION"),
+ ("validating", "RUNNING", "EVALUATING"),
+ ("turn completed", "RUNNING", "COMPLETED"),
+ ("success", "PASSED", "COMPLETED"),
+ ("failure", "FAILED", "COMPLETED"),
+ ])
+ def test_status_mapping(self, sdk_status, expected_status, expected_activity):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update(sdk_status, ctx)
+ # status and activity are in the PATCH payload, not the POST payload
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["status"] == expected_status
+ assert patch_payload["activity"] == expected_activity
+
+ def test_user_on_status_update_chained_after_post_and_patch(self):
+ call_order = []
+ self.api_client.post_agent_optimization_result.side_effect = (
+ lambda *a, **kw: call_order.append("post") or "result-id"
+ )
+ self.api_client.patch_agent_optimization_result.side_effect = (
+ lambda *a, **kw: call_order.append("patch")
+ )
+ user_cb = MagicMock(side_effect=lambda s, c: call_order.append("user"))
+ options = _make_from_config_options(on_status_update=user_cb)
+ result = self._build(options=options)
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ assert call_order == ["post", "patch", "user"]
+
+ def test_user_on_status_update_exception_does_not_propagate(self):
+ options = _make_from_config_options(
+ on_status_update=MagicMock(side_effect=RuntimeError("cb boom"))
+ )
+ result = self._build(options=options)
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx) # must not raise
+
+ def test_post_payload_does_not_contain_history(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert "history" not in post_payload
+
+ @pytest.mark.parametrize("status", [
+ "init", "generating", "evaluating", "generating variation",
+ "validating", "turn completed", "success", "failure",
+ ])
+ def test_variation_included_in_patch_for_all_statuses(self, status):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={},
+ completion_response="answer",
+ current_instructions="Be concise.",
+ current_parameters={"temperature": 0.3},
+ current_variables={},
+ current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert "variation" in patch_payload
+ assert patch_payload["variation"]["instructions"] == "Be concise."
+ assert patch_payload["variation"]["parameters"] == {"temperature": 0.3}
+
+ @pytest.mark.parametrize("status", ["generating", "evaluating", "success"])
+ def test_model_config_key_prefers_global_in_variation(self, status):
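+        """When two model configs share an id, the global entry's key is used as variation.modelConfigKey."""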
+ model_configs = [
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ]
+ result = self.client._build_options_from_config(
+ dict(_API_CONFIG),
+ _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=model_configs,
+ )
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="instr",
+ current_parameters={}, current_variables={}, current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["variation"]["modelConfigKey"] == "global.gpt-4o"
+
+ @pytest.mark.parametrize("status", ["generating", "evaluating", "success"])
+ def test_model_config_key_resolved_in_variation(self, status):
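+        """A model id resolves to the matching model config's key in the PATCH variation payload."""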
+ model_configs = [{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}]
+ result = self.client._build_options_from_config(
+ dict(_API_CONFIG),
+ _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-key-123",
+ run_id="run-uuid-456",
+ model_configs=model_configs,
+ )
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="instr",
+ current_parameters={}, current_variables={}, current_model="gpt-4o",
+ iteration=1,
+ )
+ result.on_status_update(status, ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["variation"]["modelConfigKey"] == "OpenAI.gpt-4o"
+
+ def test_generation_latency_cast_to_int(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, duration_ms=123.7,
+ iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ patch_payload = self.api_client.patch_agent_optimization_result.call_args[0][3]
+ assert patch_payload["generationLatency"] == 123
+ assert isinstance(patch_payload["generationLatency"], int)
+
+ def test_last_optimization_result_id_updated_on_post(self):
+ result = self._build()
+ ctx = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("generating", ctx)
+ assert self.client._last_optimization_result_id == "result-uuid-789"
+
+ def test_validation_sub_iterations_do_not_create_new_records(self):
+ """Validation sub-iterations should be folded into the parent iteration's record."""
+ result = self._build()
+ ctx_main = OptimizationContext(
+ scores={}, completion_response="a", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx_val1 = OptimizationContext(
+ scores={}, completion_response="b", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx_val2 = OptimizationContext(
+ scores={}, completion_response="c", current_instructions="i",
+ current_parameters={}, current_variables={}, iteration=3,
+ )
+ result.on_status_update("generating", ctx_main) # POST iter 1
+ result.on_status_update("evaluating", ctx_main) # PATCH iter 1
+ result.on_status_update("validating", ctx_main) # enter validation; PATCH iter 1
+ result.on_status_update("generating", ctx_val1) # validation sub-iter → folded to iter 1
+ result.on_status_update("evaluating", ctx_val1) # folded to iter 1
+ result.on_status_update("generating", ctx_val2) # validation sub-iter → folded to iter 1
+ result.on_status_update("evaluating", ctx_val2) # folded to iter 1
+ result.on_status_update("success", ctx_val2) # folded to iter 1; reset validation
+
+ # Only one POST for the single main iteration
+ assert self.api_client.post_agent_optimization_result.call_count == 1
+ post_payload = self.api_client.post_agent_optimization_result.call_args[0][2]
+ assert post_payload["iteration"] == 1
+
+ def test_validation_success_patches_parent_iteration_record(self):
+ """success event during validation should PATCH the main iteration's record, not a new one."""
+ result = self._build()
+ ctx_main = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx_val = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=3,
+ )
+ result.on_status_update("generating", ctx_main)
+ result.on_status_update("validating", ctx_main)
+ result.on_status_update("generating", ctx_val)
+ result.on_status_update("success", ctx_val)
+
+ # PATCH for success should use the result_id of the parent (iter 2) record
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ success_patch = next(
+ c for c in patch_calls if c[0][3].get("status") == "PASSED"
+ )
+ # Third positional arg is result_id — it should be the one returned from the POST for iter 2
+ assert success_patch[0][2] == "result-uuid-789"
+
+ def test_validation_phase_resets_after_turn_completed(self):
+ """After turn completed, subsequent main-loop iterations create their own records."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx_val = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1) # POST iter 1
+ result.on_status_update("validating", ctx1) # enter validation
+ result.on_status_update("generating", ctx_val) # folded to iter 1
+ result.on_status_update("turn completed", ctx_val) # reset validation phase
+ result.on_status_update("generating", ctx2) # POST iter 2 (new main attempt)
+
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+
+ def test_init_iteration_closed_when_first_real_iteration_begins(self):
+ """The init record (iter 0) must receive a RUNNING:COMPLETED patch before iter 1 starts."""
+ result = self._build()
+ ctx0 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=0,
+ )
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ result.on_status_update("init", ctx0) # POST iter 0, PATCH RUNNING:PENDING
+ result.on_status_update("generating", ctx1) # should close iter 0, then POST iter 1
+
+ # iter 0 POSTed + iter 1 POSTed
+ assert self.api_client.post_agent_optimization_result.call_count == 2
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ # Patches: (1) init PENDING, (2) auto-close COMPLETED, (3) generating GENERATING
+ assert len(patch_calls) == 3
+ payloads = [c[0][3] for c in patch_calls]
+ assert payloads[0]["status"] == "RUNNING"
+ assert payloads[0]["activity"] == "PENDING"
+ assert "variation" in payloads[0]
+ assert payloads[1] == {"status": "RUNNING", "activity": "COMPLETED"} # auto-close patch has no variation
+ assert payloads[2]["status"] == "RUNNING"
+ assert payloads[2]["activity"] == "GENERATING"
+ assert "variation" in payloads[2]
+
+ def test_non_final_gt_sample_closed_when_next_sample_begins(self):
+ """In a GT batch, each sample except the last should receive a RUNNING:COMPLETED patch
+ when the next sample's generating event fires."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 2+2?", iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 3+3?", iteration=2,
+ )
+ ctx3 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, user_input="What is 4+4?", iteration=3,
+ )
+ result.on_status_update("generating", ctx1) # POST iter 1
+ result.on_status_update("evaluating", ctx1) # PATCH iter 1 (EVALUATING)
+ result.on_status_update("generating", ctx2) # should auto-close iter 1, then POST iter 2
+ result.on_status_update("evaluating", ctx2) # PATCH iter 2 (EVALUATING)
+ result.on_status_update("generating", ctx3) # should auto-close iter 2, then POST iter 3
+
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ activities = [c[0][3].get("activity") for c in patch_calls]
+ # Expected sequence: GENERATING, EVALUATING, COMPLETED (auto-close 1),
+ # GENERATING, EVALUATING, COMPLETED (auto-close 2), GENERATING
+ assert activities.count("COMPLETED") >= 2, (
+ f"Expected at least 2 COMPLETED patches, got: {activities}"
+ )
+ # The auto-close patches must appear BEFORE the subsequent GENERATING patches
+ completed_indices = [i for i, a in enumerate(activities) if a == "COMPLETED"]
+ generating_indices = [i for i, a in enumerate(activities) if a == "GENERATING"]
+ # Each auto-close patch should precede the next generating patch
+ assert completed_indices[0] < generating_indices[1]
+ assert completed_indices[1] < generating_indices[2]
+
+ def test_terminal_event_clears_open_iteration_so_next_generating_does_not_double_close(self):
+ """After a terminal event (turn completed), the next generating should not try to
+ close the already-closed iteration again."""
+ result = self._build()
+ ctx1 = OptimizationContext(
+ scores={}, completion_response="answer", current_instructions="Be helpful.",
+ current_parameters={}, current_variables={}, iteration=1,
+ )
+ ctx2 = OptimizationContext(
+ scores={}, completion_response="", current_instructions="",
+ current_parameters={}, current_variables={}, iteration=2,
+ )
+ result.on_status_update("generating", ctx1) # open iter 1
+ result.on_status_update("turn completed", ctx1) # close iter 1 explicitly
+ result.on_status_update("generating", ctx2) # new iter — should NOT re-close iter 1
+
+ patch_calls = self.api_client.patch_agent_optimization_result.call_args_list
+ # The only RUNNING:COMPLETED patch should be from "turn completed", not from the
+ # auto-close triggered by iter 2's generating event.
+ completed_patches = [
+ c for c in patch_calls
+ if c[0][3].get("status") == "RUNNING" and c[0][3].get("activity") == "COMPLETED"
+ ]
+ assert len(completed_patches) == 1, (
+ "Expected exactly one RUNNING:COMPLETED patch (from turn completed), not a duplicate"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Token limiting
+# ---------------------------------------------------------------------------
+
+
+class TestTokenLimiting:
+ """Tests that the process halts and marks itself failed when token usage
+ meets or exceeds the configured token_limit."""
+
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ # -- chaos (optimize_from_options) -----------------------------------
+
+ async def test_chaos_stops_when_token_limit_exceeded_on_first_iteration(self):
+ """Token limit exceeded after the first agent turn should immediately fail."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=499, # limit is below the 500 tokens returned
+ max_attempts=5,
+ )
+        await client.optimize_from_options("test-agent", options)
+ # Should have called the agent exactly once then stopped
+ assert handle_agent_call.call_count == 1
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_does_not_stop_when_token_limit_not_exceeded(self):
+ """Process should continue normally when total tokens stay below limit."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=10000,
+ max_attempts=3,
+ )
+        await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is True
+
+ async def test_chaos_stops_when_limit_reached_exactly(self):
+ """gte logic: limit == total usage should trigger failure."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=500, # exactly equal — should trigger
+ max_attempts=5,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert handle_agent_call.call_count == 1
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_judge_tokens_accumulate_toward_limit(self):
+ """Judge token usage is included in the running total."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ # Judge response contributes 450 tokens — combined with agent's 100 → 550 > 200
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output=JUDGE_PASS_RESPONSE,
+ usage=TokenUsage(total=450, input=300, output=150),
+ )
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 100 (agent) + 450 (judge) = 550 > 200
+ max_attempts=5,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+
+ async def test_chaos_no_limit_does_not_enforce_token_cap(self):
+ """When token_limit is None, no cap is applied regardless of usage."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=999999, input=500000, output=499999),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ # no token_limit set
+ max_attempts=3,
+ )
+        await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is True
+
+ async def test_chaos_on_failing_result_called_on_token_limit(self):
+ """on_failing_result callback is fired when token limit halts the run."""
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Some answer.",
+ usage=TokenUsage(total=500, input=300, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=100,
+ max_attempts=5,
+ on_failing_result=on_failing,
+ )
+ await client.optimize_from_options("test-agent", options)
+ on_failing.assert_called_once()
+
+ async def test_chaos_accumulates_across_multiple_iterations(self):
+ """Tokens from successive iterations add up until the limit is hit."""
+ # Each agent call returns 100 tokens; limit is 250, so it trips on the 3rd call
+ agent_responses = [
+ OptimizationResponse(output="Bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation (no usage)
+ OptimizationResponse(output="Still bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation (no usage)
+ OptimizationResponse(output="Still bad.", usage=TokenUsage(total=100, input=60, output=40)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=250, # 100+100 = 200 ok; 200+100 = 300 ≥ 250 → stop on 3rd
+ max_attempts=10,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+ # 3 agent calls + 2 variation calls = 5 total; no more
+ assert handle_agent_call.call_count == 5
+
+ async def test_chaos_token_limit_in_validation_phase_stops_run(self):
+ """Exceeding the limit during the validation phase also halts the run."""
+ # The main iteration passes judges (50 tokens), but each validation call
+ # adds 300 tokens, pushing the total over 200.
+ agent_responses = [
+ OptimizationResponse(output="Good answer.", usage=TokenUsage(total=50, input=30, output=20)), # main turn
+ OptimizationResponse(output="Validation answer.", usage=TokenUsage(total=300, input=200, output=100)), # validation
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 50 (main) + 300 (validation) = 350 > 200
+ max_attempts=5,
+ variable_choices=[{"language": "English"}, {"language": "French"}],
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._last_run_succeeded is False
+
+ # -- ground truth (optimize_from_ground_truth_options) ---------------
+
+ async def test_gt_stops_when_token_limit_exceeded_on_first_sample(self):
+ """Token limit exceeded on the first sample should immediately fail the GT run."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=600, input=400, output=200),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=500, # 600 > 500 → trip on first sample
+ max_attempts=5,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is False
+ # Only one agent call should have happened
+ assert handle_agent_call.call_count == 1
+ # The offending context is still returned in the results list
+ assert len(results) == 1
+
+ async def test_gt_stops_mid_batch_when_limit_exceeded_on_second_sample(self):
+ """Token limit exceeded on the second of two samples stops after that sample."""
+ agent_responses = [
+ OptimizationResponse(output="Answer 1.", usage=TokenUsage(total=100, input=60, output=40)),
+ OptimizationResponse(output="Answer 2.", usage=TokenUsage(total=200, input=120, output=80)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=250, # 100 + 200 = 300 ≥ 250 → trip on second sample
+ max_attempts=5,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is False
+ assert handle_agent_call.call_count == 2
+ # Both samples processed so far are in the results
+ assert len(results) == 2
+
+ async def test_gt_on_failing_result_called_on_token_limit(self):
+ """on_failing_result callback fires when GT run halts due to token limit."""
+ on_failing = MagicMock()
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=600, input=400, output=200),
+ )
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ token_limit=100,
+ max_attempts=5,
+ on_failing_result=on_failing,
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ on_failing.assert_called_once()
+
+ async def test_gt_no_limit_does_not_enforce_token_cap(self):
+ """When token_limit is None on GT options, no cap is applied."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=999999, input=500000, output=499999),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ # no token_limit
+ )
+        await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is True
+
+ async def test_gt_accumulates_tokens_across_samples_in_same_attempt(self):
+ """Tokens from all samples in the same attempt add up correctly."""
+ agent_responses = [
+ OptimizationResponse(output="Answer 1.", usage=TokenUsage(total=80, input=50, output=30)),
+ OptimizationResponse(output="Answer 2.", usage=TokenUsage(total=80, input=50, output=30)),
+ ]
+ handle_agent_call = AsyncMock(side_effect=agent_responses)
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=200, # 80 + 80 = 160 < 200, run should succeed
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert client._last_run_succeeded is True
+ assert len(results) == 2
+
+ # -- _total_token_usage reset between runs ---------------------------
+
+ async def test_total_token_usage_resets_between_runs(self):
+ """_total_token_usage is reset at the start of each run so a reused
+ client does not carry over counts from previous optimizations."""
+ handle_agent_call = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=100, input=60, output=40),
+ )
+ )
+ handle_judge_call = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ client = _make_client(self.mock_ldai)
+
+ # First run accumulates tokens
+ options = _make_options(
+ handle_agent_call=handle_agent_call,
+ handle_judge_call=handle_judge_call,
+ token_limit=10000,
+ )
+ await client.optimize_from_options("test-agent", options)
+ assert client._total_token_usage > 0
+
+ # Second run starts fresh — use a tight limit that would fail if
+ # tokens from run 1 were carried over
+ handle_agent_call2 = AsyncMock(
+ return_value=OptimizationResponse(
+ output="Answer.",
+ usage=TokenUsage(total=50, input=30, output=20),
+ )
+ )
+ handle_judge_call2 = AsyncMock(
+ return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+ )
+ options2 = _make_options(
+ handle_agent_call=handle_agent_call2,
+ handle_judge_call=handle_judge_call2,
+ token_limit=10000,
+ )
+ await client.optimize_from_options("test-agent", options2)
+ assert client._last_run_succeeded is True
+
+
+# ---------------------------------------------------------------------------
+# optimize_from_config
+# ---------------------------------------------------------------------------
+
+
+class TestOptimizeFromConfig:
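+    """optimize_from_config fetches the optimization config from the REST API and runs the full loop."""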
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return _make_client(self.mock_ldai)
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {}, clear=True):
+ client = OptimizationClient(self.mock_ldai)
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_raises_without_api_key(self):
+ client = self._make_client_without_key()
+ options = _make_from_config_options()
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY is not set"):
+ await client.optimize_from_config("my-opt", options)
+
+ async def test_fetches_config_and_uses_ai_config_key(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ mock_api.get_agent_optimization.assert_called_once_with("my-project", "my-opt")
+ assert client._agent_key == "my-agent"
+
+ async def test_posts_result_on_each_status_event(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ assert mock_api.post_agent_optimization_result.call_count >= 1
+
+ async def test_user_on_status_update_called_during_run(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ statuses = []
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options(
+ on_status_update=lambda status, ctx: statuses.append(status)
+ )
+ await client.optimize_from_config("my-opt", options)
+
+ assert "generating" in statuses
+ assert "success" in statuses
+
+ async def test_custom_base_url_passed_to_api_client(self):
+ client = self._make_client_with_key()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ instance = _make_mock_api_client()
+ instance.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ MockLDApiClient.return_value = instance
+ options = _make_from_config_options(base_url="https://staging.launchdarkly.com")
+ await client.optimize_from_config("my-opt", options)
+
+ MockLDApiClient.assert_called_once_with(
+ "test-api-key", base_url="https://staging.launchdarkly.com"
+ )
+
+ async def test_no_base_url_does_not_pass_kwarg(self):
+ client = self._make_client_with_key()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ instance = _make_mock_api_client()
+ instance.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ MockLDApiClient.return_value = instance
+ options = _make_from_config_options()
+ await client.optimize_from_config("my-opt", options)
+
+ MockLDApiClient.assert_called_once_with("test-api-key")
+
+ async def test_returns_optimization_context_on_success(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options()
+ result = await client.optimize_from_config("my-opt", options)
+
+ assert isinstance(result, OptimizationContext)
+ assert result.completion_response == "The answer is 4."
+
+
+# ---------------------------------------------------------------------------
+# GroundTruthSample / GroundTruthOptimizationOptions dataclass validation
+# ---------------------------------------------------------------------------
+
+
+class TestGroundTruthSampleDataclass:
+ def test_required_fields(self):
+ s = GroundTruthSample(user_input="hi", expected_response="hello")
+ assert s.user_input == "hi"
+ assert s.expected_response == "hello"
+ assert s.variables == {}
+
+ def test_variables_populated(self):
+ s = GroundTruthSample(user_input="hi", expected_response="hello", variables={"lang": "en"})
+ assert s.variables == {"lang": "en"}
+
+
+class TestGroundTruthOptimizationOptionsValidation:
+ def _make(self, **overrides) -> GroundTruthOptimizationOptions:
+ defaults = dict(
+ context_choices=[LD_CONTEXT],
+ ground_truth_responses=[
+ GroundTruthSample(user_input="q1", expected_response="a1"),
+ ],
+ max_attempts=3,
+ model_choices=["gpt-4o"],
+ judge_model="gpt-4o",
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="ans")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ )
+ defaults.update(overrides)
+ return GroundTruthOptimizationOptions(**defaults)
+
+ def test_valid_options_created(self):
+ opts = self._make()
+ assert len(opts.ground_truth_responses) == 1
+
+ def test_raises_empty_model_choices(self):
+ with pytest.raises(ValueError, match="model_choices"):
+ self._make(model_choices=[])
+
+ def test_raises_empty_ground_truth_responses(self):
+ with pytest.raises(ValueError, match="ground_truth_responses"):
+ self._make(ground_truth_responses=[])
+
+ def test_raises_no_judges_and_no_on_turn(self):
+ with pytest.raises(ValueError, match="judges or on_turn"):
+ self._make(judges=None, on_turn=None)
+
+ def test_on_turn_satisfies_criteria_requirement(self):
+ opts = self._make(judges=None, on_turn=lambda ctx: True)
+ assert opts.on_turn is not None
+
+
+# ---------------------------------------------------------------------------
+# _run_ground_truth_optimization / optimize_from_ground_truth_options
+# ---------------------------------------------------------------------------
+
+
+def _make_gt_options(**overrides) -> GroundTruthOptimizationOptions:
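+    """Build GroundTruthOptimizationOptions with two passing samples; override fields per test."""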
+ defaults: Dict[str, Any] = dict(
+ context_choices=[LD_CONTEXT],
+ ground_truth_responses=[
+ GroundTruthSample(user_input="What is 2+2?", expected_response="4", variables={"lang": "English"}),
+ GroundTruthSample(user_input="What is 3+3?", expected_response="6", variables={"lang": "English"}),
+ ],
+ max_attempts=3,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ judge_model="gpt-4o",
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="The answer is correct.")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ )
+ defaults.update(overrides)
+ return GroundTruthOptimizationOptions(**defaults)
+
+
+def _make_winning_context(
+ model: str = "gpt-4o",
+ instructions: str = "Be helpful.",
+ parameters: Dict[str, Any] | None = None,
+) -> OptimizationContext:
+ """Return a minimal OptimizationContext representing a successful run."""
+ return OptimizationContext(
+ scores={},
+ completion_response="The answer is 4.",
+ current_instructions=instructions,
+ current_parameters=parameters or {},
+ current_variables={},
+ current_model=model,
+ iteration=1,
+ )
+
+
+def _make_api_client_for_commit(
+ existing_variation_keys: list | None = None,
+ model_configs: list | None = None,
+) -> MagicMock:
+ """Return a mock LDApiClient pre-configured for _commit_variation calls."""
+ mock = MagicMock()
+ existing = existing_variation_keys or []
+ mock.get_ai_config.return_value = {"variations": [{"key": k} for k in existing]}
+ mock.get_model_configs.return_value = model_configs if model_configs is not None else [
+ {"id": "gpt-4o", "key": "OpenAI.gpt-4o"},
+ {"id": "gpt-4o-mini", "key": "OpenAI.gpt-4o-mini"},
+ ]
+ mock.create_ai_config_variation.return_value = {"key": "new-variation"}
+ return mock
+
+
+class TestRunGroundTruthOptimization:
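+    """optimize_from_ground_truth_options returns one context per sample and fires the per-sample callbacks."""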
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _make_client(self) -> OptimizationClient:
+ return _make_client(self.mock_ldai)
+
+ async def test_returns_list_of_contexts_on_success(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(results) == 2
+ for ctx in results:
+ assert isinstance(ctx, OptimizationContext)
+
+ async def test_each_context_has_correct_user_input(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert results[0].user_input == "What is 2+2?"
+ assert results[1].user_input == "What is 3+3?"
+
+ async def test_completion_response_set_on_each_context(self):
+ client = self._make_client()
+ opts = _make_gt_options(handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="42")))
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ for ctx in results:
+ assert ctx.completion_response == "42"
+
+ async def test_on_sample_result_called_per_sample(self):
+ client = self._make_client()
+ sample_results = []
+ opts = _make_gt_options(on_sample_result=lambda ctx: sample_results.append(ctx))
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(sample_results) == 2
+
+ async def test_on_passing_result_called_once_on_success(self):
+ client = self._make_client()
+ passing_calls = []
+ opts = _make_gt_options(on_passing_result=lambda ctx: passing_calls.append(ctx))
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(passing_calls) == 1
+
+ async def test_on_failing_result_called_when_max_attempts_exceeded(self):
+ client = self._make_client()
+ failing_calls = []
+ opts = _make_gt_options(
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=2,
+ on_failing_result=lambda ctx: failing_calls.append(ctx),
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(failing_calls) == 1
+
+ async def test_generates_variation_when_any_sample_fails(self):
+ client = self._make_client()
+ judge_responses = [
+ JUDGE_PASS_RESPONSE, # sample 1 attempt 1 — pass
+ JUDGE_FAIL_RESPONSE, # sample 2 attempt 1 — fail → trigger variation
+ JUDGE_PASS_RESPONSE, # sample 1 attempt 2 — pass
+ JUDGE_PASS_RESPONSE, # sample 2 attempt 2 — pass
+ ]
+ call_count = 0
+ async def side_effect(*args, **kwargs):
+ nonlocal call_count
+ resp = judge_responses[call_count]
+ call_count += 1
+ return OptimizationResponse(output=resp)
+
+ opts = _make_gt_options(
+ handle_judge_call=side_effect,
+ handle_agent_call=AsyncMock(side_effect=[
+ OptimizationResponse(output="ans1"),
+ OptimizationResponse(output="ans2"), # attempt 1 samples
+ OptimizationResponse(output=VARIATION_RESPONSE), # variation generation
+ OptimizationResponse(output="ans3"),
+ OptimizationResponse(output="ans4"), # attempt 2 samples
+ ]),
+ max_attempts=3,
+ )
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert isinstance(results, list)
+ assert len(results) == 2
+
+ async def test_iteration_numbers_are_linear_and_unique(self):
+ client = self._make_client()
+ opts = _make_gt_options()
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ iterations = [ctx.iteration for ctx in results]
+ assert len(set(iterations)) == len(iterations)
+
+ async def test_on_sample_result_exception_does_not_abort(self):
+ client = self._make_client()
+
+ def bad_callback(ctx):
+ raise RuntimeError("boom")
+
+ opts = _make_gt_options(on_sample_result=bad_callback)
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert len(results) == 2
+
+ async def test_variables_from_samples_used_per_evaluation(self):
+ client = self._make_client()
+ received_contexts = []
+ async def capture_agent_call(key, config, ctx, is_evaluation=False):
+ received_contexts.append(ctx)
+ return OptimizationResponse(output="response")
+
+ opts = _make_gt_options(
+ ground_truth_responses=[
+ GroundTruthSample(user_input="q1", expected_response="a1", variables={"lang": "English"}),
+ GroundTruthSample(user_input="q2", expected_response="a2", variables={"lang": "French"}),
+ ],
+ handle_agent_call=capture_agent_call,
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert received_contexts[0].current_variables == {"lang": "English"}
+ assert received_contexts[1].current_variables == {"lang": "French"}
+
+ async def test_model_falls_back_to_first_model_choice_when_agent_config_has_no_model(self):
+ """When the LD agent config has no model name the first model_choices entry is used."""
+ config_without_model = _make_agent_config(model_name="")
+ mock_ldai = _make_ldai_client(agent_config=config_without_model)
+ client = _make_client(mock_ldai)
+
+ observed_models = []
+ async def capture(key, config, ctx, is_evaluation=False):
+ observed_models.append(config.model.name if config.model else None)
+ return OptimizationResponse(output="answer")
+
+ opts = _make_gt_options(
+ handle_agent_call=capture,
+ model_choices=["gpt-4o", "gpt-4o-mini"],
+ )
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+ assert all(m == "gpt-4o" for m in observed_models), (
+ f"Expected all agent calls to use 'gpt-4o' (fallback), got: {observed_models}"
+ )
+
+ async def test_missing_instructions_raises_value_error(self):
+ """An agent config with no instructions raises ValueError before the loop starts."""
+ config_no_instructions = _make_agent_config(instructions="")
+ mock_ldai = _make_ldai_client(agent_config=config_no_instructions)
+ # variation() also needs to return no instructions so the fallback doesn't hide the gap.
+ mock_ldai._client.variation.return_value = {"instructions": ""}
+ client = _make_client(mock_ldai)
+
+ opts = _make_gt_options()
+ with pytest.raises(ValueError, match="has no instructions configured"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+
+# ---------------------------------------------------------------------------
+# expected_response in judge evaluation
+# ---------------------------------------------------------------------------
+
+
+class TestExpectedResponseInJudges:
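+    """expected_response is injected into the acceptance judge's user message rather than the system prompt."""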
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "test-agent"
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ async def test_expected_response_included_in_acceptance_judge_user_message(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ expected_response="The expected answer is 42.",
+ )
+ assert len(captured_configs) == 1
+ user_msg = captured_configs[0].messages[-1].content
+ assert "The expected answer is 42." in user_msg
+
+ async def test_expected_response_in_acceptance_judge_user_message(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ expected_response="gold standard",
+ )
+ user_msg = captured_configs[0].messages[1].content
+ assert "gold standard" in user_msg
+ assert "expected response" in user_msg.lower()
+ # Scoring instructions should now live in the user message, not the system prompt
+ system_msg = captured_configs[0].messages[0].content
+ assert "gold standard" not in system_msg
+
+ async def test_no_expected_response_leaves_judge_messages_unchanged(self):
+ captured_configs = []
+
+ async def capture_judge_call(key, config, ctx, is_evaluation=False):
+ captured_configs.append(config)
+ return OptimizationResponse(output=JUDGE_PASS_RESPONSE)
+
+ self.client._options = _make_options(
+ judges={
+ "acc": OptimizationJudge(threshold=0.8, acceptance_statement="Be accurate.")
+ },
+ handle_judge_call=capture_judge_call,
+ )
+ await self.client._execute_agent_turn(
+ self.client._create_optimization_context(iteration=1, variables={}),
+ 1,
+ )
+ user_msg = captured_configs[0].messages[-1].content
+ assert "expected response" not in user_msg.lower()
+
+
+# ---------------------------------------------------------------------------
+# _build_options_from_config — ground truth path
+# ---------------------------------------------------------------------------
+
+
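+# Like _API_CONFIG, but with groundTruthResponses; userInputOptions, groundTruthResponses, and
+# variableChoices are zipped by index into GroundTruthSample entries.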
+_API_CONFIG_WITH_GT: Dict[str, Any] = {
+ "id": "opt-gt-uuid",
+ "key": "my-gt-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"lang": "English"}, {"lang": "French"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?", "What is 3+3?"],
+ "groundTruthResponses": ["4", "6"],
+ "version": 1,
+ "createdAt": 1700000000,
+}
+
+
+class TestBuildOptionsFromConfigGroundTruth:
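+    """_build_options_from_config returns GroundTruthOptimizationOptions whenever groundTruthResponses is present."""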
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._agent_key = "my-agent"
+ self.client._initialize_class_members_from_config(_make_agent_config())
+ self.client._options = _make_options()
+ self.api_client = _make_mock_api_client()
+
+ def _build(self, config=None, options=None):
+ return self.client._build_options_from_config(
+ config or dict(_API_CONFIG_WITH_GT),
+ options or _make_from_config_options(),
+ self.api_client,
+ optimization_key="opt-gt-key",
+ run_id="run-uuid-789",
+ model_configs=[],
+ )
+
+ def test_returns_ground_truth_options_when_gt_present(self):
+ result = self._build()
+ assert isinstance(result, GroundTruthOptimizationOptions)
+
+ def test_samples_zipped_by_index(self):
+ result = self._build()
+ assert isinstance(result, GroundTruthOptimizationOptions)
+ assert len(result.ground_truth_responses) == 2
+ s0 = result.ground_truth_responses[0]
+ assert s0.user_input == "What is 2+2?"
+ assert s0.expected_response == "4"
+ assert s0.variables == {"lang": "English"}
+ s1 = result.ground_truth_responses[1]
+ assert s1.user_input == "What is 3+3?"
+ assert s1.expected_response == "6"
+ assert s1.variables == {"lang": "French"}
+
+ def test_model_choices_have_prefix_stripped(self):
+ config = dict(_API_CONFIG_WITH_GT)
+ config["modelChoices"] = ["OpenAI.gpt-4o"]
+ result = self._build(config=config)
+ assert isinstance(result, GroundTruthOptimizationOptions)
+ assert result.model_choices == ["gpt-4o"]
+
+ def test_raises_on_mismatched_lengths(self):
+ config = dict(_API_CONFIG_WITH_GT)
+ config["userInputOptions"] = ["only one input"]
+ with pytest.raises(ValueError, match="same length"):
+ self._build(config=config)
+
+ def test_returns_standard_options_when_no_gt(self):
+ config = dict(_API_CONFIG) # no groundTruthResponses
+ result = self._build(config=config)
+ assert isinstance(result, OptimizationOptions)
+
+ async def test_optimize_from_config_dispatches_to_gt_run(self):
+ mock_ldai = _make_ldai_client()
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-key"}):
+ client = _make_client(mock_ldai)
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG_WITH_GT))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ options = _make_from_config_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="correct answer")),
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_PASS_RESPONSE)),
+ )
+ result = await client.optimize_from_config("my-gt-opt", options)
+
+ assert isinstance(result, list)
+ assert len(result) == 2
+
+
+# ---------------------------------------------------------------------------
+# _acceptance_criteria_implies_duration_optimization
+# ---------------------------------------------------------------------------
+
+
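+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# helper appears to scan every judge's acceptance_statement for latency-related
+# keywords, case-insensitively, and returns True if any judge matches. The
+# exact keyword list is an assumption based on the cases tested here.
+_SKETCH_LATENCY_KEYWORDS = (
+    "fast", "latency", "duration", "ms", "response time", "efficient", "snappy",
+)
+
+
+def _sketch_implies_duration_optimization(judges):
+    if not judges:
+        return False
+    return any(
+        keyword in (judge.acceptance_statement or "").lower()
+        for judge in judges.values()
+        for keyword in _SKETCH_LATENCY_KEYWORDS
+    )
+
+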
+class TestAcceptanceCriteriaImpliesDurationOptimization:
+ def test_returns_false_when_judges_is_none(self):
+ assert _acceptance_criteria_implies_duration_optimization(None) is False
+
+ def test_returns_false_when_judges_is_empty(self):
+ assert _acceptance_criteria_implies_duration_optimization({}) is False
+
+ def test_returns_false_when_no_acceptance_statements(self):
+ judges = {"quality": OptimizationJudge(threshold=0.8, judge_key="judge-1")}
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+ def test_returns_false_when_acceptance_statement_has_no_latency_keywords(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and complete.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+ def test_detects_fast_keyword(self):
+ judges = {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_faster_keyword(self):
+ judges = {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The agent should respond faster.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_latency_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The agent must have low latency.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_duration_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Minimize the duration of each response.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_ms_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should complete in under 500ms.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_response_time_phrase(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response time should be minimized.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_efficient_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The model must be efficient.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_detects_snappy_keyword(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="Responses should feel snappy.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_case_insensitive_match(self):
+ judges = {
+ "perf": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The model must be EFFICIENT and FAST.",
+ )
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_returns_true_when_any_judge_matches(self):
+ judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ ),
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ ),
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is True
+
+ def test_returns_false_when_acceptance_statement_is_none(self):
+ judges = {
+ "quality": OptimizationJudge(threshold=0.8, acceptance_statement=None)
+ }
+ assert _acceptance_criteria_implies_duration_optimization(judges) is False
+
+
+# ---------------------------------------------------------------------------
+# _evaluate_duration
+# ---------------------------------------------------------------------------
+
+
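+# Editorial sketch (hypothetical, inferred from the assertions below): a
+# candidate passes only if it is strictly more than 20% faster than the first
+# recorded attempt (history[0]); a missing timing on either side skips the
+# check entirely. The 0.80 factor is inferred from the cases here, not
+# confirmed against the implementation.
+def _sketch_evaluate_duration(history_durations_ms, candidate_ms):
+    if not history_durations_ms:
+        return True
+    baseline = history_durations_ms[0]
+    if baseline is None or candidate_ms is None:
+        return True
+    return candidate_ms < baseline * 0.80
+
+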
+class TestEvaluateDuration:
+ def setup_method(self):
+ self.client = _make_client()
+ self.client._options = _make_options()
+ self.client._agent_config = _make_agent_config()
+ self.client._initialize_class_members_from_config(_make_agent_config())
+
+ def _ctx(self, duration_ms, iteration=1):
+ return OptimizationContext(
+ scores={},
+ completion_response="response",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ )
+
+ def test_returns_true_when_history_is_empty(self):
+ self.client._history = []
+ assert self.client._evaluate_duration(self._ctx(5000)) is True
+
+ def test_returns_true_when_baseline_duration_is_none(self):
+ self.client._history = [self._ctx(None, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(5000, iteration=2)) is True
+
+ def test_returns_true_when_candidate_duration_is_none(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(None, iteration=2)) is True
+
+ def test_passes_when_candidate_is_more_than_20_percent_faster(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1500ms → 1500 < 1600 → pass
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1500, iteration=2)) is True
+
+ def test_fails_when_candidate_is_exactly_at_threshold(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1600ms → not strictly less → fail
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1600, iteration=2)) is False
+
+ def test_fails_when_improvement_is_less_than_20_percent(self):
+ # baseline=2000ms, threshold=1600ms, candidate=1800ms → 1800 >= 1600 → fail
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(1800, iteration=2)) is False
+
+ def test_fails_when_candidate_matches_baseline(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(2000, iteration=2)) is False
+
+ def test_fails_when_candidate_is_slower_than_baseline(self):
+ self.client._history = [self._ctx(2000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(2500, iteration=2)) is False
+
+ def test_uses_history_index_zero_as_baseline_not_last(self):
+ # history[0] is 2000ms (baseline), history[-1] is 500ms (fast, but not the baseline)
+ first = self._ctx(2000, iteration=1)
+ later = self._ctx(500, iteration=2)
+ self.client._history = [first, later]
+ # candidate=1500ms < 2000 * 0.80 = 1600ms → pass (uses history[0], not history[-1])
+ assert self.client._evaluate_duration(self._ctx(1500, iteration=3)) is True
+
+ def test_pass_boundary_just_below_threshold(self):
+ # baseline=1000ms, threshold=800ms, candidate=799ms → pass
+ self.client._history = [self._ctx(1000, iteration=1)]
+ assert self.client._evaluate_duration(self._ctx(799, iteration=2)) is True
+
+
+# ---------------------------------------------------------------------------
+# Duration optimization — chaos mode wiring
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestDurationOptimizationChaosMode:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _duration_judges(self, statement="The response must be fast."):
+ return {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement=statement,
+ )
+ }
+
+ def _ctx_with(self, duration_ms, score=1.0, iteration=1):
+ return OptimizationContext(
+ scores={"speed": JudgeResult(score=score)},
+ completion_response="answer",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={"language": "English"},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ )
+
+ async def test_duration_gate_triggers_variation_when_not_fast_enough(self):
+ """Judge passes but duration fails threshold → variation generated → second attempt succeeds."""
+ client = _make_client(self.mock_ldai)
+
+ # Iter 1: judge fails → history[0].duration_ms = 2000
+ # Iter 2: judge passes, duration 1800ms ≥ 2000 * 0.80 = 1600ms → duration fails → variation
+ # Iter 3: judge passes, duration 1500ms < 1600ms → passes → validation → success
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=0.2, iteration=1), # iter 1: judge fails
+ self._ctx_with(duration_ms=1800, score=1.0, iteration=2), # iter 2: judge passes, duration fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=3), # iter 3: both pass
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=4), # validation
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result.duration_ms == 1500
+ # 2 variations generated (after iter 1 judge fail, after iter 2 duration fail)
+ assert handle_agent_call.call_count == 2
+ assert mock_execute.call_count == 4
+
+ async def test_duration_check_skipped_on_first_iteration_no_baseline(self):
+ """First iteration has no history → duration check always skipped → succeeds even if slow."""
+ client = _make_client(self.mock_ldai)
+
+        # Iter 1 (no history): judge passes, duration check skipped → validation
+        # Validation: judge passes; no usable baseline has been recorded yet, so the duration check is skipped again
+ execute_side_effects = [
+ self._ctx_with(duration_ms=9999, score=1.0, iteration=1), # iter 1: would fail if checked
+ self._ctx_with(duration_ms=9999, score=1.0, iteration=2), # validation
+ ]
+
+ opts = _make_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=self._duration_judges(),
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ # Succeeds because history is empty and duration check is skipped
+ assert result.duration_ms == 9999
+
+ async def test_no_duration_gate_when_acceptance_criteria_has_no_latency_keywords(self):
+ """Acceptance statement with no latency keywords → duration gate never applied."""
+ client = _make_client(self.mock_ldai)
+
+ # Judge passes on first try; duration would fail if gate were applied (same as baseline)
+ # but since acceptance criteria has no latency keywords, it should succeed anyway
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=1.0, iteration=1),
+ self._ctx_with(duration_ms=2000, score=1.0, iteration=2), # validation
+ ]
+
+ non_latency_judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate and complete.",
+ )
+ }
+ opts = _make_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=non_latency_judges,
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ # Manually seed history so _evaluate_duration would fire if incorrectly triggered
+ client._history = [self._ctx_with(duration_ms=2000, iteration=0)]
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result is not None
+
+ async def test_evaluate_duration_called_in_validation_phase(self):
+ """Duration gate also runs on validation samples, not just the primary turn."""
+ client = _make_client(self.mock_ldai)
+
+ # Iter 1: judge fails → history[0].duration_ms = 2000
+ # Iter 2: judge passes, duration 1500ms → primary passes
+ # Validation sample: judge passes, duration 1800ms ≥ 1600ms → validation fails → variation
+ # Iter 3: judge passes, duration 1500ms → primary passes
+ # Validation: judge passes, duration 1500ms → validation passes → success
+ execute_side_effects = [
+ self._ctx_with(duration_ms=2000, score=0.2, iteration=1), # iter 1: judge fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=2), # iter 2: passes
+ self._ctx_with(duration_ms=1800, score=1.0, iteration=3), # validation: duration fails
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=4), # iter 3: passes
+ self._ctx_with(duration_ms=1500, score=1.0, iteration=5), # validation: passes
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ result = await client.optimize_from_options("test-agent", opts)
+
+ assert result.duration_ms == 1500
+ assert mock_execute.call_count == 5
+
+
+# ---------------------------------------------------------------------------
+# Duration optimization — ground truth mode wiring
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestDurationOptimizationGroundTruthMode:
+ def setup_method(self):
+ self.mock_ldai = _make_ldai_client()
+
+ def _duration_judges(self):
+ return {
+ "speed": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be fast.",
+ )
+ }
+
+ def _gt_ctx(self, duration_ms, score=1.0, iteration=1, user_input="q"):
+ return OptimizationContext(
+ scores={"speed": JudgeResult(score=score)},
+ completion_response="answer",
+ current_instructions="Do X.",
+ current_parameters={},
+ current_variables={},
+ iteration=iteration,
+ duration_ms=duration_ms,
+ user_input=user_input,
+ )
+
+ async def test_duration_gate_applied_per_sample_in_ground_truth_mode(self):
+ """In GT mode, the duration check fires per sample, not just once per attempt."""
+ client = _make_client(self.mock_ldai)
+
+ # Attempt 1:
+ # Sample 1: judge fails (score 0.2) → all_passed = False
+ # Sample 2: judge passes → duration skipped (history empty for sample 2)
+ # → history extended with attempt 1 results → variation generated
+ # Attempt 2:
+ # Sample 1: judge passes, duration 1800ms vs baseline history[0].duration_ms = 2000ms
+ # → 1800 >= 1600 → duration fails → sample_passed = False → all_passed = False
+ # (attempt 2 fails due to duration on sample 1)
+ # → variation generated
+ # Attempt 3:
+ # Sample 1: judge passes, duration 1500ms < 1600ms → passes
+ # Sample 2: judge passes, duration 1500ms (history[0] still 2000ms) → passes
+ # → all_passed = True → success
+ execute_side_effects = [
+ # Attempt 1
+ self._gt_ctx(duration_ms=2000, score=0.2, iteration=1, user_input="q1"),
+ self._gt_ctx(duration_ms=2000, score=1.0, iteration=2, user_input="q2"),
+ # Variation (not from _execute_agent_turn, from handle_agent_call)
+ # Attempt 2
+ self._gt_ctx(duration_ms=1800, score=1.0, iteration=3, user_input="q1"),
+ self._gt_ctx(duration_ms=1800, score=1.0, iteration=4, user_input="q2"),
+ # Variation
+ # Attempt 3
+ self._gt_ctx(duration_ms=1500, score=1.0, iteration=5, user_input="q1"),
+ self._gt_ctx(duration_ms=1500, score=1.0, iteration=6, user_input="q2"),
+ ]
+
+ handle_agent_call = AsyncMock(return_value=OptimizationResponse(output=VARIATION_RESPONSE))
+ opts = _make_gt_options(
+ handle_agent_call=handle_agent_call,
+ judges=self._duration_judges(),
+ max_attempts=5,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert isinstance(results, list)
+ for ctx in results:
+ assert ctx.duration_ms == 1500
+ # 2 variations generated
+ assert handle_agent_call.call_count == 2
+ assert mock_execute.call_count == 6
+
+ async def test_no_duration_gate_in_gt_mode_when_no_latency_keywords(self):
+ """In GT mode, duration gate is not applied when acceptance criteria has no latency keywords."""
+ client = _make_client(self.mock_ldai)
+
+ execute_side_effects = [
+ self._gt_ctx(duration_ms=5000, score=1.0, iteration=1, user_input="q1"),
+ self._gt_ctx(duration_ms=5000, score=1.0, iteration=2, user_input="q2"),
+ ]
+
+ non_latency_judges = {
+ "accuracy": OptimizationJudge(
+ threshold=0.8,
+ acceptance_statement="The response must be accurate.",
+ )
+ }
+ opts = _make_gt_options(
+ handle_agent_call=AsyncMock(return_value=OptimizationResponse(output="answer")),
+ judges=non_latency_judges,
+ max_attempts=3,
+ )
+
+ with patch.object(client, "_execute_agent_turn", new_callable=AsyncMock) as mock_execute:
+ mock_execute.side_effect = execute_side_effects
+ results = await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ # Succeeds on first attempt even with slow duration (no latency keyword → no gate)
+ assert isinstance(results, list)
+ assert mock_execute.call_count == 2
+
+
+# ---------------------------------------------------------------------------
+# _commit_variation
+# ---------------------------------------------------------------------------
+
+
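+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# variation key appears to be the caller-supplied output_key (or a generated
+# slug), with a four-digit hex suffix appended only when the key already
+# exists on the AI Config; a failed lookup of existing keys falls back to the
+# candidate unchanged.
+def _sketch_resolve_variation_key(candidate, existing_keys, rand_value):
+    if candidate in existing_keys:
+        return f"{candidate}-{rand_value:04x}"
+    return candidate
+
+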
+class TestCommitVariation:
+ def _make_client(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ # --- key generation ---
+
+ def test_uses_output_key_as_variation_key(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-custom-key", api_client=api_client,
+ )
+
+ assert key == "my-custom-key"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "my-custom-key"
+ assert payload["name"] == "my-custom-key"
+
+ def test_generates_slug_when_output_key_is_none(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ with patch("ldai_optimizer.client.generate_slug", return_value="fancy-panda"):
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key=None, api_client=api_client,
+ )
+
+ assert key == "fancy-panda"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "fancy-panda"
+ assert payload["name"] == "fancy-panda"
+
+ # --- collision handling ---
+
+ def test_appends_hex_suffix_on_key_collision(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(existing_variation_keys=["my-key"])
+
+ with patch("ldai_optimizer.client.random.randint", return_value=0x1234):
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key-1234"
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["key"] == "my-key-1234"
+
+ def test_no_suffix_when_key_does_not_collide(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(existing_variation_keys=["other-key"])
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+
+ def test_proceeds_with_candidate_when_get_ai_config_raises(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.get_ai_config.side_effect = Exception("network error")
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+ api_client.create_ai_config_variation.assert_called_once()
+
+ # --- payload shape ---
+
+ def test_payload_mode_is_agent(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["mode"] == "agent"
+
+ def test_payload_instructions_from_context(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ ctx = _make_winning_context(instructions="You are a travel assistant.")
+
+ client._commit_variation(
+ ctx, project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["instructions"] == "You are a travel assistant."
+
+ def test_create_called_with_correct_project_and_config_key(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="proj-abc",
+ ai_config_key="agent-xyz", output_key="k", api_client=api_client,
+ )
+
+ args = api_client.create_ai_config_variation.call_args[0]
+ assert args[0] == "proj-abc"
+ assert args[1] == "agent-xyz"
+
+ # --- modelConfigKey resolution ---
+
+ def test_model_config_key_resolved_via_api_match_on_id(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "gpt-4o", "key": "OpenAI.gpt-4o"},
+ {"id": "claude-3", "key": "Anthropic.claude-3"},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "OpenAI.gpt-4o"
+
+ def test_model_config_key_falls_back_to_model_name_when_no_id_match(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "claude-3", "key": "Anthropic.claude-3"},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "gpt-4o"
+
+ def test_model_config_key_prefers_global_over_non_global(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit(model_configs=[
+ {"id": "gpt-4o", "key": "project.gpt-4o", "global": False},
+ {"id": "gpt-4o", "key": "global.gpt-4o", "global": True},
+ ])
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "global.gpt-4o"
+
+ def test_model_config_key_falls_back_when_get_model_configs_raises(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.get_model_configs.side_effect = Exception("network error")
+
+ client._commit_variation(
+ _make_winning_context(model="gpt-4o"), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["modelConfigKey"] == "gpt-4o"
+
+ # --- retry logic ---
+
+ def test_retries_on_transient_failure_and_succeeds(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.create_ai_config_variation.side_effect = [
+ Exception("transient"),
+ {"key": "my-key"},
+ ]
+
+ key = client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="my-key", api_client=api_client,
+ )
+
+ assert key == "my-key"
+ assert api_client.create_ai_config_variation.call_count == 2
+
+ def test_raises_after_three_consecutive_failures(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+ api_client.create_ai_config_variation.side_effect = RuntimeError("permanent")
+
+ with pytest.raises(RuntimeError, match="permanent"):
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ assert api_client.create_ai_config_variation.call_count == 3
+
+ # --- LDApiClient construction ---
+
+ def test_creates_api_client_from_stored_key_when_none_provided(self):
+ client = self._make_client()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ MockLDApiClient.return_value = _make_api_client_for_commit()
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k",
+ )
+
+ MockLDApiClient.assert_called_once_with("test-api-key")
+
+ def test_passes_base_url_when_creating_api_client(self):
+ client = self._make_client()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ MockLDApiClient.return_value = _make_api_client_for_commit()
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ MockLDApiClient.assert_called_once_with(
+ "test-api-key", base_url="https://app.launchdarkly.us"
+ )
+
+ def test_reuses_provided_api_client_without_creating_new_one(self):
+ client = self._make_client()
+ api_client = _make_api_client_for_commit()
+
+ with patch("ldai_optimizer.client.LDApiClient") as MockLDApiClient:
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ MockLDApiClient.assert_not_called()
+
+ # --- tool key propagation ---
+
+ def test_toolkeys_included_in_payload_when_tools_present(self):
+ client = self._make_client()
+ client._initial_tool_keys = ["search-tool", "calculator"]
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert payload["toolKeys"] == ["search-tool", "calculator"]
+
+ def test_toolkeys_not_in_payload_when_no_tools(self):
+ client = self._make_client()
+ client._initial_tool_keys = []
+ api_client = _make_api_client_for_commit()
+
+ client._commit_variation(
+ _make_winning_context(), project_key="my-project",
+ ai_config_key="my-agent", output_key="k", api_client=api_client,
+ )
+
+ payload = api_client.create_ai_config_variation.call_args[0][2]
+ assert "toolKeys" not in payload
+
+
+# ---------------------------------------------------------------------------
+# Tool key extraction from raw variation (_get_agent_config)
+# ---------------------------------------------------------------------------
+
+
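+# Editorial sketch (hypothetical, inferred from the assertions below): tool
+# keys appear to be collected from the raw variation's "tools" list, keeping
+# only entries that actually carry a "key" field.
+def _sketch_extract_tool_keys(raw_variation):
+    return [tool["key"] for tool in raw_variation.get("tools", []) if "key" in tool]
+
+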
+@pytest.mark.asyncio
+class TestGetAgentConfigToolKeyExtraction:
+ def _make_client_with_variation(self, raw_variation: dict) -> OptimizationClient:
+ mock_ldai = _make_ldai_client()
+ mock_ldai._client.variation.return_value = raw_variation
+ return _make_client(mock_ldai)
+
+ async def test_extracts_tool_keys_from_raw_variation(self):
+ raw = {
+ "instructions": AGENT_INSTRUCTIONS,
+ "tools": [
+ {"key": "search-tool", "version": 1},
+ {"key": "calculator", "version": 2},
+ ],
+ }
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == ["search-tool", "calculator"]
+
+ async def test_initial_tool_keys_empty_when_no_tools_in_variation(self):
+ raw = {"instructions": AGENT_INSTRUCTIONS}
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == []
+
+ async def test_initial_tool_keys_empty_when_tools_is_empty_list(self):
+ raw = {"instructions": AGENT_INSTRUCTIONS, "tools": []}
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == []
+
+ async def test_skips_tool_entries_without_key(self):
+ raw = {
+ "instructions": AGENT_INSTRUCTIONS,
+ "tools": [
+ {"key": "good-tool", "version": 1},
+ {"version": 2}, # missing key — should be skipped
+ ],
+ }
+ client = self._make_client_with_variation(raw)
+ await client._get_agent_config("test-agent", LD_CONTEXT)
+ assert client._initial_tool_keys == ["good-tool"]
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_options
+# ---------------------------------------------------------------------------
+
+
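+# Editorial sketch (hypothetical, inferred from the assertions below):
+# auto_commit=True appears to require both an API key and a project_key before
+# any optimization work starts, while auto_commit=False never touches the REST
+# API at all.
+def _sketch_check_auto_commit_preconditions(auto_commit, has_api_key, project_key):
+    if not auto_commit:
+        return
+    if not has_api_key:
+        raise ValueError("LAUNCHDARKLY_API_KEY is required when auto_commit=True")
+    if not project_key:
+        raise ValueError("project_key is required when auto_commit=True")
+
+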
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromOptions:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ client = OptimizationClient(_make_ldai_client())
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_commit_called_on_success_when_auto_commit_true(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ options = _make_options() # auto_commit defaults to False
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_not_called_when_run_fails(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True,
+ project_key="my-project",
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=1,
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ mock_commit.assert_not_called()
+
+ async def test_succeeds_without_api_key_when_auto_commit_false(self):
+ client = self._make_client_without_key()
+ options = _make_options() # auto_commit defaults to False
+
+ with patch("ldai_optimizer.client.LDApiClient") as mock_api_cls:
+ result = await client.optimize_from_options("test-agent", options)
+
+ mock_api_cls.assert_not_called()
+ assert result is not None
+
+ async def test_raises_when_auto_commit_true_and_no_api_key(self):
+ client = self._make_client_without_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"):
+ await client.optimize_from_options("test-agent", options)
+
+ async def test_raises_when_auto_commit_true_and_no_project_key(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key=None)
+
+ with pytest.raises(ValueError, match="project_key"):
+ await client.optimize_from_options("test-agent", options)
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True, project_key="my-project", output_key="my-variation"
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_base_url_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ options = _make_options(
+ auto_commit=True,
+ project_key="my-project",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["base_url"] == "https://app.launchdarkly.us"
+
+ async def test_agent_key_used_as_ai_config_key(self):
+ client = self._make_client_with_key()
+ options = _make_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_options("test-agent", options)
+
+ assert mock_commit.call_args[1]["ai_config_key"] == "test-agent"
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_ground_truth_options
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromGroundTruthOptions:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ def _make_client_without_key(self) -> OptimizationClient:
+ client = OptimizationClient(_make_ldai_client())
+ client._has_api_key = False
+ client._api_key = None
+ return client
+
+ async def test_commit_called_on_success_when_auto_commit_true(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(auto_commit=True, project_key="my-project")
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options() # auto_commit defaults to False
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_not_called_when_run_fails(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True,
+ project_key="my-project",
+ handle_judge_call=AsyncMock(return_value=OptimizationResponse(output=JUDGE_FAIL_RESPONSE)),
+ max_attempts=1,
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ mock_commit.assert_not_called()
+
+ async def test_raises_when_auto_commit_true_and_no_api_key(self):
+ client = self._make_client_without_key()
+ opts = _make_gt_options(auto_commit=True, project_key="my-project")
+
+ with pytest.raises(ValueError, match="LAUNCHDARKLY_API_KEY"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ async def test_raises_when_auto_commit_true_and_no_project_key(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(auto_commit=True, project_key=None)
+
+ with pytest.raises(ValueError, match="project_key"):
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True, project_key="my-project", output_key="my-variation"
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_base_url_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ opts = _make_gt_options(
+ auto_commit=True,
+ project_key="my-project",
+ base_url="https://app.launchdarkly.us",
+ )
+
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_ground_truth_options("test-agent", opts)
+
+ assert mock_commit.call_args[1]["base_url"] == "https://app.launchdarkly.us"
+
+
+# ---------------------------------------------------------------------------
+# auto_commit in optimize_from_config
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.asyncio
+class TestAutoCommitInOptimizeFromConfig:
+ def _make_client_with_key(self) -> OptimizationClient:
+ with patch.dict("os.environ", {"LAUNCHDARKLY_API_KEY": "test-api-key"}):
+ return OptimizationClient(_make_ldai_client())
+
+ async def test_commit_called_by_default(self):
+ """auto_commit=True is the default for optimize_from_config."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ mock_commit.assert_called_once()
+
+ async def test_commit_not_called_when_auto_commit_false(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config(
+ "my-opt", _make_from_config_options(auto_commit=False)
+ )
+
+ mock_commit.assert_not_called()
+
+ async def test_commit_receives_pre_built_api_client(self):
+ """The api_client created for fetching config is reused for _commit_variation."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ assert mock_commit.call_args[1]["api_client"] is mock_api
+
+ async def test_output_key_forwarded_to_commit(self):
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config(
+ "my-opt", _make_from_config_options(output_key="my-variation")
+ )
+
+ assert mock_commit.call_args[1]["output_key"] == "my-variation"
+
+ async def test_model_configs_forwarded_to_commit(self):
+ """Pre-fetched model configs are passed to _commit_variation to avoid extra API calls."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+ mock_api.get_model_configs = MagicMock(return_value=[{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}])
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation") as mock_commit:
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ assert mock_commit.call_args[1]["model_configs"] == [{"id": "gpt-4o", "key": "OpenAI.gpt-4o"}]
+
+ async def test_patches_created_variation_key_after_commit(self):
+ """After _commit_variation succeeds, the last result record is PATCHed with createdVariationKey."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ with patch.object(client, "_commit_variation", return_value="my-new-variation"):
+ client._last_optimization_result_id = "result-id-abc"
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ patch_calls = mock_api.patch_agent_optimization_result.call_args_list
+ variation_key_patch = next(
+ (c for c in patch_calls if c[0][3].get("createdVariationKey") == "my-new-variation"),
+ None,
+ )
+ assert variation_key_patch is not None, "Expected a PATCH with createdVariationKey"
+ # URL path uses the string key ("my-optimization"), not the UUID ("opt-uuid-123")
+ assert variation_key_patch[0][1] == "my-optimization"
+
+ async def test_optimization_key_in_post_url_uses_string_key_not_uuid(self):
+ """post_agent_optimization_result is called with config['key'], not config['id']."""
+ client = self._make_client_with_key()
+ mock_api = _make_mock_api_client()
+ mock_api.get_agent_optimization = MagicMock(return_value=dict(_API_CONFIG))
+
+ with patch("ldai_optimizer.client.LDApiClient", return_value=mock_api):
+ await client.optimize_from_config("my-opt", _make_from_config_options())
+
+ post_call_args = mock_api.post_agent_optimization_result.call_args_list
+ assert len(post_call_args) >= 1
+ for call in post_call_args:
+ opt_key_arg = call[0][1]
+ # Must use the string key "my-optimization", never the UUID "opt-uuid-123"
+ assert opt_key_arg == "my-optimization", (
+ f"Expected string key 'my-optimization', got '{opt_key_arg}'"
+ )
diff --git a/packages/optimization/tests/test_ld_api_client.py b/packages/optimization/tests/test_ld_api_client.py
new file mode 100644
index 00000000..4faa750b
--- /dev/null
+++ b/packages/optimization/tests/test_ld_api_client.py
@@ -0,0 +1,371 @@
+"""Tests for ldai_optimizer.ld_api_client."""
+
+import json
+import urllib.error
+import urllib.request
+from io import BytesIO
+from typing import Any, Dict
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from ldai_optimizer.ld_api_client import (
+ AgentOptimizationConfig,
+ AgentOptimizationResultPost as OptimizationResultPayload,
+ LDApiClient,
+ LDApiError,
+ _MAX_RETRIES,
+ _parse_agent_optimization,
+)
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+_BASE_CONFIG: Dict[str, Any] = {
+ "id": "opt-uuid-123",
+ "key": "my-optimization",
+ "aiConfigKey": "my-agent",
+ "maxAttempts": 3,
+ "modelChoices": ["gpt-4o", "gpt-4o-mini"],
+ "judgeModel": "gpt-4o",
+ "variableChoices": [{"language": "English"}],
+ "acceptanceStatements": [{"statement": "Be accurate.", "threshold": 0.9}],
+ "judges": [],
+ "userInputOptions": ["What is 2+2?"],
+ "version": 1,
+ "createdAt": 1700000000,
+}
+
+
+def _make_config(**overrides: Any) -> Dict[str, Any]:
+ return {**_BASE_CONFIG, **overrides}
+
+
+def _mock_urlopen(response_data: Any, status: int = 200) -> MagicMock:
+ """Return a context-manager mock whose .read() returns JSON-encoded response_data."""
+ mock_resp = MagicMock()
+ mock_resp.read.return_value = json.dumps(response_data).encode()
+ mock_resp.__enter__ = lambda s: s
+ mock_resp.__exit__ = MagicMock(return_value=False)
+ return mock_resp
+
+
+# ---------------------------------------------------------------------------
+# _parse_agent_optimization
+# ---------------------------------------------------------------------------
+
+
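+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# parser appears to accumulate every field-level problem before raising a
+# single ValueError, rather than failing on the first missing field. A minimal
+# shape of that pattern:
+def _sketch_collect_field_errors(config, required_string_fields=("id", "key")):
+    if not isinstance(config, dict):
+        raise ValueError("Expected a JSON object")
+    errors = []
+    for field in required_string_fields:
+        if field not in config:
+            errors.append(f"missing required field '{field}'")
+        elif not isinstance(config[field], str):
+            errors.append(f"field '{field}' must be a string")
+    if errors:
+        raise ValueError("; ".join(errors))
+
+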
+class TestParseAgentOptimization:
+ def test_valid_config_is_returned_unchanged(self):
+ config = _make_config()
+ result = _parse_agent_optimization(config)
+ assert result["id"] == "opt-uuid-123"
+ assert result["aiConfigKey"] == "my-agent"
+
+ def test_optional_fields_not_required(self):
+ config = _make_config()
+ # groundTruthResponses and metricKey are optional — should not raise
+ assert "groundTruthResponses" not in config
+ assert "metricKey" not in config
+ _parse_agent_optimization(config) # must not raise
+
+ def test_raises_on_non_dict_input(self):
+ with pytest.raises(ValueError, match="Expected a JSON object"):
+ _parse_agent_optimization(["not", "a", "dict"])
+
+ def test_raises_on_none_input(self):
+ with pytest.raises(ValueError, match="Expected a JSON object"):
+ _parse_agent_optimization(None)
+
+ @pytest.mark.parametrize("field", ["id", "key", "aiConfigKey", "judgeModel"])
+ def test_raises_on_missing_required_string_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ @pytest.mark.parametrize("field", ["maxAttempts", "version", "createdAt"])
+ def test_raises_on_missing_required_int_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ @pytest.mark.parametrize(
+ "field",
+ ["modelChoices", "variableChoices", "acceptanceStatements", "judges", "userInputOptions"],
+ )
+ def test_raises_on_missing_required_list_field(self, field: str):
+ config = _make_config()
+ del config[field]
+ with pytest.raises(ValueError, match=f"missing required field '{field}'"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_string_field(self):
+ config = _make_config(aiConfigKey=123)
+ with pytest.raises(ValueError, match="field 'aiConfigKey' must be a string"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_int_field(self):
+ config = _make_config(maxAttempts="three")
+ with pytest.raises(ValueError, match="field 'maxAttempts' must be an integer"):
+ _parse_agent_optimization(config)
+
+ def test_raises_on_wrong_type_for_list_field(self):
+ config = _make_config(modelChoices="gpt-4o")
+ with pytest.raises(ValueError, match="field 'modelChoices' must be a list"):
+ _parse_agent_optimization(config)
+
+ def test_raises_when_model_choices_is_empty(self):
+ config = _make_config(modelChoices=[])
+ with pytest.raises(ValueError, match="at least 1 entry"):
+ _parse_agent_optimization(config)
+
+ def test_collects_multiple_errors_in_one_raise(self):
+ config = _make_config()
+ del config["id"]
+ del config["maxAttempts"]
+ config["modelChoices"] = "bad"
+ with pytest.raises(ValueError) as exc_info:
+ _parse_agent_optimization(config)
+ msg = str(exc_info.value)
+ assert "id" in msg
+ assert "maxAttempts" in msg
+ assert "modelChoices" in msg
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient._request
+# ---------------------------------------------------------------------------
+
+
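+# Editorial sketch (hypothetical, inferred from the assertions below): the
+# Authorization header appears to carry the raw API key (no "Bearer" prefix),
+# and Content-Type is only attached when a JSON body is being sent.
+def _sketch_build_headers(api_key, has_body):
+    headers = {"Authorization": api_key}
+    if has_body:
+        headers["Content-Type"] = "application/json"
+    return headers
+
+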
+class TestLDApiClientRequest:
+ def test_get_does_not_send_content_type(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/some/path")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "Content-Type" not in req.headers
+
+ def test_post_sends_content_type(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("POST", "/some/path", body={"key": "value"})
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.get_header("Content-type") == "application/json"
+
+ def test_authorization_header_always_sent(self):
+ client = LDApiClient("my-api-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/path")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.get_header("Authorization") == "my-api-key"
+
+ def test_raises_ld_api_error_on_http_error(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"not found body")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/missing")
+ assert exc_info.value.status_code == 404
+ assert "404" in str(exc_info.value)
+
+ def test_raises_ld_api_error_on_url_error(self):
+ client = LDApiClient("test-key")
+ url_error = urllib.error.URLError(reason="Connection refused")
+ with patch("urllib.request.urlopen", side_effect=url_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert exc_info.value.status_code is None
+ assert "Connection refused" in str(exc_info.value)
+
+ def test_401_error_includes_api_key_hint(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=401, msg="Unauthorized", hdrs=MagicMock(), fp=BytesIO(b"")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError, match="LAUNCHDARKLY_API_KEY"):
+ client._request("GET", "/path")
+
+ def test_404_error_includes_key_hint(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError, match="project key"):
+ client._request("GET", "/path")
+
+ def test_custom_base_url_used_in_request(self):
+ client = LDApiClient("test-key", base_url="https://staging.launchdarkly.com")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/api/v2/test")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert req.full_url.startswith("https://staging.launchdarkly.com")
+
+ def test_trailing_slash_stripped_from_base_url(self):
+ client = LDApiClient("test-key", base_url="https://app.launchdarkly.com/")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client._request("GET", "/api/v2/test")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "//" not in req.full_url.replace("https://", "")
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient.get_agent_optimization
+# ---------------------------------------------------------------------------
+
+
+class TestGetAgentOptimization:
+ def test_requests_correct_path(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(_make_config())) as mock_open:
+ client.get_agent_optimization("my-project", "my-opt-key")
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "/api/v2/projects/my-project/agent-optimizations/my-opt-key" in req.full_url
+
+ def test_returns_validated_config(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(_make_config())):
+ result = client.get_agent_optimization("proj", "opt")
+ assert result["aiConfigKey"] == "my-agent"
+ assert result["maxAttempts"] == 3
+
+ def test_raises_on_invalid_response(self):
+ client = LDApiClient("test-key")
+ bad_response = {"id": "x"} # missing many required fields
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen(bad_response)):
+ with pytest.raises(ValueError, match="Invalid AgentOptimization response"):
+ client.get_agent_optimization("proj", "opt")
+
+ def test_raises_ld_api_error_on_http_404(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=404, msg="Not Found", hdrs=MagicMock(), fp=BytesIO(b"not found")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with pytest.raises(LDApiError) as exc_info:
+ client.get_agent_optimization("proj", "missing-key")
+ assert exc_info.value.status_code == 404
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient.post_agent_optimization_result
+# ---------------------------------------------------------------------------
+
+
+class TestPostAgentOptimizationResult:
+ def _make_payload(self) -> OptimizationResultPayload:
+ return {
+ "run_id": "run-abc",
+ "config_optimization_version": 1,
+ "status": "RUNNING",
+ "activity": "GENERATING",
+ "iteration": 1,
+ "instructions": "You are a helpful assistant.",
+ "parameters": {"temperature": 0.7},
+ "completion_response": "The answer is 4.",
+ "scores": {},
+ }
+
+ def test_requests_correct_path(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client.post_agent_optimization_result("my-project", "opt-uuid", self._make_payload())
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ assert "/api/v2/projects/my-project/agent-optimizations/opt-uuid/results" in req.full_url
+
+ def test_sends_payload_as_json_body(self):
+ client = LDApiClient("test-key")
+ payload = self._make_payload()
+ with patch("urllib.request.urlopen", return_value=_mock_urlopen({})) as mock_open:
+ client.post_agent_optimization_result("proj", "opt-id", payload)
+ req: urllib.request.Request = mock_open.call_args[0][0]
+ sent = json.loads(req.data.decode())
+ assert sent["run_id"] == "run-abc"
+ assert sent["status"] == "RUNNING"
+ assert sent["instructions"] == "You are a helpful assistant."
+
+ def test_swallows_http_errors_without_raising(self):
+ client = LDApiClient("test-key")
+ http_error = urllib.error.HTTPError(
+ url="http://x", code=500, msg="Server Error", hdrs=MagicMock(), fp=BytesIO(b"err")
+ )
+ with patch("urllib.request.urlopen", side_effect=http_error):
+ with patch("time.sleep"):
+ # must not raise even after all retries are exhausted
+ client.post_agent_optimization_result("proj", "opt-id", self._make_payload())
+
+ def test_swallows_url_errors_without_raising(self):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")):
+ with patch("time.sleep"):
+ client.post_agent_optimization_result("proj", "opt-id", self._make_payload())
+
+
+# ---------------------------------------------------------------------------
+# LDApiClient retry behaviour
+# ---------------------------------------------------------------------------
+
+
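+# Editorial sketch (hypothetical, inferred from the assertions below): 429 and
+# 5xx responses, plus network-level failures, appear to be retried with
+# exponential backoff (1s, 2s, 4s), while other 4xx errors raise immediately.
+# _MAX_RETRIES is assumed to be 3 based on the call counts asserted here.
+def _sketch_is_retryable(status_code):
+    return status_code is None or status_code == 429 or status_code >= 500
+
+
+def _sketch_backoff_delays(max_retries=3):
+    return [float(2 ** attempt) for attempt in range(max_retries)]
+
+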
+class TestLDApiClientRetry:
+ def _http_error(self, code: int) -> urllib.error.HTTPError:
+ return urllib.error.HTTPError(
+ url="http://x", code=code, msg="Error", hdrs=MagicMock(), fp=BytesIO(b"body")
+ )
+
+ def test_retryable_error_retries_max_times(self):
+ """A 429 or 5xx should be retried up to _MAX_RETRIES times then raise."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(429)) as mock_open:
+ with patch("time.sleep"):
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert mock_open.call_count == _MAX_RETRIES + 1
+ assert exc_info.value.status_code == 429
+
+ def test_non_retryable_error_raises_immediately(self):
+ """A 401, 403, or 404 should raise after a single attempt with no retries."""
+ for code in (400, 401, 403, 404):
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(code)) as mock_open:
+ with patch("time.sleep") as mock_sleep:
+ with pytest.raises(LDApiError) as exc_info:
+ client._request("GET", "/path")
+ assert mock_open.call_count == 1, f"Expected 1 attempt for {code}, got {mock_open.call_count}"
+ mock_sleep.assert_not_called()
+ assert exc_info.value.status_code == code
+
+ def test_url_error_retries_max_times(self):
+ """Network-level errors should also be retried."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")) as mock_open:
+ with patch("time.sleep"):
+ with pytest.raises(LDApiError):
+ client._request("GET", "/path")
+ assert mock_open.call_count == _MAX_RETRIES + 1
+
+ def test_backoff_delays_are_exponential(self):
+ """Sleep durations should double on each retry: 1s, 2s, 4s."""
+ client = LDApiClient("test-key")
+ with patch("urllib.request.urlopen", side_effect=self._http_error(500)):
+ with patch("time.sleep") as mock_sleep:
+ with pytest.raises(LDApiError):
+ client._request("GET", "/path")
+ sleep_calls = [c.args[0] for c in mock_sleep.call_args_list]
+ assert sleep_calls == [1.0, 2.0, 4.0]
+
+ def test_succeeds_on_retry_after_transient_error(self):
+ """If a retryable error clears, the successful response should be returned."""
+ client = LDApiClient("test-key")
+ ok_response = _mock_urlopen({"result": "ok"})
+ side_effects = [self._http_error(500), ok_response]
+ with patch("urllib.request.urlopen", side_effect=side_effects) as mock_open:
+ with patch("time.sleep"):
+ result = client._request("GET", "/path")
+ assert result == {"result": "ok"}
+ assert mock_open.call_count == 2
diff --git a/packages/optimization/tests/test_package.py b/packages/optimization/tests/test_package.py
index 2123eb68..d7d29514 100644
--- a/packages/optimization/tests/test_package.py
+++ b/packages/optimization/tests/test_package.py
@@ -1,8 +1,8 @@
-"""Smoke tests for ldai_optimization."""
+"""Smoke tests for ldai_optimizer."""
import pytest
-from ldai_optimization import ApiAgentOptimizationClient, __version__
+from ldai_optimizer import OptimizationClient, __version__
def test_version_is_string():
@@ -10,7 +10,6 @@ def test_version_is_string():
assert len(__version__) > 0
-def test_optimize_not_implemented():
- client = ApiAgentOptimizationClient()
- with pytest.raises(NotImplementedError):
- client.optimize("example", {})
+def test_client_requires_ldai_client():
+ with pytest.raises(TypeError):
+ OptimizationClient() # type: ignore[call-arg]
diff --git a/release-please-config.json b/release-please-config.json
index cf0d738a..3cb7adc0 100644
--- a/release-please-config.json
+++ b/release-please-config.json
@@ -38,10 +38,8 @@
"versioning": "default",
"bump-minor-pre-major": true,
"include-v-in-tag": false,
- "extra-files": [
- "src/ldai_optimization/__init__.py"
- ],
- "component": "launchdarkly-server-sdk-ai-optimization"
+ "extra-files": ["src/ldai_optimizer/__init__.py"],
+ "component": "ldai_optimizer"
}
}
}