Agent Behavior Testing with Configuration Snapshots: Reproducible Test Environments
Create configuration snapshots for reproducible AI agent testing. Learn snapshot creation, test isolation, seeded randomness, and techniques for achieving deterministic test results.
The Reproducibility Problem
AI agent testing is notoriously flaky. The same test can pass or fail depending on which model version was deployed that week, what temperature was configured, or which tools were enabled. When a test fails, the first question should be "what changed?" — but without configuration snapshots, you have no way to answer that question definitively.
A configuration snapshot captures the complete state of an agent's configuration at a specific point in time. By loading a snapshot before running tests, you ensure the same inputs always produce comparable outputs, regardless of what is currently deployed in production.
Snapshot Data Model
A snapshot includes everything that affects agent behavior: the config values, the model identifier, tool definitions, and a content hash for integrity verification.
import hashlib
import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Optional
@dataclass
class ConfigSnapshot:
    """Point-in-time capture of everything that affects agent behavior."""

    snapshot_id: str
    name: str
    description: str
    created_at: datetime
    agent_config: dict[str, Any]
    tool_definitions: list[dict[str, Any]]
    model_version: str
    content_hash: str
    created_by: str
    parent_snapshot_id: Optional[str] = None
    tags: list[str] = field(default_factory=list)

    @staticmethod
    def compute_hash(config: dict, tools: list, model: str) -> str:
        """Return a short, stable digest of the behavior-affecting inputs.

        Canonical JSON with sorted keys makes the digest independent of
        dict insertion order, so equivalent configurations always hash
        to the same value.
        """
        canonical = json.dumps(
            {"config": config, "model": model, "tools": tools},
            sort_keys=True,
        )
        digest = hashlib.sha256(canonical.encode("utf-8"))
        return digest.hexdigest()[:16]
class SnapshotStore:
    """In-memory registry of configuration snapshots, keyed by snapshot_id.

    Snapshot ids are content-addressed (derived from the content hash), so
    creating a snapshot with identical config/tools/model is idempotent:
    it replaces the stored entry with an equivalent one rather than
    accumulating duplicates.
    """

    def __init__(self):
        self._snapshots: dict[str, ConfigSnapshot] = {}

    def create(
        self,
        name: str,
        agent_config: dict,
        tool_definitions: list,
        model_version: str,
        created_by: str,
        description: str = "",
        tags: list[str] | None = None,
    ) -> ConfigSnapshot:
        """Capture the given configuration as a new snapshot and store it.

        The config, tool list, and tags are copied on the way in so later
        caller-side mutation cannot silently alter a stored snapshot —
        the whole point of a snapshot is that it doesn't drift.
        """
        content_hash = ConfigSnapshot.compute_hash(
            agent_config, tool_definitions, model_version
        )
        snapshot = ConfigSnapshot(
            snapshot_id=f"snap_{content_hash}",
            name=name,
            description=description,
            # datetime.utcnow() is deprecated and returns a *naive*
            # datetime; use an explicitly UTC-aware timestamp instead.
            created_at=datetime.now(timezone.utc),
            agent_config=dict(agent_config),
            tool_definitions=list(tool_definitions),
            model_version=model_version,
            content_hash=content_hash,
            created_by=created_by,
            tags=list(tags) if tags else [],
        )
        self._snapshots[snapshot.snapshot_id] = snapshot
        return snapshot

    def load(self, snapshot_id: str) -> ConfigSnapshot:
        """Return the snapshot with the given id.

        Raises:
            KeyError: if no snapshot with that id exists.
        """
        snapshot = self._snapshots.get(snapshot_id)
        if not snapshot:
            raise KeyError(f"Snapshot not found: {snapshot_id}")
        return snapshot

    def find_by_tag(self, tag: str) -> list[ConfigSnapshot]:
        """Return all stored snapshots carrying the given tag."""
        return [s for s in self._snapshots.values() if tag in s.tags]
Test Isolation with Context Managers
Use a context manager to activate a snapshot, run tests in that isolated environment, and automatically restore the original configuration afterward.
from contextlib import contextmanager
from typing import Generator
class AgentRuntime:
    """Mutable holder for the configuration an agent is currently running with."""

    def __init__(self):
        # Defaults represent a freshly constructed, unconfigured runtime.
        self.config: dict = {}
        self.tools: list = []
        self.model: str = "gpt-4o"

    def apply_config(self, config: dict, tools: list, model: str):
        """Swap in a complete configuration (values, tools, model) atomically."""
        self.config, self.tools, self.model = config, tools, model
@contextmanager
def snapshot_context(
    runtime: AgentRuntime, snapshot: ConfigSnapshot
) -> Generator[AgentRuntime, None, None]:
    """Temporarily run *runtime* under *snapshot*'s configuration.

    On exit — whether the body completed normally or raised — the
    runtime's previous config, tools, and model are restored.
    """
    # Capture the pre-snapshot state (copies, so the snapshot body
    # cannot mutate what we restore from).
    saved = (runtime.config.copy(), runtime.tools.copy(), runtime.model)
    try:
        runtime.apply_config(
            snapshot.agent_config,
            snapshot.tool_definitions,
            snapshot.model_version,
        )
        yield runtime
    finally:
        runtime.apply_config(*saved)
Deterministic Test Fixtures
For tests that call an LLM, you need deterministic outputs. There are two approaches: mock the LLM call with recorded responses, or use temperature zero with a fixed seed (when the API supports it).
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
from dataclasses import dataclass
from typing import Callable
@dataclass
class RecordedResponse:
    """A single recorded LLM exchange, looked up by a hash of its prompt."""

    input_hash: str  # truncated sha256 hex digest of the prompt (16 chars)
    response: str  # recorded model output, replayed verbatim
    model: str  # model identifier the response was captured from
    tokens_used: int  # token count reported when the response was recorded
class ResponseRecorder:
    """Record/replay cache of LLM responses, keyed by a hash of the prompt."""

    def __init__(self):
        self._recordings: dict[str, RecordedResponse] = {}

    @staticmethod
    def _hash(prompt: str) -> str:
        # Single definition of the cache key so record() and replay()
        # can never drift apart.
        return hashlib.sha256(prompt.encode()).hexdigest()[:16]

    def record(self, prompt: str, response: str, model: str, tokens: int):
        """Store *response* as the canonical reply for *prompt*."""
        input_hash = self._hash(prompt)
        self._recordings[input_hash] = RecordedResponse(
            input_hash=input_hash,
            response=response,
            model=model,
            tokens_used=tokens,
        )

    def replay(self, prompt: str) -> Optional[RecordedResponse]:
        """Return the recording for *prompt*, or None if none exists."""
        return self._recordings.get(self._hash(prompt))

    def save(self, path: str):
        """Serialize all recordings to *path* as indented JSON."""
        data = {k: v.__dict__ for k, v in self._recordings.items()}
        # Pin the encoding so fixture files are portable across platforms
        # (the default text encoding is platform-dependent).
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

    def load(self, path: str):
        """Replace the in-memory recordings with those stored at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        self._recordings = {
            k: RecordedResponse(**v) for k, v in data.items()
        }
Snapshot-Based Test Runner
Combine snapshots with recorded responses for a fully reproducible test suite.
class SnapshotTestRunner:
    """Runs substring-assertion tests against an agent pinned to a snapshot."""

    def __init__(
        self,
        runtime: AgentRuntime,
        snapshot_store: SnapshotStore,
        recorder: ResponseRecorder,
    ):
        self._runtime = runtime
        self._snapshots = snapshot_store
        self._recorder = recorder

    def run_test(
        self,
        snapshot_id: str,
        test_input: str,
        expected_contains: list[str],
    ) -> dict:
        """Run one test case under the given snapshot and report the outcome."""
        snapshot = self._snapshots.load(snapshot_id)
        with snapshot_context(self._runtime, snapshot):
            # Prefer a deterministic recorded response; only invoke the
            # live agent when no recording exists for this input.
            recorded = self._recorder.replay(test_input)
            if recorded is None:
                response = self._call_agent(test_input)
            else:
                response = recorded.response
            # Case-insensitive substring assertions.
            haystack = response.lower()
            passed = all(
                phrase.lower() in haystack for phrase in expected_contains
            )
            return {
                "snapshot_id": snapshot_id,
                "snapshot_hash": snapshot.content_hash,
                "input": test_input,
                "response": response,
                "expected_contains": expected_contains,
                "passed": passed,
                "replayed": recorded is not None,
            }

    def _call_agent(self, message: str) -> str:
        # Placeholder for actual agent invocation.
        return f"Agent response to: {message}"
Snapshot Comparison
When a test fails after a config change, compare snapshots to pinpoint exactly what changed.
def compare_snapshots(
    old: ConfigSnapshot, new: ConfigSnapshot
) -> list[dict]:
    """Return a list of field-level differences between two snapshots.

    Covers the behavior-affecting fields: model version, config values,
    and tool definitions (added, removed, or modified in place).
    Metadata such as name, description, and tags is intentionally ignored.
    """
    diffs = []
    if old.model_version != new.model_version:
        diffs.append({
            "field": "model_version",
            "old": old.model_version,
            "new": new.model_version,
        })
    # Compare config values key by key (covers added, removed, and changed
    # keys; a key changed to None is indistinguishable from a removed key).
    all_keys = set(old.agent_config.keys()) | set(new.agent_config.keys())
    for key in sorted(all_keys):
        old_val = old.agent_config.get(key)
        new_val = new.agent_config.get(key)
        if old_val != new_val:
            diffs.append({"field": f"config.{key}", "old": old_val, "new": new_val})
    # Compare tool lists by name.
    old_by_name = {t.get("name"): t for t in old.tool_definitions}
    new_by_name = {t.get("name"): t for t in new.tool_definitions}
    old_tools = set(old_by_name)
    new_tools = set(new_by_name)
    for added in new_tools - old_tools:
        diffs.append({"field": "tools", "change": "added", "tool": added})
    for removed in old_tools - new_tools:
        diffs.append({"field": "tools", "change": "removed", "tool": removed})
    # A tool redefined under the same name previously produced no diff at
    # all; flag it so the comparison truly pinpoints what changed.
    for shared in old_tools & new_tools:
        if old_by_name[shared] != new_by_name[shared]:
            diffs.append({"field": "tools", "change": "modified", "tool": shared})
    return diffs
FAQ
How often should I create new snapshots?
Create a snapshot before every production deployment and after any configuration change. Tag snapshots that correspond to known-good states as "baseline" so test regressions can be compared against a stable reference point. Prune old snapshots on a monthly schedule, keeping only tagged baselines.
Can I use snapshots in CI/CD pipelines?
Yes, and you should. Store snapshot files in your test fixtures directory alongside recorded responses. Your CI pipeline loads the snapshot, runs the test suite with replayed responses, and fails the build if any assertions break. This gives you fast, deterministic tests without calling the LLM on every pipeline run.
How do I handle snapshot drift when the model provider updates their API?
Pin your model version explicitly (for example gpt-4o-2024-11-20 rather than gpt-4o) in snapshots. When a model is deprecated, create new snapshots with the replacement model, re-record responses, and update your test baselines. Treat model version changes the same way you treat dependency updates — they deserve their own review cycle.
#Testing #AIAgents #ConfigurationSnapshots #Reproducibility #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.