Testing Multi-Agent Handoffs: Verifying Routing Logic and Context Transfer
Learn how to test multi-agent handoff logic, verify conversation routing, validate context transfer between agents, and test boundary conditions in agent orchestration systems.
Why Handoff Testing Is Critical
In multi-agent systems, a triage agent routes conversations to specialized agents — billing, technical support, sales. Handoff failures are some of the most damaging bugs: a customer asking about a refund gets routed to tech support, or context is lost during transfer and the next agent asks the customer to repeat everything.
Testing handoffs requires verifying three things: the router selects the right destination agent, the full conversation context transfers correctly, and edge cases like ambiguous requests or mid-conversation re-routing work properly.
Modeling Handoffs for Testability
Define handoffs as explicit, inspectable objects rather than implicit side effects.
import json
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class Handoff:
    """Explicit, inspectable record of a single agent-to-agent transfer."""

    source_agent: str  # agent that initiated the transfer (e.g. "triage")
    target_agent: str  # specialist agent that receives the conversation
    reason: str  # routing rationale produced by the LLM
    context: dict = field(default_factory=dict)  # extracted entities (e.g. amount, issue)
    conversation_history: list[dict] = field(default_factory=list)  # full chat transcript carried across the handoff
@dataclass
class HandoffResult:
    """Outcome of one triage step: either a handoff or a direct reply."""

    should_handoff: bool  # True when control should move to another agent
    handoff: Optional[Handoff] = None  # populated only when should_handoff is True
    response: Optional[str] = None  # direct reply text when staying in triage
class TriageAgent:
    """Front-door agent that either answers directly or hands off to a specialist.

    The LLM is prompted to emit a structured routing decision; ``process``
    turns that decision into a ``HandoffResult``. Helper methods
    ``_build_routing_prompt`` and ``_parse_decision`` are defined elsewhere.
    """

    def __init__(self, llm, available_agents: list[str]):
        self.llm = llm
        self.available_agents = available_agents

    def process(self, message: str, history: list[dict]) -> HandoffResult:
        """Route one user message.

        Args:
            message: the latest user utterance.
            history: prior conversation turns as role/content dicts.

        Returns:
            A HandoffResult carrying either a Handoff to a specialist agent
            or a direct response.

        Raises:
            ValueError: if the LLM names a target agent that is not in
                ``available_agents`` (a hallucinated route).
        """
        # LLM determines routing
        decision = self.llm.chat([
            {"role": "system", "content": self._build_routing_prompt()},
            *history,
            {"role": "user", "content": message},
        ])
        parsed = self._parse_decision(decision["content"])
        if parsed["action"] == "handoff":
            target = parsed["target"]
            # Guard against hallucinated destinations: never hand off to an
            # agent that is not registered with this triage instance.
            if target not in self.available_agents:
                raise ValueError(f"Unknown agent: {target}")
            return HandoffResult(
                should_handoff=True,
                handoff=Handoff(
                    source_agent="triage",
                    target_agent=target,
                    reason=parsed["reason"],
                    context=parsed.get("extracted_context", {}),
                    # Carry the full transcript, including the message that
                    # triggered the handoff, so no context is lost.
                    conversation_history=history + [
                        {"role": "user", "content": message}
                    ],
                ),
            )
        return HandoffResult(should_handoff=False, response=parsed["response"])
Testing Routing Decisions
Use a FakeLLM to control routing decisions and verify the triage agent routes correctly.
import pytest
@pytest.fixture
def fake_llm():
    # Scripted LLM test double; each test assigns .responses before use.
    return FakeLLM(responses=[])
def make_routing_response(target: str, reason: str) -> str:
    """Build the JSON routing decision a FakeLLM should emit for a handoff.

    Uses ``json.dumps`` instead of f-string interpolation so that quotes or
    backslashes inside *target*/*reason* are escaped correctly; for plain
    inputs the output is byte-identical to the hand-built string.
    """
    return json.dumps({"action": "handoff", "target": target, "reason": reason})
def test_billing_question_routes_to_billing(fake_llm):
    """A refund request must be handed off to the billing agent."""
    fake_llm.responses = [make_routing_response("billing", "refund request")]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech", "sales"])

    outcome = agent.process("I want a refund for my last charge", history=[])

    assert outcome.should_handoff is True
    assert outcome.handoff.target_agent == "billing"
def test_technical_issue_routes_to_tech(fake_llm):
    """A login failure is a technical problem and belongs with the tech agent."""
    fake_llm.responses = [make_routing_response("tech", "login error")]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech", "sales"])

    outcome = agent.process("I cannot log in to my account", history=[])

    assert outcome.should_handoff is True
    assert outcome.handoff.target_agent == "tech"
def test_general_question_stays_in_triage(fake_llm):
    """A simple greeting is answered directly; no handoff is produced."""
    fake_llm.responses = ['{"action": "respond", "response": "How can I help?"}']
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech", "sales"])

    outcome = agent.process("Hello", history=[])

    assert outcome.should_handoff is False
    assert outcome.response is not None
Testing Context Transfer
Verify that all relevant context passes from the source agent to the destination agent.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
def test_context_includes_conversation_history(fake_llm):
    """The handoff must carry the entire prior transcript plus the new turn."""
    fake_llm.responses = [make_routing_response("billing", "payment issue")]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech"])
    prior_turns = [
        {"role": "user", "content": "Hi, I have a problem"},
        {"role": "assistant", "content": "Sure, what is the issue?"},
    ]

    outcome = agent.process("I was double charged $49.99", history=prior_turns)

    # Full history must transfer — no lost context
    transcript = outcome.handoff.conversation_history
    assert len(transcript) == 3
    assert transcript[-1]["content"] == "I was double charged $49.99"
def test_extracted_context_contains_key_info(fake_llm):
    """Entities extracted by triage must survive the handoff intact."""
    fake_llm.responses = [
        '{"action": "handoff", "target": "billing", "reason": "refund",'
        ' "extracted_context": {"amount": "$49.99", "issue": "double charge"}}'
    ]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing"])

    outcome = agent.process("I was double charged $49.99", history=[])

    extracted = outcome.handoff.context
    assert extracted["amount"] == "$49.99"
    assert extracted["issue"] == "double charge"
Testing Boundary Conditions
Test the edge cases where routing logic is most likely to fail: hallucinated targets, ambiguous intent, and topic changes mid-conversation.
def test_invalid_target_agent_raises_error(fake_llm):
    """Routing to an agent that does not exist must fail loudly, not silently."""
    fake_llm.responses = [make_routing_response("nonexistent_agent", "test")]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech"])

    with pytest.raises(ValueError, match="Unknown agent"):
        agent.process("Route me somewhere", history=[])
def test_ambiguous_request_asks_clarification(fake_llm):
    """When the intent is unclear, triage should ask rather than guess."""
    fake_llm.responses = ['{"action": "respond", "response": "Could you clarify?"}']
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech"])

    outcome = agent.process("I have a problem", history=[])

    assert outcome.should_handoff is False
    assert "clarif" in outcome.response.lower()
def test_mid_conversation_rerouting(fake_llm):
    """A topic change mid-conversation should redirect to the new specialist."""
    fake_llm.responses = [make_routing_response("tech", "now a tech issue")]
    agent = TriageAgent(llm=fake_llm, available_agents=["billing", "tech"])
    prior_turns = [
        {"role": "user", "content": "I need a refund"},
        {"role": "assistant", "content": "Let me connect you to billing."},
        {"role": "user", "content": "Actually, my app keeps crashing"},
    ]

    outcome = agent.process("The crash happens on every login", history=prior_turns)

    assert outcome.handoff.target_agent == "tech"
FAQ
How do I test handoffs with the OpenAI Agents SDK?
The Agents SDK models handoffs as special tool calls. Mock the LLM to return a handoff tool call, then verify the runner transfers control to the correct agent and carries the conversation state.
Should handoff tests use real LLMs?
Use mocked LLMs for unit tests of routing logic. Use real LLMs in a small set of integration tests that verify end-to-end handoff flows, especially for ambiguous cases where routing quality depends on prompt wording.
What is the most common handoff bug?
Lost context. The destination agent does not receive the conversation history or extracted entities, so it asks the user to repeat information. Always assert that the handoff object contains the complete conversation history.
#MultiAgent #Handoffs #Routing #Testing #Python #Orchestration #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.