Building a Debug Mode for AI Agents: Verbose Logging, Step-Through Execution, and Inspection Tools
Learn how to build a comprehensive debug mode for AI agents with toggle-able verbose logging, step-through execution callbacks, state dumps, and conversation replay capability for efficient troubleshooting.
Every Serious Agent Needs a Debug Mode
Traditional software has debuggers, breakpoints, and step-through execution. AI agents typically have none of these. When something goes wrong, you either stare at logs or add print statements, run it again, and hope the stochastic model reproduces the same issue.
Building a proper debug mode into your agent framework changes everything. A well-designed debug mode lets you watch the agent think in real time, pause at each decision point, inspect the full state, and replay conversations deterministically. This is not a luxury — it is essential infrastructure for any team that ships agents to production.
The Debug Mode Architecture
A debug mode has four capabilities: verbose logging, step callbacks, state dumps, and replay. Here is the core structure:
import hashlib
import json
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable
class DebugLevel(Enum):
    """How much detail debug mode emits; each level includes the ones below it."""

    OFF = 0      # Debugging disabled entirely
    BASIC = 1    # Log agent decisions and tool calls
    VERBOSE = 2  # Log full prompts and responses
    TRACE = 3    # Log everything including internal state
@dataclass
class AgentStep:
    """One recorded unit of agent work (an LLM call, tool call, handoff, or reply)."""

    step_number: int  # Monotonic counter assigned by the caller
    step_type: str  # "llm_call", "tool_call", "handoff", "response"
    agent_name: str  # Which agent produced this step
    input_data: dict  # What went into the step
    output_data: dict = field(default_factory=dict)  # What came out (filled in later)
    duration_ms: float = 0  # Wall-clock duration of the step
    timestamp: float = field(default_factory=time.time)  # Creation time, epoch seconds
class DebugMode:
    """Central debug controller: records steps, prints them, and notifies callbacks.

    Steps are always appended to `self.steps` (even at OFF) so callers can
    produce post-hoc state dumps; printing is gated on the level.
    """

    def __init__(self, level: DebugLevel = DebugLevel.OFF):
        self.level = level
        self.steps: list[AgentStep] = []  # full history of recorded steps
        self.step_callbacks: list[Callable] = []  # async callbacks, run per step
        self.pause_before: set[str] = set()  # Step types to pause on

    def is_enabled(self) -> bool:
        """Return True unless debugging is fully off."""
        return self.level != DebugLevel.OFF

    def add_callback(self, callback: Callable):
        """Register an async callback invoked after every recorded step."""
        self.step_callbacks.append(callback)

    def pause_on(self, step_type: str):
        """Request an interactive pause before steps of the given type."""
        self.pause_before.add(step_type)

    async def record_step(self, step: AgentStep):
        """Store a step, print it when logging is on, and fan out to callbacks."""
        self.steps.append(step)
        if self.level.value >= DebugLevel.BASIC.value:
            self._print_step(step)
        # Callbacks run regardless of level so external tooling can observe
        # steps even when console logging is off.
        for callback in self.step_callbacks:
            await callback(step)

    def _print_step(self, step: AgentStep):
        """Print a one-line summary; at VERBOSE+ also print truncated I/O."""
        prefix = f"[DEBUG][{step.agent_name}][Step {step.step_number}]"
        print(f"{prefix} {step.step_type} ({step.duration_ms:.0f}ms)")
        if self.level.value >= DebugLevel.VERBOSE.value:
            # default=str keeps the debug logger from raising TypeError on
            # non-JSON-serializable payloads (consistent with StateDumper.save);
            # [:500] truncates noisy prompts/responses.
            print(f"{prefix} Input: {json.dumps(step.input_data, indent=2, default=str)[:500]}")
            print(f"{prefix} Output: {json.dumps(step.output_data, indent=2, default=str)[:500]}")
Integrating Debug Mode into Agent Execution
Wire the debug mode into every decision point in your agent loop:
class DebuggableAgent:
    """Agent wrapper that records every LLM call and tool call through DebugMode.

    The wrapped agent's behavior is unchanged when debugging is OFF; the
    wrapper adds step recording, optional interactive pauses, and an optional
    cap on the number of LLM calls.
    """

    def __init__(self, agent, debug: DebugMode = None):
        self.agent = agent
        self.debug = debug or DebugMode()  # defaults to an OFF-level no-op
        self.step_count = 0

    async def run(self, messages: list[dict], tools: list = None, max_steps: int = None):
        """Drive the agent loop until the model returns a final text response.

        Args:
            messages: Conversation history; mutated in place as assistant
                turns and tool results are appended.
            tools: Optional tool schemas forwarded to the LLM.
            max_steps: Optional cap on LLM calls; None means unlimited
                (the original behavior). Guards against runaway loops.

        Returns:
            The agent's final text content.

        Raises:
            RuntimeError: If max_steps is set and the loop exceeds it.
        """
        llm_calls = 0
        while True:
            if max_steps is not None and llm_calls >= max_steps:
                raise RuntimeError(f"Agent exceeded max_steps={max_steps}")
            llm_calls += 1
            self.step_count += 1
            # Step: LLM Call
            step = AgentStep(
                step_number=self.step_count,
                step_type="llm_call",
                agent_name=self.agent.name,
                input_data={
                    "message_count": len(messages),
                    "tool_count": len(tools) if tools else 0,
                },
            )
            if self.debug.is_enabled() and "llm_call" in self.debug.pause_before:
                input("Press Enter to continue to LLM call...")
            start = time.perf_counter()
            response = await self._call_llm(messages, tools)
            step.duration_ms = (time.perf_counter() - start) * 1000
            step.output_data = {
                "has_tool_calls": bool(response.get("tool_calls")),
                "content_length": len(response.get("content", "") or ""),
            }
            await self.debug.record_step(step)
            # Check if agent wants to call tools
            if response.get("tool_calls"):
                # BUG FIX: record the assistant turn so the next LLM call sees
                # which tools were requested. Without this (and the tool-result
                # append in _execute_tool_with_debug) the loop re-sent an
                # identical conversation and could never make progress.
                # NOTE(review): assumes OpenAI-style chat messages — confirm
                # against the _call_llm integration.
                messages.append({
                    "role": "assistant",
                    "content": response.get("content"),
                    "tool_calls": response["tool_calls"],
                })
                for tc in response["tool_calls"]:
                    await self._execute_tool_with_debug(tc, messages)
            else:
                return response.get("content", "")

    async def _execute_tool_with_debug(self, tool_call, messages):
        """Execute one tool call, record it, and append its result to messages."""
        self.step_count += 1
        step = AgentStep(
            step_number=self.step_count,
            step_type="tool_call",
            agent_name=self.agent.name,
            input_data={
                "tool": tool_call["name"],
                "arguments": tool_call["arguments"],
            },
        )
        if self.debug.is_enabled() and "tool_call" in self.debug.pause_before:
            print(f"About to call: {tool_call['name']}")
            print(f"With args: {json.dumps(tool_call['arguments'], indent=2)}")
            input("Press Enter to execute tool call...")
        start = time.perf_counter()
        try:
            result = await self._run_tool(tool_call)
            result_text = str(result)
            step.output_data = {"result": result_text[:500]}
        except Exception as e:
            # Surface failures to both the debug log and the model instead of
            # silently dropping them (the original swallowed the result).
            result_text = f"Tool error: {e}"
            step.output_data = {"error": str(e)}
        step.duration_ms = (time.perf_counter() - start) * 1000
        # BUG FIX: feed the tool result back into the conversation so the next
        # LLM call can use it. tool_call.get("id") hedges on whether the tool
        # registry supplies ids — TODO confirm against the integration.
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.get("id"),
            "content": result_text,
        })
        await self.debug.record_step(step)

    async def _call_llm(self, messages, tools):
        # Placeholder — integrate with your LLM client
        pass

    async def _run_tool(self, tool_call):
        # Placeholder — integrate with your tool registry
        pass
State Dumps for Inspection
A state dump captures the complete agent state at a point in time for post-mortem analysis:
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
class StateDumper:
    """Builds, persists, and reloads point-in-time agent state snapshots."""

    @staticmethod
    def dump(
        agent_name: str,
        messages: list[dict],
        context: dict,
        step_history: list[AgentStep],
    ) -> dict:
        """Assemble a JSON-friendly snapshot of the agent's current state."""
        summary = [
            {
                "n": entry.step_number,
                "type": entry.step_type,
                "agent": entry.agent_name,
                "ms": round(entry.duration_ms),
            }
            for entry in step_history
        ]
        return {
            "agent_name": agent_name,
            "timestamp": time.time(),
            "message_count": len(messages),
            "messages": messages,
            "context_variables": context,
            "steps_taken": len(step_history),
            "step_summary": summary,
        }

    @staticmethod
    def save(snapshot: dict, path: str):
        """Write a snapshot to disk as pretty-printed JSON."""
        with open(path, "w") as handle:
            json.dump(snapshot, handle, indent=2, default=str)
        print(f"State dump saved to {path}")

    @staticmethod
    def load(path: str) -> dict:
        """Read a previously saved snapshot back into a dict."""
        with open(path) as handle:
            return json.load(handle)
Building Replay Capability
Replay lets you re-run a conversation with the same inputs to reproduce issues. The key is recording and replaying LLM responses:
class ConversationRecorder:
    """Records LLM responses and tool results so a conversation can be replayed."""

    def __init__(self):
        self.recording: list[dict] = []  # ordered log of llm_response / tool_result entries

    def record_llm_response(self, messages: list[dict], response: dict):
        """Append an LLM response together with a stable digest of its input.

        BUG FIX: the original used built-in hash(), which is salted per
        process (PYTHONHASHSEED), so a recorded input_hash could never match
        the same messages hashed in a later run — defeating replay
        verification. SHA-256 of the canonical JSON is stable across runs.
        """
        digest = hashlib.sha256(
            json.dumps(messages, sort_keys=True).encode("utf-8")
        ).hexdigest()
        self.recording.append({
            "type": "llm_response",
            "input_hash": digest,
            "response": response,
        })

    def record_tool_result(self, tool_name: str, args: dict, result: Any):
        """Append one tool invocation and its result to the recording."""
        self.recording.append({
            "type": "tool_result",
            "tool": tool_name,
            "args": args,
            "result": result,
        })

    def save(self, path: str):
        """Persist the recording as JSON; default=str stringifies non-JSON values."""
        with open(path, "w") as f:
            json.dump(self.recording, f, indent=2, default=str)
class ConversationReplayer:
def __init__(self, recording_path: str):
with open(recording_path) as f:
self.recording = json.load(f)
self.position = 0
def next_llm_response(self) -> dict | None:
while self.position < len(self.recording):
entry = self.recording[self.position]
self.position += 1
if entry["type"] == "llm_response":
return entry["response"]
return None
def next_tool_result(self) -> Any:
while self.position < len(self.recording):
entry = self.recording[self.position]
self.position += 1
if entry["type"] == "tool_result":
return entry["result"]
return None
Enabling Debug Mode in Production Safely
Debug mode should be available in production but gated behind flags to prevent performance impact:
import os
def get_debug_mode(request_headers: dict = None) -> DebugMode:
    """Resolve the effective debug level from the environment and request headers.

    Precedence: an X-Agent-Debug request header (per-conversation
    troubleshooting) overrides AGENT_DEBUG_LEVEL from the environment.
    Unknown values fall back to OFF, so a typo can never enable debugging.

    Args:
        request_headers: Optional mapping of incoming request headers.

    Returns:
        A DebugMode configured with the resolved level.
    """
    # Environment-level debug. BUG FIX: .upper() so "basic"/"Basic" work —
    # the header path below already normalized case; the env path did not.
    env_level = os.getenv("AGENT_DEBUG_LEVEL", "OFF").upper()
    # Request-level override (for specific troubleshooting)
    if request_headers:
        header_level = request_headers.get("X-Agent-Debug", "").upper()
        if header_level in ("BASIC", "VERBOSE", "TRACE"):
            env_level = header_level
    level = DebugLevel[env_level] if env_level in DebugLevel.__members__ else DebugLevel.OFF
    return DebugMode(level=level)
FAQ
How do I enable debug mode for a single conversation in production without affecting other users?
Use a request-level debug header or a user-level feature flag. Pass X-Agent-Debug: VERBOSE in the request headers to enable debug mode for that specific conversation. Store the debug output in a separate log stream or return it as metadata in the response so it does not interfere with normal logging volume.
Will debug mode add significant latency to agent execution?
At the BASIC level, overhead is negligible — just a few microseconds per step for logging. At VERBOSE level, serializing full prompts and responses adds 1 to 5 milliseconds per step. At TRACE level with state dumps, expect 5 to 20 milliseconds per step. The step-through pause feature should only be used in development, never in production.
How do I make conversation replays deterministic when the LLM is stochastic?
Record the actual LLM responses during the original conversation and replay those exact responses instead of calling the LLM again. This makes replays perfectly deterministic regardless of temperature settings. For testing variations, you can replay with live LLM calls at temperature 0 for near-deterministic behavior while still exercising the full pipeline.
#Debugging #DeveloperTools #AIAgents #Observability #Testing #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.