Building a Memory Debugger: Inspecting and Modifying Agent Memory State
Build developer tools for inspecting, visualizing, searching, and manually editing AI agent memory state, enabling effective debugging and testing of memory-dependent behavior.
The Debugging Problem
When an agent gives a wrong answer, the first question is: what did it remember? Without tools to inspect memory state, debugging is guesswork. You stare at logs, try to reconstruct what the agent retrieved, and hope you can reproduce the issue. A proper memory debugger lets you see exactly what the agent knew at any point, search across all memories, and modify state to test hypotheses.
Memory Inspector
The inspector provides a read-only view into the memory store. It answers basic questions: what is stored, how many items exist per category, how much content is held in total, and when the oldest and newest entries were created.
from dataclasses import dataclass
from datetime import datetime
from typing import Any
@dataclass
class MemoryEntry:
    """A single memory record as the debugger sees it."""

    # Identifier of the entry.
    id: str
    # Text payload of the memory.
    content: str
    # Free-form grouping label; the debugger counts entries per category.
    category: str
    # Creation timestamp; used for oldest/newest reporting and date filters.
    created_at: datetime
    # Importance score; the debugger's edit helper clamps it to [0.0, 1.0].
    importance: float
    # Access counter, used by the search filters.
    access_count: int
    # Arbitrary extra attributes attached to the entry.
    metadata: dict[str, Any]
class MemoryDebugger:
def __init__(self, store: dict[str, MemoryEntry]):
self.store = store
def summary(self) -> dict:
categories: dict[str, int] = {}
total_size = 0
oldest = None
newest = None
for entry in self.store.values():
categories[entry.category] = (
categories.get(entry.category, 0) + 1
)
total_size += len(entry.content)
if oldest is None or entry.created_at < oldest:
oldest = entry.created_at
if newest is None or entry.created_at > newest:
newest = entry.created_at
return {
"total_entries": len(self.store),
"categories": categories,
"total_content_size_bytes": total_size,
"oldest_entry": oldest.isoformat() if oldest else None,
"newest_entry": newest.isoformat() if newest else None,
}
def inspect(self, entry_id: str) -> dict | None:
entry = self.store.get(entry_id)
if not entry:
return None
return {
"id": entry.id,
"content": entry.content,
"category": entry.category,
"created_at": entry.created_at.isoformat(),
"importance": entry.importance,
"access_count": entry.access_count,
"content_length": len(entry.content),
"metadata": entry.metadata,
}
Search and Filter
The debugger needs powerful search that goes beyond what the agent's normal retrieval uses. You want to search by content substring, metadata values, date ranges, importance thresholds, and access patterns.
def search(
self,
content_query: str | None = None,
category: str | None = None,
min_importance: float | None = None,
max_importance: float | None = None,
created_after: datetime | None = None,
created_before: datetime | None = None,
min_access_count: int | None = None,
sort_by: str = "created_at",
limit: int = 20,
) -> list[dict]:
results = list(self.store.values())
if content_query:
query_lower = content_query.lower()
results = [
e for e in results
if query_lower in e.content.lower()
]
if category:
results = [
e for e in results if e.category == category
]
if min_importance is not None:
results = [
e for e in results
if e.importance >= min_importance
]
if max_importance is not None:
results = [
e for e in results
if e.importance <= max_importance
]
if created_after:
results = [
e for e in results
if e.created_at >= created_after
]
if created_before:
results = [
e for e in results
if e.created_at <= created_before
]
if min_access_count is not None:
results = [
e for e in results
if e.access_count >= min_access_count
]
sort_keys = {
"created_at": lambda e: e.created_at,
"importance": lambda e: e.importance,
"access_count": lambda e: e.access_count,
"content_length": lambda e: len(e.content),
}
key_fn = sort_keys.get(sort_by, sort_keys["created_at"])
results.sort(key=key_fn, reverse=True)
return [self.inspect(e.id) for e in results[:limit]]
Memory Visualization
A text-based visualization gives a quick overview of memory distribution, highlighting hot spots and dead zones.
def visualize_timeline(
    self, bucket_hours: int = 24
) -> list[dict]:
    """Group memories into time buckets for a text timeline view.

    Args:
        bucket_hours: Width of each bucket in hours. Values below 24 produce
            buckets aligned to midnight of each day (so a width that does not
            divide 24 leaves a shorter last bucket) and are labelled
            ``YYYY-MM-DD HH:00``. Values of 24 or more are rounded down to
            whole days and labelled ``YYYY-MM-DD`` with the first day of the
            bucket. Values below 1 are treated as 1.

    Returns:
        One dict per non-empty bucket in chronological order, with the bucket
        label under ``period``, the entry ``count``, the mean importance
        rounded to two decimals under ``avg_importance``, and a ``bar`` of
        ``#`` characters (one per entry, capped at 50). An empty store yields
        an empty list.
    """
    if not self.store:
        return []

    hours = max(1, int(bucket_hours))
    days = hours // 24

    grouped: dict[str, list] = {}
    for entry in self.store.values():
        stamp = entry.created_at
        if days >= 1:
            # Floor the calendar day to a multiple of `days`; ordinals start
            # at 1, hence the -1/+1 so the result is always a valid ordinal.
            first_day = (stamp.toordinal() - 1) // days * days + 1
            label = datetime.fromordinal(first_day).strftime("%Y-%m-%d")
        else:
            floored = stamp.replace(hour=stamp.hour - stamp.hour % hours)
            label = floored.strftime("%Y-%m-%d %H:00")
        grouped.setdefault(label, []).append(entry)

    timeline = []
    # The labels are zero-padded, so sorting the strings sorts by time.
    for period, entries in sorted(grouped.items()):
        avg_importance = sum(e.importance for e in entries) / len(entries)
        timeline.append({
            "period": period,
            "count": len(entries),
            "avg_importance": round(avg_importance, 2),
            "bar": "#" * min(len(entries), 50),
        })
    return timeline
def importance_distribution(self) -> dict:
    """Count stored entries per importance tier.

    Each entry lands in the first tier whose lower bound it reaches, checked
    from the highest tier down; anything below 0.2 counts as minimal.
    """
    tiers = (
        (0.9, "critical (0.9-1.0)"),
        (0.7, "high (0.7-0.9)"),
        (0.4, "medium (0.4-0.7)"),
        (0.2, "low (0.2-0.4)"),
    )
    fallback = "minimal (0-0.2)"

    # Pre-seed every tier so empty ones still show up with a zero count.
    counts = {label: 0 for _, label in tiers}
    counts[fallback] = 0

    for entry in self.store.values():
        score = entry.importance
        label = next(
            (name for floor, name in tiers if score >= floor), fallback
        )
        counts[label] += 1
    return counts
Manual Editing
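Sometimes you need to modify memory state directly — to fix incorrect data, test edge cases, or reproduce bugs. The editor provides controlled mutation: every edit takes a reason and returns a record describing the change, which you can log or persist to build an audit trail.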
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
def edit_content(
self, entry_id: str, new_content: str, reason: str
) -> dict | None:
entry = self.store.get(entry_id)
if not entry:
return None
old_content = entry.content
entry.content = new_content
return {
"id": entry_id,
"old_content": old_content,
"new_content": new_content,
"reason": reason,
"edited_at": datetime.now().isoformat(),
}
def edit_importance(
self, entry_id: str, new_importance: float, reason: str
) -> dict | None:
entry = self.store.get(entry_id)
if not entry:
return None
old_importance = entry.importance
entry.importance = max(0.0, min(1.0, new_importance))
return {
"id": entry_id,
"old_importance": old_importance,
"new_importance": entry.importance,
"reason": reason,
}
def delete_entry(self, entry_id: str, reason: str) -> dict | None:
entry = self.store.pop(entry_id, None)
if not entry:
return None
return {
"deleted_id": entry_id,
"deleted_content": entry.content[:100],
"reason": reason,
"deleted_at": datetime.now().isoformat(),
}
Testing Utilities
The debugger also serves as a testing tool. You can snapshot memory state, run the agent, and compare before/after states to verify that memory operations work correctly.
import json
from copy import deepcopy
def snapshot(self) -> dict:
    """Capture the comparable fields of every entry, keyed by store key.

    Only content, category, importance and access count are recorded.
    """
    tracked = ("content", "category", "importance", "access_count")
    state = {}
    for key, item in self.store.items():
        state[key] = {name: getattr(item, name) for name in tracked}
    return state
def diff_snapshots(
    self, before: dict, after: dict
) -> dict:
    """Compare two snapshots and report what changed between them.

    Args:
        before: Earlier snapshot, mapping entry key to a dict of fields.
        after: Later snapshot in the same shape.

    Returns:
        A dict with ``added`` (keys only in *after*, in *after*'s order),
        ``removed`` (keys only in *before*, in *before*'s order),
        ``modified`` (one ``{"id", "changes"}`` item per key whose fields
        differ, in *before*'s order) and ``total_changes``. Each change maps
        a field name to its ``before`` and ``after`` values; a field that is
        absent on one side is reported as ``None`` on that side.
    """
    # Walk the dicts instead of using set arithmetic so the output order is
    # deterministic from run to run.
    added = [key for key in after if key not in before]
    removed = [key for key in before if key not in after]

    missing = object()  # distinguishes "field absent" from a stored None
    modified = []
    for key, old in before.items():
        if key not in after:
            continue
        new = after[key]
        if old == new:
            continue
        # Include fields that exist on only one side; indexing both sides by
        # the earlier snapshot's fields alone would raise or miss them.
        fields = list(old) + [name for name in new if name not in old]
        modified.append({
            "id": key,
            "changes": {
                name: {"before": old.get(name), "after": new.get(name)}
                for name in fields
                if old.get(name, missing) != new.get(name, missing)
            },
        })

    return {
        "added": added,
        "removed": removed,
        "modified": modified,
        "total_changes": len(added) + len(removed) + len(modified),
    }
Putting It All Together
# Usage during development
# NOTE(review): `agent` is not defined in this snippet; it is assumed to be an
# existing object whose `memory_store` is the id -> entry mapping the
# debugger expects.
debugger = MemoryDebugger(agent.memory_store)
# Quick overview
print(debugger.summary())
# Find suspicious entries
# NOTE(review): min_access_count=0 keeps every entry whose access_count is
# >= 0, so (assuming counts are never negative) it does not narrow the result.
# search() also returns at most `limit` entries (20 by default), so the count
# printed below is capped at 20.
stale = debugger.search(
    max_importance=0.1,
    min_access_count=0,
    created_before=datetime(2026, 1, 1),
)
print(f"Found {len(stale)} potentially stale memories")
# Snapshot before a test
before = debugger.snapshot()
# ... run agent interaction ...
# Compare what changed
after = debugger.snapshot()
changes = debugger.diff_snapshots(before, after)
print(f"Agent made {changes['total_changes']} memory changes")
FAQ
Should the debugger be available in production?
Yes, but behind authentication and with read-only mode as the default. The ability to inspect memory state in production is essential for diagnosing user-reported issues. Restrict the edit and delete operations to admin users and log every mutation.
How do I debug memory retrieval specifically?
Add an `explain_retrieval` method that returns not just the results but also the score for each signal (recency, relevance, importance) and the final combined score. This shows exactly why certain memories were ranked higher than others for a given query.
Can I use the debugger for automated testing?
Absolutely. Snapshot before, run the agent, snapshot after, then assert on the diff. Verify that specific memories were created, that importance scores fall within expected ranges, and that no unexpected deletions occurred. This catches memory regressions in CI.
#MemoryDebugging #DeveloperTools #AgentTesting #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.