Building a Memory Debugger: Inspecting and Modifying Agent Memory State

The Debugging Problem

When an agent gives a wrong answer, the first question is: what did it remember? Without tools to inspect memory state, debugging is guesswork. You stare at logs, try to reconstruct what the agent retrieved, and hope you can reproduce the issue. A proper memory debugger lets you see exactly what the agent knew at any point, search across all memories, and modify state to test hypotheses.

Memory Inspector

The inspector provides a read-only view into the memory store. It answers basic questions: what is stored, how many items exist per category, and what are the most recent entries.

from dataclasses import dataclass
from datetime import datetime
from typing import Any


@dataclass
class MemoryEntry:
    """One stored memory plus the bookkeeping fields the debugger reads."""

    # Identifier reported by the debugger; presumably the same string the
    # entry is keyed under in the store mapping — TODO confirm.
    id: str
    # The remembered text; the debugger searches it by case-insensitive
    # substring and reports its length.
    content: str
    # Grouping label; counted per value in summaries and matched exactly
    # when filtering.
    category: str
    # Creation timestamp; rendered with isoformat() and used for date-range
    # filters and timeline bucketing.
    created_at: datetime
    # Importance score; the debugger's importance editor clamps new values to
    # the range [0.0, 1.0].
    importance: float
    # Access counter; presumably the number of times the memory was
    # retrieved — TODO confirm against the code that updates it.
    access_count: int
    # Free-form extra data; passed through unchanged when an entry is
    # inspected.
    metadata: dict


class MemoryDebugger:
    def __init__(self, store: dict[str, MemoryEntry]):
        self.store = store

    def summary(self) -> dict:
        categories: dict[str, int] = {}
        total_size = 0
        oldest = None
        newest = None

        for entry in self.store.values():
            categories[entry.category] = (
                categories.get(entry.category, 0) + 1
            )
            total_size += len(entry.content)
            if oldest is None or entry.created_at < oldest:
                oldest = entry.created_at
            if newest is None or entry.created_at > newest:
                newest = entry.created_at

        return {
            "total_entries": len(self.store),
            "categories": categories,
            "total_content_size_bytes": total_size,
            "oldest_entry": oldest.isoformat() if oldest else None,
            "newest_entry": newest.isoformat() if newest else None,
        }

    def inspect(self, entry_id: str) -> dict | None:
        entry = self.store.get(entry_id)
        if not entry:
            return None
        return {
            "id": entry.id,
            "content": entry.content,
            "category": entry.category,
            "created_at": entry.created_at.isoformat(),
            "importance": entry.importance,
            "access_count": entry.access_count,
            "content_length": len(entry.content),
            "metadata": entry.metadata,
        }

Search and Filter

The debugger needs powerful search that goes beyond what the agent's normal retrieval uses. The implementation below filters by content substring, category, importance range, creation-date range, and minimum access count; filters on metadata values can be added in the same way.

def search(
    self,
    content_query: str | None = None,
    category: str | None = None,
    min_importance: float | None = None,
    max_importance: float | None = None,
    created_after: datetime | None = None,
    created_before: datetime | None = None,
    min_access_count: int | None = None,
    sort_by: str = "created_at",
    limit: int = 20,
) -> list[dict]:
    results = list(self.store.values())

    if content_query:
        query_lower = content_query.lower()
        results = [
            e for e in results
            if query_lower in e.content.lower()
        ]
    if category:
        results = [
            e for e in results if e.category == category
        ]
    if min_importance is not None:
        results = [
            e for e in results
            if e.importance >= min_importance
        ]
    if max_importance is not None:
        results = [
            e for e in results
            if e.importance <= max_importance
        ]
    if created_after:
        results = [
            e for e in results
            if e.created_at >= created_after
        ]
    if created_before:
        results = [
            e for e in results
            if e.created_at <= created_before
        ]
    if min_access_count is not None:
        results = [
            e for e in results
            if e.access_count >= min_access_count
        ]

    sort_keys = {
        "created_at": lambda e: e.created_at,
        "importance": lambda e: e.importance,
        "access_count": lambda e: e.access_count,
        "content_length": lambda e: len(e.content),
    }
    key_fn = sort_keys.get(sort_by, sort_keys["created_at"])
    results.sort(key=key_fn, reverse=True)

    return [self.inspect(e.id) for e in results[:limit]]

Memory Visualization

A text-based visualization gives a quick overview of memory distribution, highlighting hot spots and dead zones.

def visualize_timeline(
    self, bucket_hours: int = 24
) -> list[dict]:
    """Group memories into fixed-width time buckets for a timeline view.

    Each entry's ``created_at`` is floored to the start of its bucket.
    Buckets are ``bucket_hours`` wide and counted from 0001-01-01 00:00 on
    the entry's own wall clock, so widths that divide 24 (1, 2, 6, 12, 24)
    line up with midnight.

    Args:
        bucket_hours: Bucket width in whole hours; must be at least 1.

    Returns:
        One dict per non-empty bucket, in chronological order, with:
        ``period`` (bucket start, ``YYYY-MM-DD`` for whole-day widths and
        ``YYYY-MM-DD HH:00`` otherwise), ``count``, ``avg_importance``
        (rounded to 2 decimals) and ``bar`` (one ``#`` per entry, capped
        at 50). Empty buckets are not emitted.

    Raises:
        ValueError: If ``bucket_hours`` is less than 1.
    """
    if bucket_hours < 1:
        raise ValueError("bucket_hours must be at least 1")
    if not self.store:
        return []

    # Whole-day buckets always start at midnight, so the date alone names
    # them; any other width needs the starting hour as well.
    label_format = (
        "%Y-%m-%d" if bucket_hours % 24 == 0 else "%Y-%m-%d %H:00"
    )

    buckets: dict[str, list] = {}
    for entry in self.store.values():
        stamp = entry.created_at
        # Whole hours since 0001-01-01 00:00, floored to a bucket boundary.
        hours = (stamp.toordinal() - 1) * 24 + stamp.hour
        hours -= hours % bucket_hours
        bucket_start = datetime.fromordinal(hours // 24 + 1).replace(
            hour=hours % 24
        )
        label = bucket_start.strftime(label_format)
        buckets.setdefault(label, []).append(entry)

    timeline = []
    # Labels are zero-padded, so string order is chronological order.
    for period in sorted(buckets):
        entries = buckets[period]
        avg_importance = sum(e.importance for e in entries) / len(entries)
        timeline.append({
            "period": period,
            "count": len(entries),
            "avg_importance": round(avg_importance, 2),
            "bar": "#" * min(len(entries), 50),
        })
    return timeline


def importance_distribution(self) -> dict:
    """Count entries per importance band.

    An entry lands in the first band whose lower bound it meets, checked
    from highest to lowest, so a score exactly on a boundary counts toward
    the higher band. Anything below 0.2 is counted as minimal.
    """
    tiers = (
        (0.9, "critical (0.9-1.0)"),
        (0.7, "high (0.7-0.9)"),
        (0.4, "medium (0.4-0.7)"),
        (0.2, "low (0.2-0.4)"),
    )
    fallback = "minimal (0-0.2)"

    counts = {label: 0 for _, label in tiers}
    counts[fallback] = 0

    for item in self.store.values():
        score = item.importance
        label = next(
            (name for floor, name in tiers if score >= floor), fallback
        )
        counts[label] += 1
    return counts

Manual Editing

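Sometimes you need to modify memory state directly — to fix incorrect data, test edge cases, or reproduce bugs. The editor provides controlled mutation: each operation returns an audit record describing what changed and why, which the caller should log or persist to build the audit trail.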

See AI Voice Agents Handle Real Calls

Book a free demo or calculate how much you can save with AI voice automation.

Book a Demo ROI Calculator

def edit_content(
    self, entry_id: str, new_content: str, reason: str
) -> dict | None:
    entry = self.store.get(entry_id)
    if not entry:
        return None

    old_content = entry.content
    entry.content = new_content
    return {
        "id": entry_id,
        "old_content": old_content,
        "new_content": new_content,
        "reason": reason,
        "edited_at": datetime.now().isoformat(),
    }


def edit_importance(
    self, entry_id: str, new_importance: float, reason: str
) -> dict | None:
    entry = self.store.get(entry_id)
    if not entry:
        return None

    old_importance = entry.importance
    entry.importance = max(0.0, min(1.0, new_importance))
    return {
        "id": entry_id,
        "old_importance": old_importance,
        "new_importance": entry.importance,
        "reason": reason,
    }


def delete_entry(self, entry_id: str, reason: str) -> dict | None:
    entry = self.store.pop(entry_id, None)
    if not entry:
        return None
    return {
        "deleted_id": entry_id,
        "deleted_content": entry.content[:100],
        "reason": reason,
        "deleted_at": datetime.now().isoformat(),
    }

Testing Utilities

The debugger also serves as a testing tool. You can snapshot memory state, run the agent, and compare before/after states to verify that memory operations work correctly.

import json
from copy import deepcopy


def snapshot(self) -> dict:
    """Capture the comparable fields of every entry.

    Returns:
        A new dict mapping each store key to a dict of that entry's
        ``content``, ``category``, ``importance`` and ``access_count``.
        Other entry fields are not captured.
    """
    tracked_fields = ("content", "category", "importance", "access_count")
    captured = {}
    for key, item in self.store.items():
        captured[key] = {name: getattr(item, name) for name in tracked_fields}
    return captured


def diff_snapshots(
    self, before: dict, after: dict
) -> dict:
    """Compare two snapshots and report what changed.

    Args:
        before: Mapping of entry id to a dict of field values (earlier state).
        after: Mapping of the same shape (later state).

    Returns:
        A dict with:
        - ``added``: ids only in ``after``, in ``after``'s order.
        - ``removed``: ids only in ``before``, in ``before``'s order.
        - ``modified``: one ``{"id", "changes"}`` item per id present in
          both whose values differ, in ``before``'s order. ``changes`` maps
          each differing field to its ``before``/``after`` values; a field
          missing on one side is reported as ``None`` on that side.
        - ``total_changes``: number of added, removed and modified ids.
    """
    # Sentinel so a field that is absent differs from one that stores None.
    missing = object()

    # Walk the dicts instead of building sets so the output order is
    # reproducible from run to run.
    added = [key for key in after if key not in before]
    removed = [key for key in before if key not in after]
    modified = []

    for key, old in before.items():
        if key not in after:
            continue
        new = after[key]
        if old == new:
            continue

        changes = {}
        # Union of both field sets, first-seen order, without duplicates.
        for field in dict.fromkeys([*old, *new]):
            was = old.get(field, missing)
            now = new.get(field, missing)
            if was != now:
                changes[field] = {
                    "before": None if was is missing else was,
                    "after": None if now is missing else now,
                }
        modified.append({"id": key, "changes": changes})

    return {
        "added": added,
        "removed": removed,
        "modified": modified,
        "total_changes": len(added) + len(removed) + len(modified),
    }

Putting It All Together

# Usage during development.
# NOTE(review): `agent` is not defined in this snippet; it is assumed to be an
# existing object whose `memory_store` is the id -> entry mapping — confirm.
debugger = MemoryDebugger(agent.memory_store)

# Quick overview: entry counts, content size, oldest/newest timestamps.
print(debugger.summary())

# Find suspicious entries: importance <= 0.1 and created on or before
# 2026-01-01 (both bounds are inclusive).
# min_access_count=0 only requires access_count >= 0, so it does not narrow
# the results when counts are non-negative.
stale = debugger.search(
    max_importance=0.1,
    min_access_count=0,
    created_before=datetime(2026, 1, 1),
)
# search() returns at most `limit` rows (default 20), so this count is capped.
print(f"Found {len(stale)} potentially stale memories")

# Snapshot before a test
before = debugger.snapshot()

# ... run agent interaction ...

# Take a second snapshot and diff it against the first to see what changed.
after = debugger.snapshot()
changes = debugger.diff_snapshots(before, after)
print(f"Agent made {changes['total_changes']} memory changes")

FAQ

Should the debugger be available in production?

Yes, but behind authentication and with read-only mode as the default. The ability to inspect memory state in production is essential for diagnosing user-reported issues. Restrict the edit and delete operations to admin users and log every mutation.

How do I debug memory retrieval specifically?

Add an explain_retrieval method that returns not just the results but also the score for each signal (recency, relevance, importance) and the final combined score. This shows exactly why certain memories were ranked higher than others for a given query.

Can I use the debugger for automated testing?

Absolutely. Snapshot before, run the agent, snapshot after, then assert on the diff. Verify that specific memories were created, that importance scores fall within expected ranges, and that no unexpected deletions occurred. This catches memory regressions in CI.

#MemoryDebugging #DeveloperTools #AgentTesting #Python #AgenticAI #LearnAI #AIEngineering

Building a Memory Debugger: Inspecting and Modifying Agent Memory State

The Debugging Problem

Memory Inspector

Search and Filter

Memory Visualization

Manual Editing

Testing Utilities

Putting It All Together

FAQ

Should the debugger be available in production?

How do I debug memory retrieval specifically?

Can I use the debugger for automated testing?

Try CallSphere AI Voice Agents

Related Articles

WebArena and Real-World Web Agent Benchmarks: How We Measure Browser Agent Performance

Taking Screenshots and Recording Videos with Playwright for AI Analysis

Playwright Selectors Deep Dive: CSS, XPath, Text, and Role-Based Element Finding