Prompt Observability: Logging, Analyzing, and Debugging Prompt Performance
Build comprehensive observability for your AI prompts. Learn structured prompt logging, performance tracking dashboards, failure analysis workflows, and data-driven optimization techniques.
Why Prompt Observability Matters
You cannot improve what you cannot see. Most teams deploy prompts and monitor only high-level API metrics — latency, error rate, token costs. They miss the deeper questions: Which prompts produce the most user complaints? Which test cases regress after a model update? Which conversation patterns cause the agent to go off-track?
Prompt observability means capturing, storing, and analyzing the full lifecycle of every prompt interaction: what was sent, what was received, how long it took, and whether the outcome was successful.
Structured Prompt Logging
Log every prompt interaction with enough context to reconstruct and debug any issue.
import json
import uuid
import time
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
@dataclass
class PromptLog:
    """One self-contained record of a single prompt interaction.

    Designed so any issue can be reconstructed and debugged from the log
    alone: what was sent, what came back, how long it took, and whether
    the call succeeded.
    """

    trace_id: str  # unique per-interaction id (uuid4) for correlating records
    timestamp: str  # ISO-8601 UTC; the first 10 chars (YYYY-MM-DD) select the log file
    agent_name: str  # which agent issued the prompt
    prompt_version: str  # version label of the prompt template in use
    model: str  # model identifier reported by the provider
    system_prompt_hash: str  # truncated SHA-256 of the system prompt, not the text itself
    user_input: str  # the user-facing input (sanitize/mask PII before logging)
    full_prompt_tokens: int  # input-side token count for the whole prompt
    response_text: str  # model output text ("" when the call failed)
    response_tokens: int  # output-side token count
    latency_ms: float  # wall-clock latency of the provider call
    temperature: float  # sampling temperature used for this call
    success: bool  # False when the provider raised
    # Error message when the call failed; None on success.
    # NOTE(review): effectively Optional[str] — annotation understates this.
    error: str = None
    # Free-form extra context (e.g. request ids); defaults to a fresh dict per entry.
    metadata: dict = field(default_factory=dict)
class PromptLogger:
    """Structured JSONL logging for all prompt interactions.

    One file per UTC day (``YYYY-MM-DD.jsonl``) under ``log_dir``; each
    line is a single JSON-serialized ``PromptLog`` entry.
    """

    def __init__(self, log_dir: str = "prompt_logs"):
        self.log_dir = Path(log_dir)
        # parents=True so a nested path like "var/logs/prompts" works too.
        self.log_dir.mkdir(parents=True, exist_ok=True)

    def log(self, entry: "PromptLog") -> None:
        """Append a structured log entry to the day's JSONL file."""
        # Timestamps are ISO-8601, so the first 10 chars are YYYY-MM-DD.
        date_str = entry.timestamp[:10]
        filepath = self.log_dir / f"{date_str}.jsonl"
        # Explicit UTF-8: log files must not depend on the platform default
        # encoding; ensure_ascii=False keeps non-ASCII user input readable.
        with open(filepath, "a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(entry), ensure_ascii=False) + "\n")

    def create_entry(
        self, agent_name: str, prompt_version: str,
        model: str, system_prompt: str,
        user_input: str, response_text: str,
        latency_ms: float, input_tokens: int,
        output_tokens: int, temperature: float,
        success: bool, error: str = None,
        metadata: dict = None,
    ) -> "PromptLog":
        """Build a PromptLog with a fresh trace id and a UTC timestamp.

        The system prompt is stored as a truncated SHA-256 hash rather
        than verbatim: this saves storage while still letting logs be
        correlated with a specific prompt version.
        """
        import hashlib
        return PromptLog(
            trace_id=str(uuid.uuid4()),
            timestamp=datetime.now(timezone.utc).isoformat(),
            agent_name=agent_name,
            prompt_version=prompt_version,
            model=model,
            # 16 hex chars (64 bits) is plenty to distinguish prompt versions.
            system_prompt_hash=hashlib.sha256(
                system_prompt.encode()
            ).hexdigest()[:16],
            user_input=user_input,
            full_prompt_tokens=input_tokens,
            response_text=response_text,
            response_tokens=output_tokens,
            latency_ms=latency_ms,
            temperature=temperature,
            success=success,
            error=error,
            metadata=metadata or {},
        )
Note that we hash the system prompt rather than storing it verbatim in every log entry. This saves storage while still letting you correlate logs with specific prompt versions.
Middleware for Automatic Logging
Wrap your LLM calls so logging happens transparently.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
class ObservableLLMClient:
    """Wraps an LLM provider so every call is logged transparently.

    Successful and failed calls alike produce one PromptLog entry; the
    original exception is always re-raised after logging.
    """

    def __init__(
        self, provider, logger: PromptLogger,
        agent_name: str, prompt_version: str
    ):
        self.provider = provider
        self.logger = logger
        self.agent_name = agent_name
        self.prompt_version = prompt_version

    async def complete(
        self, system_prompt: str, messages: list[dict],
        temperature: float = 0.7, max_tokens: int = 1024,
        metadata: dict = None,
    ):
        """Forward to the provider, logging the interaction either way."""
        started_at = time.monotonic()
        result = None
        failure_text = None
        ok = True
        try:
            result = await self.provider.complete(
                system_prompt=system_prompt,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
        except Exception as exc:
            ok = False
            failure_text = str(exc)
            raise
        finally:
            # The finally block runs on both paths, so even failed calls
            # leave a log entry behind.
            elapsed_ms = (time.monotonic() - started_at) * 1000
            last_input = messages[-1]["content"] if messages else ""
            entry = self.logger.create_entry(
                agent_name=self.agent_name,
                prompt_version=self.prompt_version,
                model=getattr(self.provider, "model", "unknown"),
                system_prompt=system_prompt,
                user_input=last_input,
                response_text=result.text if result else "",
                latency_ms=elapsed_ms,
                input_tokens=result.input_tokens if result else 0,
                output_tokens=result.output_tokens if result else 0,
                temperature=temperature,
                success=ok,
                error=failure_text,
                metadata=metadata,
            )
            self.logger.log(entry)
        return result
Performance Tracking
Aggregate logs into metrics that reveal trends and anomalies.
from collections import defaultdict
class PromptPerformanceTracker:
    """Track and analyze prompt performance over time.

    Reads the per-day JSONL files written by PromptLogger and aggregates
    them into summary metrics.
    """

    def __init__(self, log_dir: str = "prompt_logs"):
        self.log_dir = Path(log_dir)

    def load_logs(
        self, date_range: tuple[str, str] = None,
        agent_name: str = None,
    ) -> list[dict]:
        """Load and filter log entries.

        Args:
            date_range: Inclusive ("YYYY-MM-DD", "YYYY-MM-DD") bounds;
                ISO dates compare correctly as strings.
            agent_name: If given, keep only this agent's entries.
        """
        logs = []
        for filepath in sorted(self.log_dir.glob("*.jsonl")):
            date_str = filepath.stem  # files are named YYYY-MM-DD.jsonl
            if date_range and not (date_range[0] <= date_str <= date_range[1]):
                continue
            # Logs are written as UTF-8; be explicit so reads don't depend
            # on the platform's default encoding.
            for line in filepath.read_text(encoding="utf-8").splitlines():
                if not line.strip():
                    continue
                entry = json.loads(line)
                if agent_name and entry["agent_name"] != agent_name:
                    continue
                logs.append(entry)
        return logs

    def compute_metrics(
        self, logs: list[dict]
    ) -> dict:
        """Compute aggregate performance metrics; returns {} for no logs."""
        if not logs:
            return {}
        total = len(logs)
        successes = sum(1 for entry in logs if entry["success"])
        latencies = sorted(entry["latency_ms"] for entry in logs)
        tokens = [entry["response_tokens"] for entry in logs]

        def percentile(p: float):
            # Nearest-rank style index; clamp defensively so a rounding
            # quirk can never step past the last element.
            return latencies[min(int(total * p), total - 1)]

        return {
            "total_requests": total,
            "success_rate": round(successes / total, 4),
            "avg_latency_ms": round(sum(latencies) / total, 1),
            "p50_latency_ms": percentile(0.50),
            "p95_latency_ms": percentile(0.95),
            "p99_latency_ms": percentile(0.99),
            "avg_output_tokens": round(sum(tokens) / total, 1),
            "total_tokens": sum(tokens),
            "error_count": total - successes,
        }

    def compute_metrics_by_agent(
        self, logs: list[dict]
    ) -> dict[str, dict]:
        """Break down metrics per agent."""
        by_agent = defaultdict(list)
        for entry in logs:
            by_agent[entry["agent_name"]].append(entry)
        return {
            agent: self.compute_metrics(agent_logs)
            for agent, agent_logs in by_agent.items()
        }
Failure Analysis
When things go wrong, structured logs let you diagnose root causes quickly.
class FailureAnalyzer:
    """Analyze and categorize prompt failures."""

    # (substring, category), checked in order; first match wins.
    _ERROR_CATEGORIES = (
        ("timeout", "timeout"),
        ("rate_limit", "rate_limit"),
        ("context_length", "context_overflow"),
        ("invalid", "invalid_request"),
    )

    def analyze_failures(
        self, logs: list[dict]
    ) -> dict:
        """Categorize and summarize failures.

        Tolerates entries whose "error" field is missing OR present with
        value None — PromptLog defaults error to None, so ``.get()`` with
        a fallback alone would still return None and crash on ``.lower()``.
        """
        failures = [entry for entry in logs if not entry["success"]]
        if not failures:
            return {"total_failures": 0}
        error_categories = defaultdict(list)
        for f in failures:
            # `or` coalesces both the missing-key and explicit-None cases.
            error = (f.get("error") or "unknown").lower()
            category = next(
                (cat for needle, cat in self._ERROR_CATEGORIES
                 if needle in error),
                "other",
            )
            error_categories[category].append(f)
        return {
            "total_failures": len(failures),
            "failure_rate": round(
                len(failures) / len(logs), 4
            ),
            "categories": {
                cat: {
                    "count": len(entries),
                    # Truncated samples for quick triage; same None-safety
                    # as above.
                    "sample_errors": [
                        (e.get("error") or "")[:100]
                        for e in entries[:3]
                    ],
                }
                for cat, entries in error_categories.items()
            },
        }

    def find_slow_prompts(
        self, logs: list[dict], threshold_ms: float = 5000
    ) -> list[dict]:
        """Return successful interactions over the latency threshold, slowest first."""
        slow = [
            entry for entry in logs
            if entry["latency_ms"] > threshold_ms and entry["success"]
        ]
        return sorted(
            slow, key=lambda entry: entry["latency_ms"], reverse=True
        )
Optimization Insights
Use observability data to drive prompt improvements.
class PromptOptimizer:
    """Generate optimization recommendations from logs."""

    def analyze_token_efficiency(
        self, logs: list[dict]
    ) -> dict:
        """Identify prompts with high token waste."""
        # Bucket entries by agent + prompt version.
        grouped = defaultdict(list)
        for entry in logs:
            grouped[f"{entry['agent_name']}:{entry['prompt_version']}"].append(entry)

        recommendations = []
        for version_key, entries in grouped.items():
            count = len(entries)
            avg_input = sum(e["full_prompt_tokens"] for e in entries) / count
            avg_output = sum(e["response_tokens"] for e in entries) / count
            ratio = avg_output / avg_input if avg_input else 0
            # Flag only prompts whose output is under 10% of their input.
            if ratio >= 0.1:
                continue
            recommendations.append({
                "prompt": version_key,
                "issue": "Low output-to-input token ratio",
                "detail": f"Avg {avg_input:.0f} input tokens "
                          f"producing only {avg_output:.0f} output "
                          f"tokens ({ratio:.1%} ratio)",
                "suggestion": "Consider reducing system prompt "
                              "length or removing unused context",
            })
        return {
            "total_prompts_analyzed": len(grouped),
            "recommendations": recommendations,
        }

    def detect_prompt_drift(
        self, logs: list[dict], window_days: int = 7
    ) -> list[dict]:
        """Detect changes in prompt behavior over time."""
        from datetime import datetime, timedelta, timezone

        # ISO timestamps compare correctly as strings within one offset style.
        cutoff = (
            datetime.now(timezone.utc) - timedelta(days=window_days)
        ).isoformat()
        recent, older = [], []
        for entry in logs:
            (recent if entry["timestamp"] > cutoff else older).append(entry)
        if not recent or not older:
            return []

        def success_rate(bucket):
            return sum(1 for e in bucket if e["success"]) / len(bucket)

        recent_success = success_rate(recent)
        older_success = success_rate(older)
        drift_signals = []
        # Absolute drop of more than 5 percentage points triggers an alert.
        if older_success - recent_success > 0.05:
            drift_signals.append({
                "metric": "success_rate",
                "older": round(older_success, 4),
                "recent": round(recent_success, 4),
                "drop": round(older_success - recent_success, 4),
                "alert": "Success rate dropped by more than 5%",
            })
        return drift_signals
FAQ
What should I log versus what should I skip?
Log everything needed to reproduce and debug an interaction: the system prompt hash, user input, model response, latency, token counts, and success status. Skip the full system prompt text in every log entry (store it separately, reference by hash). For PII compliance, sanitize or mask user inputs before logging.
How long should I retain prompt logs?
Retain detailed logs for 30-90 days for debugging. Aggregate metrics can be kept indefinitely for trend analysis. After the retention period, compress and archive logs or delete them per your data retention policy. Separate the retention policy for logs containing user data from logs containing only system metrics.
How do I set up alerts for prompt performance issues?
Define alert thresholds based on your baseline metrics: alert when success rate drops below 95%, when p95 latency exceeds 2x the baseline, or when error rate spikes above 5%. Use your existing monitoring stack (Prometheus, Datadog, CloudWatch) to ingest the aggregated metrics and trigger alerts through your on-call workflow.
#Observability #PromptMonitoring #Debugging #AIOps #PerformanceAnalysis #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.