Building a Continuous Evaluation Pipeline: Automated Agent Quality Monitoring
Learn how to build a continuous evaluation pipeline for AI agents with scheduled evaluations, dashboard integration, alerting on quality drops, and trend analysis over time.
Why Continuous Evaluation Matters
Deploying an AI agent is not the finish line — it is the starting line. Model provider updates, data drift, traffic pattern changes, and dependency updates can all degrade agent quality silently. A continuous evaluation pipeline runs automated assessments on a schedule, detects quality drops early, and alerts your team before users notice problems.
Think of it as application performance monitoring (APM) for AI quality. Just as you monitor latency and error rates, you need to monitor answer correctness, tool-use accuracy, and safety compliance.
Pipeline Architecture
A continuous eval pipeline has four stages: sample test cases from your evaluation suite, evaluate the agent's outputs against them, store the scored results, and alert when metrics cross thresholds.
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
import json
@dataclass
class EvalRun:
    """Snapshot of one evaluation run; persisted for historical trend analysis."""

    run_id: str  # unique identifier for this run, e.g. "eval-20240101-0600-ab12cd"
    timestamp: str  # ISO-8601 string produced by datetime.isoformat()
    model: str  # model the agent ran with, e.g. "gpt-4o"
    prompt_version: str  # prompt revision under test, e.g. "v23"
    total_cases: int  # number of eval cases scored in this run
    scores: dict[str, float]  # aggregate metrics: average, min, max, pass_rate
    failures: list[dict] = field(default_factory=list)  # cases that scored below 3
    metadata: dict = field(default_factory=dict)  # free-form extra run context
@dataclass
class EvalPipeline:
    """Runs every eval case through the agent, judges the outputs, stores the
    aggregated result, and (optionally) triggers alerting."""

    eval_cases: list[dict]  # each case needs "id", "input", "expected" keys
    agent_fn: callable  # async: input -> agent output
    judge_fn: callable  # async: (input, output, expected) -> numeric score
    storage: "EvalStorage"  # must provide async save(EvalRun)
    alerter: Optional["Alerter"] = None  # must provide async check(EvalRun)

    async def run(self, run_id: str, model: str, prompt_version: str) -> "EvalRun":
        """Execute the full suite once and return the persisted EvalRun.

        Raises ValueError if the suite is empty (the aggregate statistics
        below would otherwise divide by zero / call min() on an empty list).
        """
        if not self.eval_cases:
            raise ValueError("eval_cases is empty; nothing to evaluate")

        results = []
        failures = []
        for case in self.eval_cases:
            output = await self.agent_fn(case["input"])
            score = await self.judge_fn(case["input"], output, case["expected"])
            results.append(score)
            # Scores below 3 count as failures (mirrors the pass_rate cutoff).
            if score < 3:
                failures.append({
                    "case_id": case["id"],
                    "input": case["input"],
                    "output": output,
                    "score": score,
                })

        avg_score = sum(results) / len(results)
        eval_run = EvalRun(
            run_id=run_id,
            # NOTE(review): naive local timestamp; consider datetime.now(timezone.utc)
            timestamp=datetime.now().isoformat(),
            model=model,
            prompt_version=prompt_version,
            total_cases=len(results),
            scores={
                "average": round(avg_score, 2),
                "min": min(results),
                "max": max(results),
                "pass_rate": round(
                    sum(1 for s in results if s >= 3) / len(results) * 100, 1
                ),
            },
            failures=failures,
        )

        await self.storage.save(eval_run)
        if self.alerter:
            await self.alerter.check(eval_run)
        return eval_run
Scheduled Evaluations
Run evaluations on a cron schedule using a simple runner script.
# eval_runner.py
import asyncio
import uuid
from datetime import datetime
async def scheduled_eval():
    """Run the evaluation suite once — called by cron or a scheduler.

    Builds the agent, judge, storage, and alerter, then executes the pipeline
    and prints the aggregate scores.
    """
    import os  # local import keeps the runner script self-contained

    from my_agent.core import create_agent
    from my_agent.eval import load_eval_cases, create_judge
    from my_agent.eval.storage import PostgresEvalStorage
    from my_agent.eval.alerts import SlackAlerter

    # Pull deployment-specific secrets from the environment rather than
    # hard-coding them; the fallbacks are placeholders for local testing.
    dsn = os.environ.get("EVAL_DB_DSN", "postgresql://...")
    webhook_url = os.environ.get("EVAL_SLACK_WEBHOOK", "https://hooks.slack.com/...")

    agent = create_agent()
    cases = load_eval_cases("eval_datasets/production_suite.jsonl")
    judge = create_judge(model="gpt-4o")
    storage = PostgresEvalStorage(dsn=dsn)
    alerter = SlackAlerter(webhook_url=webhook_url)

    pipeline = EvalPipeline(
        eval_cases=cases,
        agent_fn=agent.run,
        judge_fn=judge.evaluate,
        storage=storage,
        alerter=alerter,
    )

    # Timestamped run id with a short random suffix to avoid collisions.
    run_id = f"eval-{datetime.now().strftime('%Y%m%d-%H%M')}-{uuid.uuid4().hex[:6]}"
    result = await pipeline.run(
        run_id=run_id,
        model="gpt-4o",
        prompt_version="v23",
    )
    print(f"Eval complete: {result.scores}")


if __name__ == "__main__":
    asyncio.run(scheduled_eval())
Schedule with cron:
# Run evaluation every 6 hours
0 */6 * * * cd /app && python eval_runner.py >> /var/log/eval.log 2>&1
Storing Results for Trend Analysis
Store eval results in a database for historical comparison.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
import asyncpg
from datetime import datetime
class PostgresEvalStorage:
    """Persists EvalRun records in Postgres so runs can be compared over time."""

    def __init__(self, dsn: str):
        self.dsn = dsn
        self.pool = None  # connection pool; created in initialize()

    async def initialize(self):
        """Open the connection pool and create the eval_runs table if absent."""
        self.pool = await asyncpg.create_pool(self.dsn)
        await self.pool.execute("""
            CREATE TABLE IF NOT EXISTS eval_runs (
                run_id TEXT PRIMARY KEY,
                timestamp TIMESTAMPTZ NOT NULL,
                model TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                total_cases INTEGER NOT NULL,
                avg_score FLOAT NOT NULL,
                pass_rate FLOAT NOT NULL,
                min_score INTEGER NOT NULL,
                max_score INTEGER NOT NULL,
                failures JSONB DEFAULT '[]',
                metadata JSONB DEFAULT '{}'
            )
        """)

    async def save(self, run: "EvalRun"):
        """Insert a finished run as one row in eval_runs."""
        # BUG FIX: asyncpg binds TIMESTAMPTZ columns to datetime objects, not
        # strings, so the ISO-8601 string on EvalRun must be parsed first.
        await self.pool.execute(
            """INSERT INTO eval_runs
            (run_id, timestamp, model, prompt_version, total_cases,
            avg_score, pass_rate, min_score, max_score, failures, metadata)
            VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)""",
            run.run_id, datetime.fromisoformat(run.timestamp), run.model,
            run.prompt_version, run.total_cases, run.scores["average"],
            run.scores["pass_rate"], run.scores["min"], run.scores["max"],
            json.dumps(run.failures), json.dumps(run.metadata),
        )

    async def get_trend(self, days: int = 30) -> list[dict]:
        """Return timestamp/avg_score/pass_rate/prompt_version rows for the
        last *days* days, oldest first."""
        # Bind *days* as a query parameter instead of %-formatting it into the
        # SQL string, which is injection-prone.
        rows = await self.pool.fetch(
            """
            SELECT timestamp, avg_score, pass_rate, prompt_version
            FROM eval_runs
            WHERE timestamp > NOW() - make_interval(days => $1)
            ORDER BY timestamp
            """,
            days,
        )
        return [dict(r) for r in rows]
Alerting on Quality Drops
Trigger alerts when metrics cross thresholds or show downward trends.
import httpx
class SlackAlerter:
    """Posts a Slack webhook message when an eval run breaches quality thresholds."""

    def __init__(self, webhook_url: str):
        self.webhook_url = webhook_url

    async def check(self, run: "EvalRun"):
        """Collect threshold breaches for *run* and send an alert if any exist."""
        alerts = []
        if run.scores["average"] < 3.5:
            alerts.append(
                f"Average score dropped to {run.scores['average']:.2f} "
                f"(threshold: 3.5)"
            )
        if run.scores["pass_rate"] < 80:
            alerts.append(
                f"Pass rate dropped to {run.scores['pass_rate']:.1f}% "
                f"(threshold: 80%)"
            )
        # BUG FIX: the original compared len(run.failures) against 30% of
        # itself (true for any nonzero failure count); the intended check is
        # "more than 30% of all cases failed".
        if len(run.failures) > run.total_cases * 0.3:
            alerts.append(
                f"{len(run.failures)} failures out of {run.total_cases} cases"
            )
        if alerts:
            await self._send_alert(run, alerts)

    async def _send_alert(self, run: "EvalRun", alerts: list[str]):
        """Format the breach list as a Slack block-kit message and POST it."""
        message = {
            "text": f"Agent Quality Alert - Run {run.run_id}",
            "blocks": [
                {
                    "type": "section",
                    "text": {
                        "type": "mrkdwn",
                        "text": (
                            f"*Agent Quality Alert*\n"
                            f"Run: `{run.run_id}`\n"
                            f"Model: {run.model} | Prompt: {run.prompt_version}\n"
                            f"Score: {run.scores['average']:.2f} | "
                            f"Pass Rate: {run.scores['pass_rate']:.1f}%\n\n"
                            + "\n".join(f"- {a}" for a in alerts)
                        ),
                    },
                }
            ],
        }
        async with httpx.AsyncClient() as client:
            await client.post(self.webhook_url, json=message)
Trend Analysis
Detect gradual quality degradation by analyzing trends over time.
import statistics
def analyze_trend(scores: list[float], window: int = 7) -> dict:
    """Classify quality movement by comparing two adjacent score windows.

    The mean of the newest *window* scores is compared with the mean of the
    *window* scores just before them; a shift beyond +/-0.3 is reported as
    "improving" or "declining", anything smaller as "stable".
    """
    if len(scores) < 2 * window:
        # Not enough history to fill two full windows.
        return {"trend": "insufficient_data"}

    newer = scores[-window:]
    older = scores[-2 * window:-window]
    newer_mean = statistics.mean(newer)
    older_mean = statistics.mean(older)
    shift = newer_mean - older_mean

    if shift < -0.3:
        label = "declining"
    elif shift > 0.3:
        label = "improving"
    else:
        label = "stable"

    return {
        "trend": label,
        "recent_avg": round(newer_mean, 2),
        "previous_avg": round(older_mean, 2),
        "delta": round(shift, 2),
        "recent_stddev": round(statistics.stdev(newer), 2) if len(newer) > 1 else 0,
    }
Putting It All Together
A production continuous evaluation system combines all of these components with a dashboard for visibility.
# Full pipeline integration
async def main():
    """Wire storage, alerting, and the pipeline together for one full cycle."""
    storage = PostgresEvalStorage(dsn="postgresql://...")
    await storage.initialize()
    alerter = SlackAlerter(webhook_url="https://hooks.slack.com/...")

    pipeline = EvalPipeline(
        eval_cases=load_eval_cases("production_suite.jsonl"),
        agent_fn=create_agent().run,
        judge_fn=create_judge().evaluate,
        storage=storage,
        alerter=alerter,
    )

    # Execute one evaluation pass.
    latest = await pipeline.run("daily-eval", "gpt-4o", "v23")

    # Compare the recent score window against the prior one and escalate
    # gradual degradation that no single run would flag on its own.
    history = await storage.get_trend(days=30)
    trend = analyze_trend([row["avg_score"] for row in history])
    if trend["trend"] == "declining":
        await alerter._send_alert(latest, [
            f"Quality trending down: {trend['delta']:+.2f} over last 14 runs"
        ])
FAQ
How frequently should continuous evaluations run?
Run a core eval suite every 6-12 hours. Run a comprehensive suite (including expensive LLM-as-Judge evaluations) daily. Run lightweight checks (structured output validation, tool-call accuracy) after every deployment.
What is the cost of running continuous evaluations?
A 100-case eval suite with GPT-4o-mini as the agent and GPT-4o as the judge costs roughly one to three dollars per run. At four runs per day, that is roughly 120-360 dollars per month — a small fraction of the cost of production incidents caused by undetected quality drops.
How do I evaluate agents that use RAG or real-time data?
Pin your test data sources during evaluation. Use a snapshot of your vector database and mock real-time APIs to return consistent data. This isolates agent quality from data quality, letting you test each independently.
#ContinuousEvaluation #Monitoring #AIAgents #MLOps #Python #QualityAssurance #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.