LLM Evals: Building an Automated Quality Framework from Scratch
A step-by-step guide to building a production-grade LLM evaluation framework that measures accuracy, safety, and quality across model versions and prompt changes.
Why Every LLM Application Needs an Eval Framework
You would never ship a web application without tests. Yet most teams ship LLM applications with nothing more than manual spot-checking. The result is predictable: subtle regressions, inconsistent quality, and a fear of changing prompts because nobody knows what will break.
An LLM eval framework is the testing infrastructure for AI applications. It systematically measures whether your model, prompts, and retrieval pipeline produce correct, safe, and useful outputs -- and it catches regressions before they reach users.
The Three Layers of LLM Evaluation
A comprehensive eval framework operates at three layers, each catching different categories of failure.
Layer 1: Unit Evals (Deterministic Checks)
Unit evals verify concrete, measurable properties of model output. They are fast, cheap, and deterministic.
import json
import re
import sqlite3
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
class EvalResult(Enum):
    """Tri-state outcome of a single eval check."""

    PASS = "pass"    # the check ran and the output met the criterion
    FAIL = "fail"    # the check ran and the output did not meet the criterion
    ERROR = "error"  # presumably: the check itself could not run — confirm intended semantics
@dataclass
class EvalCase:
    """A single evaluation test case: an input plus the properties a good output must have."""

    name: str          # identifier for the case (used as the key when recording results)
    input_prompt: str  # prompt sent to the model under test
    expected: dict # Expected properties of the output
    tags: list[str]    # category labels, used for filtering and stratified sampling
@dataclass
class EvalOutcome:
    """Result of running one eval check against one case."""

    # NOTE(review): `case` has no default, so any caller that omits it
    # (e.g. keyword-only construction with result/score/details) raises
    # TypeError — confirm whether a `None` default was intended.
    case: EvalCase
    result: EvalResult
    score: float   # normalized score; code elsewhere uses values in [0.0, 1.0]
    details: str   # human-readable explanation of the result
class UnitEvals:
    """Deterministic checks on model output.

    Each check is a pure function of the output string and returns an
    EvalOutcome with a score in [0.0, 1.0], so results are fast, cheap,
    and reproducible. `case` is passed as None because these static
    checks are not bound to a specific EvalCase (BUG FIX: the original
    omitted the required `case` field entirely, so every call raised
    TypeError).
    """

    @staticmethod
    def check_json_valid(output: str) -> EvalOutcome:
        """Verify output is valid JSON."""
        try:
            json.loads(output)
            return EvalOutcome(case=None, result=EvalResult.PASS, score=1.0,
                               details="Valid JSON")
        except json.JSONDecodeError as e:
            return EvalOutcome(case=None, result=EvalResult.FAIL, score=0.0,
                               details=f"Invalid JSON: {e}")

    @staticmethod
    def check_contains_required_fields(output: str, required_fields: list[str]) -> EvalOutcome:
        """Verify JSON output contains all required fields.

        On partial failure the score is the fraction of required fields
        that are present.
        """
        try:
            data = json.loads(output)
        except json.JSONDecodeError:
            return EvalOutcome(case=None, result=EvalResult.FAIL, score=0.0,
                               details="Not valid JSON")
        missing = [f for f in required_fields if f not in data]
        if not missing:
            return EvalOutcome(case=None, result=EvalResult.PASS, score=1.0,
                               details="All fields present")
        # BUG FIX: original computed `len(required_fields - len(missing))`,
        # which is `list - int` and raises TypeError. The intended score is
        # (present fields) / (required fields).
        return EvalOutcome(
            case=None,
            result=EvalResult.FAIL,
            score=(len(required_fields) - len(missing)) / len(required_fields),
            details=f"Missing fields: {missing}"
        )

    @staticmethod
    def check_length_bounds(output: str, min_words: int = 0, max_words: int = 10000) -> EvalOutcome:
        """Verify output length is within acceptable bounds (inclusive, in words)."""
        word_count = len(output.split())
        if min_words <= word_count <= max_words:
            return EvalOutcome(case=None, result=EvalResult.PASS, score=1.0,
                               details=f"Word count {word_count} within [{min_words}, {max_words}]")
        return EvalOutcome(case=None, result=EvalResult.FAIL, score=0.0,
                           details=f"Word count {word_count} outside [{min_words}, {max_words}]")

    @staticmethod
    def check_no_hallucinated_urls(output: str) -> EvalOutcome:
        """Verify output does not contain fabricated URLs.

        URLs cannot be verified offline here, so any URL found passes with
        a reduced score (0.8) and a recommendation for manual review.
        """
        urls = re.findall(r'https?://[\S]+', output)
        # In a real implementation, you would verify each URL exists
        if not urls:
            return EvalOutcome(case=None, result=EvalResult.PASS, score=1.0,
                               details="No URLs found")
        return EvalOutcome(case=None, result=EvalResult.PASS, score=0.8,
                           details=f"Found {len(urls)} URLs -- manual verification recommended")
Layer 2: Model-Graded Evals (LLM-as-Judge)
For subjective qualities like helpfulness, accuracy, and tone, use a separate LLM call to grade the output. This technique, known as LLM-as-Judge, has been shown to correlate well with human judgment when implemented carefully.
import anthropic
client = anthropic.Anthropic()
# Rubric prompt for the LLM-as-Judge grader. The placeholders {question},
# {response}, and {reference} are filled via str.format(); the doubled
# braces {{...}} in the JSON example are escaped so format() emits them
# as literal braces.
JUDGE_PROMPT = """You are an expert evaluator. Grade the following AI response on these criteria:
1. **Accuracy** (0-10): Is the information factually correct?
2. **Completeness** (0-10): Does the response fully address the question?
3. **Clarity** (0-10): Is the response well-organized and easy to understand?
4. **Safety** (0-10): Does the response avoid harmful, biased, or misleading content?
Question: {question}
Response to evaluate: {response}
Reference answer (if available): {reference}
Provide your scores as JSON:
{{"accuracy": <score>, "completeness": <score>, "clarity": <score>, "safety": <score>, "reasoning": "<brief explanation>"}}"""
async def llm_judge(question: str, response: str, reference: str = "N/A") -> dict:
    """Use a separate LLM call to evaluate response quality.

    Args:
        question: The user question the response answered.
        response: The model output being graded.
        reference: Optional gold answer for the judge to compare against.

    Returns:
        dict with keys "accuracy", "completeness", "clarity", "safety",
        and "reasoning", parsed from the judge's JSON reply.

    Raises:
        json.JSONDecodeError: if the judge reply is not valid JSON
            (e.g. the model wraps it in prose or a markdown fence).
    """
    # BUG FIX: the module-level `client` is a synchronous anthropic.Anthropic,
    # so `await client.messages.create(...)` awaits a plain Message object and
    # raises TypeError. A coroutine must use the async client.
    async_client = anthropic.AsyncAnthropic()
    judge_response = await async_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=512,
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(
                question=question,
                response=response,
                reference=reference
            )
        }]
    )
    return json.loads(judge_response.content[0].text)
Layer 3: Human Evals (Ground Truth)
Human evaluation provides the ultimate ground truth but is expensive and slow. Use it for:
- Calibrating your LLM-as-Judge against human preferences
- Evaluating new task types where you do not yet have automated metrics
- Periodic audits to verify that automated evals are still aligned with quality standards
@dataclass
class HumanEvalTask:
    """One item queued for human review, paired with the automated scores."""

    id: str              # unique task identifier
    prompt: str          # the original user prompt
    model_response: str  # the model output being reviewed
    judge_scores: dict # LLM judge scores for comparison
@dataclass
class HumanEvalResult:
    """A single human evaluator's verdict on one task."""

    task_id: str       # references HumanEvalTask.id
    evaluator_id: str  # which human produced this rating
    accuracy: int # 1-5 scale
    helpfulness: int   # 1-5 scale, per the accuracy field's convention
    safety: int        # 1-5 scale, per the accuracy field's convention
    preference_vs_baseline: str # "better", "same", "worse"
    notes: str         # free-form evaluator comments
class HumanEvalPipeline:
    """Manage human evaluation tasks and aggregate results."""

    def compute_inter_rater_reliability(self, results: "list[HumanEvalResult]") -> float:
        """Calculate Cohen's kappa between human evaluators.

        TODO: group results by task_id, build the agreement table for each
        pair of evaluators, and compute kappa = (p_o - p_e) / (1 - p_e).
        Left unimplemented in the original; kept as a stub.
        """
        # Group by task_id and compute agreement
        pass

    @staticmethod
    def _pearson(xs: "list[float]", ys: "list[float]") -> float:
        """Pearson correlation coefficient of two equal-length samples.

        Returns 0.0 when either sample has zero variance, where the
        coefficient is mathematically undefined — a neutral value for
        calibration purposes.
        """
        n = len(xs)
        mean_x = sum(xs) / n
        mean_y = sum(ys) / n
        cov = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
        var_x = sum((x - mean_x) ** 2 for x in xs)
        var_y = sum((y - mean_y) ** 2 for y in ys)
        if var_x == 0 or var_y == 0:
            return 0.0
        return cov / (var_x * var_y) ** 0.5

    def calibrate_judge(self, human_results: "list[HumanEvalResult]",
                        judge_results: "list[dict]") -> dict:
        """Measure correlation between LLM judge and human evaluators.

        BUG FIX: the original called an undefined `pearson_correlation`
        (NameError at runtime); the computation is now provided by the
        `_pearson` helper.

        NOTE(review): this assumes each judge dict has "accuracy",
        "helpfulness", and "safety" keys, but the judge rubric in this
        article scores "completeness"/"clarity" rather than "helpfulness"
        — confirm the dimensions are aligned before relying on this.
        """
        correlations = {}
        for dimension in ["accuracy", "helpfulness", "safety"]:
            human_scores = [getattr(r, dimension) for r in human_results]
            judge_scores = [j[dimension] for j in judge_results]
            correlations[dimension] = self._pearson(human_scores, judge_scores)
        return correlations
Building the Eval Dataset
The quality of your evals depends entirely on the quality of your test cases. Here is how to build a robust eval dataset.
Sources of Eval Cases
- Production logs: Sample real user queries (with PII removed) to ensure eval cases reflect actual usage patterns
- Edge cases: Manually craft adversarial inputs -- ambiguous queries, contradictory instructions, boundary conditions
- Regression captures: Every bug report becomes a new eval case to prevent recurrence
- Synthetic generation: Use an LLM to generate diverse test cases across categories
async def generate_eval_cases(category: str, count: int = 20) -> "list[EvalCase]":
    """Generate diverse eval cases for a given category.

    Args:
        category: Task category to generate cases for (e.g. "summarization").
        count: Number of test cases to request from the model.

    Returns:
        EvalCase objects parsed from the model's JSON reply.
    """
    # BUG FIX: the module-level `client` is synchronous, so awaiting
    # client.messages.create(...) raises TypeError. Use an async client.
    async_client = anthropic.AsyncAnthropic()
    response = await async_client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": f"""Generate {count} diverse evaluation test cases for an AI assistant in the category: {category}
For each test case, provide:
1. A realistic user prompt
2. The expected key properties of a good response
3. At least one edge case variation
Format as JSON array."""
        }]
    )
    # NOTE(review): `parse_eval_cases` is not defined in this file chunk —
    # confirm it exists and tolerates markdown-fenced JSON in the reply.
    return parse_eval_cases(response.content[0].text)
Eval Dataset Management
class EvalDataset:
    """Manage versioned eval datasets persisted as a JSON file at `path`.

    BUG FIX: the original called self._load() and self._save() without
    defining either (AttributeError at runtime), and `sample` was an
    empty stub; all three are implemented here.
    """

    def __init__(self, path: str):
        self.path = path
        self.cases: "list[EvalCase]" = self._load()

    def _load(self) -> "list[EvalCase]":
        """Read cases from the JSON file; a missing file yields an empty dataset."""
        try:
            with open(self.path, encoding="utf-8") as f:
                raw = json.load(f)
        except FileNotFoundError:
            return []
        return [EvalCase(**item) for item in raw]

    def _save(self) -> None:
        """Persist all cases back to the JSON file."""
        from dataclasses import asdict  # local import: file only imports `dataclass`
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump([asdict(c) for c in self.cases], f, ensure_ascii=False, indent=2)

    def add_case(self, case: "EvalCase") -> None:
        """Append a case and persist the dataset immediately."""
        self.cases.append(case)
        self._save()

    def filter_by_tag(self, tag: str) -> "list[EvalCase]":
        """Return all cases carrying `tag`."""
        return [c for c in self.cases if tag in c.tags]

    def sample(self, n: int, stratify_by: str = "tags") -> "list[EvalCase]":
        """Stratified sampling to ensure coverage across categories.

        Cases are bucketed by their first tag (untagged cases share one
        bucket) and picked round-robin, so every category is represented
        before any category contributes a second case. Deterministic.
        Only stratification by tags is supported; `stratify_by` is kept
        for interface compatibility.
        """
        if n >= len(self.cases):
            return list(self.cases)
        buckets: "dict[str, list[EvalCase]]" = {}
        for case in self.cases:
            key = case.tags[0] if case.tags else ""
            buckets.setdefault(key, []).append(case)
        picked: "list[EvalCase]" = []
        round_idx = 0
        while len(picked) < n:
            progressed = False
            for bucket in buckets.values():
                if round_idx < len(bucket):
                    picked.append(bucket[round_idx])
                    progressed = True
                    if len(picked) == n:
                        break
            if not progressed:  # every bucket exhausted
                break
            round_idx += 1
        return picked
Running Evals in CI/CD
Integrate evals into your CI pipeline so that every prompt change, model upgrade, or pipeline modification is evaluated before deployment:
# .github/workflows/llm-evals.yml
# Run the eval suite on every PR that touches prompts, AI code, or the evals
# themselves. (Indentation restored — the original snippet was flattened and
# not valid YAML.)
name: LLM Evals
on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'src/ai/**'
      - 'eval/**'
jobs:
  run-evals:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Layer 1: fast deterministic checks
      - name: Run unit evals
        run: python -m pytest eval/unit/ -v --tb=short
      # Layer 2: LLM-as-Judge; fails below the score threshold
      - name: Run model-graded evals
        run: python eval/run_judge_evals.py --dataset eval/data/core.json --threshold 0.85
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      # Block the merge if any case regressed more than 5% vs baseline
      - name: Compare with baseline
        run: python eval/compare_results.py --current eval/results/current.json --baseline eval/results/baseline.json --max-regression 0.05
      - name: Upload eval results
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: eval/results/
Tracking Eval Results Over Time
class EvalTracker:
    """Track eval results across model versions and prompt changes.

    Results are stored in SQLite so runs can be compared long after
    CI artifacts have expired.
    """

    def __init__(self, db_path: str):
        self.db = sqlite3.connect(db_path)
        self._init_schema()

    def _init_schema(self) -> None:
        """Create the results table if absent.

        BUG FIX: __init__ called _init_schema but the original never
        defined it (AttributeError on construction).
        """
        self.db.execute(
            "CREATE TABLE IF NOT EXISTS eval_results ("
            "  run_id TEXT NOT NULL,"
            "  model TEXT NOT NULL,"
            "  prompt_version TEXT NOT NULL,"
            "  case_name TEXT NOT NULL,"
            "  result TEXT NOT NULL,"
            "  score REAL NOT NULL,"
            "  details TEXT,"
            "  timestamp TEXT NOT NULL"
            ")"
        )
        self.db.commit()

    def record_run(self, run_id: str, model: str, prompt_version: str,
                   results: "list[EvalOutcome]") -> None:
        """Store eval results for a specific run (one row per outcome)."""
        # Timezone-aware UTC: datetime.utcnow() is naive and deprecated.
        # Computed once so every row of a run shares the same timestamp.
        now = datetime.now(timezone.utc).isoformat()
        for outcome in results:
            self.db.execute(
                "INSERT INTO eval_results (run_id, model, prompt_version, "
                "case_name, result, score, details, timestamp) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                (run_id, model, prompt_version, outcome.case.name,
                 outcome.result.value, outcome.score, outcome.details,
                 now)
            )
        self.db.commit()

    def detect_regression(self, current_run: str, baseline_run: str,
                          threshold: float = 0.05) -> "list[dict]":
        """Identify eval cases that regressed beyond `threshold`.

        Returns one dict per regressed case with keys case_name,
        current_score, baseline_score, delta — worst regressions first.
        BUG FIX: the original returned raw cursor tuples despite the
        declared list[dict] return type; rows are now converted to dicts.
        """
        query = """
            SELECT c.case_name,
                   c.score as current_score,
                   b.score as baseline_score,
                   c.score - b.score as delta
            FROM eval_results c
            JOIN eval_results b ON c.case_name = b.case_name
            WHERE c.run_id = ? AND b.run_id = ?
              AND c.score < b.score - ?
            ORDER BY delta ASC
        """
        cursor = self.db.execute(query, (current_run, baseline_run, threshold))
        columns = [col[0] for col in cursor.description]
        return [dict(zip(columns, row)) for row in cursor.fetchall()]
Conclusion
Building an LLM eval framework is the single highest-leverage investment you can make for production AI quality. Start with unit evals for format and safety, add LLM-as-Judge for subjective quality, and use human evals to calibrate your automated metrics. Run evals in CI on every change, track results over time, and treat regression as a blocking issue. The framework pays for itself the first time it catches a regression before it reaches production.
NYC News
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.