Measuring AI Support Agent Performance: CSAT, Resolution Rate, and Containment
Learn how to measure AI support agent effectiveness using CSAT scores, resolution rates, containment rates, and improvement loops that drive measurable gains in customer satisfaction.
You Cannot Improve What You Do Not Measure
Deploying an AI support agent without measuring its performance is like launching a product without analytics. You have no idea if customers are getting help, if the agent is making things worse, or where to invest improvement effort. The three metrics that matter most are containment rate (did the AI resolve it without a human?), resolution rate (was the issue actually solved?), and CSAT (was the customer satisfied?).
Defining the Core KPIs
Each metric captures a different dimension of performance. Together, they give a complete picture.
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional
from enum import Enum
class ConversationOutcome(Enum):
    """Terminal state of a completed support conversation."""

    RESOLVED_BY_AI = "resolved_by_ai"          # AI closed the issue, no human involved
    ESCALATED_TO_HUMAN = "escalated_to_human"  # handed off to a human agent
    ABANDONED = "abandoned"                    # customer left before any resolution
    UNRESOLVED = "unresolved"                  # ended without resolution or escalation
@dataclass
class ConversationRecord:
    """One completed conversation as fed into MetricsCollector.record()."""

    id: str
    started_at: datetime
    ended_at: Optional[datetime]          # None presumably means still open/abandoned — TODO confirm with producer
    outcome: ConversationOutcome
    turn_count: int                       # total message exchanges in the conversation
    csat_score: Optional[int]             # 1-5 survey response; None if customer did not answer
    resolution_confirmed: bool            # customer (or agent) confirmed the issue was solved
    escalated: bool
    first_response_ms: int                # latency until the first agent reply, in milliseconds
    intent: str                           # topic/category label used for per-intent breakdowns
    total_duration_seconds: int
@dataclass
class SupportMetrics:
    """Running counters for support conversations plus derived KPI rates.

    All rate properties return 0.0 before any data has been recorded so
    callers never have to guard against division by zero themselves.
    """

    total_conversations: int = 0
    resolved_by_ai: int = 0
    escalated: int = 0
    abandoned: int = 0
    total_csat: float = 0.0
    csat_responses: int = 0
    confirmed_resolutions: int = 0
    total_first_response_ms: int = 0
    total_turns: int = 0

    @staticmethod
    def _ratio(numerator: float, denominator: float) -> float:
        # Single zero-guard shared by every derived rate below.
        return numerator / denominator if denominator else 0.0

    @property
    def containment_rate(self) -> float:
        """Fraction of conversations fully resolved by the AI alone."""
        return self._ratio(self.resolved_by_ai, self.total_conversations)

    @property
    def escalation_rate(self) -> float:
        """Fraction of conversations handed off to a human."""
        return self._ratio(self.escalated, self.total_conversations)

    @property
    def abandonment_rate(self) -> float:
        """Fraction of conversations the customer abandoned."""
        return self._ratio(self.abandoned, self.total_conversations)

    @property
    def avg_csat(self) -> float:
        """Mean CSAT over survey respondents only (non-responses excluded)."""
        return self._ratio(self.total_csat, self.csat_responses)

    @property
    def resolution_rate(self) -> float:
        """Confirmed resolutions over conversations that reached an outcome
        (resolved by AI or escalated); abandoned ones are excluded."""
        return self._ratio(
            self.confirmed_resolutions, self.resolved_by_ai + self.escalated
        )

    @property
    def avg_first_response_ms(self) -> float:
        """Mean first-response latency in milliseconds across all conversations."""
        return self._ratio(self.total_first_response_ms, self.total_conversations)

    @property
    def avg_turns(self) -> float:
        """Mean number of message turns per conversation."""
        return self._ratio(self.total_turns, self.total_conversations)
Collecting Metrics
The metrics collector processes every completed conversation and updates aggregate numbers. It also breaks down metrics by intent category so you can see which topics the agent handles well and which need improvement.
from collections import defaultdict
class MetricsCollector:
    """Aggregates completed conversations into overall and per-intent KPIs."""

    def __init__(self):
        self.overall = SupportMetrics()
        # One SupportMetrics bucket per intent, created lazily on first use.
        self.by_intent: dict[str, SupportMetrics] = defaultdict(
            SupportMetrics
        )

    def record(self, conversation: ConversationRecord) -> None:
        """Fold one completed conversation into both the overall bucket and
        the bucket for its intent."""
        for metrics in [self.overall, self.by_intent[conversation.intent]]:
            metrics.total_conversations += 1
            metrics.total_turns += conversation.turn_count
            metrics.total_first_response_ms += conversation.first_response_ms
            if conversation.outcome == ConversationOutcome.RESOLVED_BY_AI:
                metrics.resolved_by_ai += 1
            elif conversation.outcome == ConversationOutcome.ESCALATED_TO_HUMAN:
                metrics.escalated += 1
            elif conversation.outcome == ConversationOutcome.ABANDONED:
                metrics.abandoned += 1
            if conversation.resolution_confirmed:
                metrics.confirmed_resolutions += 1
            if conversation.csat_score is not None:
                metrics.total_csat += conversation.csat_score
                metrics.csat_responses += 1

    def generate_report(self) -> dict:
        """Return a display-ready snapshot of overall and per-intent KPIs.

        Values are pre-formatted strings meant for dashboards/logs, not for
        further numeric processing.
        """
        report = {
            "overall": {
                "containment_rate": f"{self.overall.containment_rate:.1%}",
                "escalation_rate": f"{self.overall.escalation_rate:.1%}",
                "abandonment_rate": f"{self.overall.abandonment_rate:.1%}",
                "avg_csat": f"{self.overall.avg_csat:.2f}/5.0",
                "resolution_rate": f"{self.overall.resolution_rate:.1%}",
                # BUG FIX: avg_first_response_ms is a float; without a format
                # spec the report emitted strings like "1533.3333333333333ms".
                # Round to whole milliseconds like the other formatted KPIs.
                "avg_first_response": f"{self.overall.avg_first_response_ms:.0f}ms",
                "avg_turns": f"{self.overall.avg_turns:.1f}",
            },
            "by_intent": {},
        }
        for intent, metrics in self.by_intent.items():
            report["by_intent"][intent] = {
                "containment_rate": f"{metrics.containment_rate:.1%}",
                "avg_csat": f"{metrics.avg_csat:.2f}/5.0",
                "volume": metrics.total_conversations,
            }
        return report
Benchmarking Against Targets
Set target thresholds for each metric and track whether the agent is meeting them. This makes it easy to identify areas that need attention.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
@dataclass
class PerformanceTarget:
    # Threshold set for one KPI. Whether higher or lower values are better is
    # decided by the consumer (PerformanceMonitor), not stored here.
    metric: str      # must match a SupportMetrics property name
    target: float    # meeting this value (or better) counts as "healthy"
    warning: float   # between target and warning counts as "warning"
    critical: float  # NOTE(review): not consulted by PerformanceMonitor.evaluate —
                     # anything past `warning` is reported as critical; confirm intent
# Baseline thresholds: rates are fractions (0-1), CSAT is on a 1-5 scale,
# first-response latency is in milliseconds. For abandonment_rate and
# avg_first_response_ms lower is better, so warning/critical values increase.
DEFAULT_TARGETS = [
    PerformanceTarget("containment_rate", 0.70, 0.60, 0.50),
    PerformanceTarget("avg_csat", 4.0, 3.5, 3.0),
    PerformanceTarget("resolution_rate", 0.85, 0.75, 0.65),
    PerformanceTarget("abandonment_rate", 0.10, 0.15, 0.20),
    PerformanceTarget("avg_first_response_ms", 2000, 4000, 6000),
]
class PerformanceMonitor:
    """Evaluates SupportMetrics values against per-KPI target thresholds."""

    def __init__(self, targets: Optional[list[PerformanceTarget]] = None):
        # BUG FIX: the previous `targets or DEFAULT_TARGETS` silently replaced
        # an explicitly supplied empty list with the defaults. Only fall back
        # when the caller passed nothing. (Annotation also corrected: the
        # default is None, so the parameter is Optional.)
        self.targets = DEFAULT_TARGETS if targets is None else targets

    def evaluate(self, metrics: SupportMetrics) -> list[dict]:
        """Return one {metric, value, target, status} dict per target.

        status is "healthy", "warning", or "critical". Note that the
        `critical` threshold on PerformanceTarget is not consulted here:
        anything past the warning threshold is reported as critical.
        """
        results = []
        metric_values = {
            "containment_rate": metrics.containment_rate,
            "avg_csat": metrics.avg_csat,
            "resolution_rate": metrics.resolution_rate,
            "abandonment_rate": metrics.abandonment_rate,
            "avg_first_response_ms": metrics.avg_first_response_ms,
        }
        for target in self.targets:
            # Unknown metric names fall back to 0 rather than raising.
            value = metric_values.get(target.metric, 0)
            # For abandonment_rate and response time, lower is better, so the
            # comparisons are inverted relative to the other KPIs.
            if target.metric in ("abandonment_rate", "avg_first_response_ms"):
                if value <= target.target:
                    status = "healthy"
                elif value <= target.warning:
                    status = "warning"
                else:
                    status = "critical"
            else:
                if value >= target.target:
                    status = "healthy"
                elif value >= target.warning:
                    status = "warning"
                else:
                    status = "critical"
            results.append({
                "metric": target.metric,
                "value": value,
                "target": target.target,
                "status": status,
            })
        return results
Improvement Loops
Metrics without action are just dashboards. Build automated improvement loops that identify the weakest areas and generate actionable recommendations.
class ImprovementEngine:
    """Turns collected metrics into concrete improvement recommendations."""

    def __init__(self, collector: MetricsCollector):
        self.collector = collector

    def identify_weakest_intents(self, top_n: int = 3) -> list[dict]:
        """Rank intents by a blended health score and return the worst top_n.

        Intents with fewer than 10 conversations are skipped as too noisy.
        The score blends containment (40%), normalized CSAT (40%), and the
        complement of abandonment (20%); lower scores are weaker.
        """
        candidates = [
            {
                "intent": name,
                "containment": stats.containment_rate,
                "csat": stats.avg_csat,
                "volume": stats.total_conversations,
                "score": (
                    stats.containment_rate * 0.4
                    + (stats.avg_csat / 5) * 0.4
                    + (1 - stats.abandonment_rate) * 0.2
                ),
            }
            for name, stats in self.collector.by_intent.items()
            if stats.total_conversations >= 10
        ]
        candidates.sort(key=lambda entry: entry["score"])
        return candidates[:top_n]

    def recommend_actions(self) -> list[str]:
        """Produce human-readable action items for the weakest intents."""
        actions: list[str] = []
        for entry in self.identify_weakest_intents():
            name = entry["intent"]
            if entry["containment"] < 0.5:
                actions.append(
                    f"Intent '{name}': Low containment "
                    f"({entry['containment']:.0%}). Review knowledge "
                    f"base coverage and add missing articles."
                )
            if entry["csat"] < 3.5:
                actions.append(
                    f"Intent '{name}': Low CSAT "
                    f"({entry['csat']:.1f}). Review conversation "
                    f"transcripts for tone and accuracy issues."
                )
        return actions
FAQ
What is a good containment rate for an AI support agent?
Industry benchmarks for AI support containment range from 60% to 80%. Start with a 65% target and improve from there. Below 50% means the AI is essentially a receptionist, not a resolver. Above 80% is excellent but verify with CSAT — high containment with low satisfaction means the agent is closing conversations without actually helping.
How do I collect CSAT from AI-handled conversations?
Send a one-question survey at the end of resolved conversations: "How satisfied were you with the support you received? (1-5)". Keep it simple — multi-question surveys get low response rates. You will typically see 15-25% response rates, which is enough for statistically meaningful analysis once you have a few hundred conversations.
How often should I review support agent metrics?
Review the dashboard daily for anomalies (sudden drops in containment or CSAT), weekly for trend analysis, and monthly for strategic improvements. The daily check catches outages and misconfigurations. The weekly review identifies gradual degradation. The monthly review drives knowledge base expansion and model improvements.
#SupportMetrics #CSAT #PerformanceMeasurement #AIAnalytics #CustomerSupport #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.