Knowledge Base Management for Support Agents: Keeping AI Answers Up to Date
Learn how to build a knowledge base management system that ingests content, tracks freshness, detects gaps, and ensures your support AI agent always provides accurate and current answers.
The Stale Knowledge Problem
An AI support agent is only as good as its knowledge base. When product features change, pricing updates, or policies shift, the agent continues confidently citing outdated information. Customers get wrong answers delivered with high confidence — the worst possible outcome. A knowledge base management system prevents this by tracking what content exists, when it was last verified, and where gaps are forming.
Content Ingestion Pipeline
The ingestion pipeline normalizes content from multiple sources — help center articles, product docs, internal wikis, release notes — into a uniform format suitable for embedding and retrieval.
import hashlib
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from typing import Optional
class ContentSource(Enum):
    """Closed set of origins a knowledge-base article can be ingested from.

    Each member's value is the lowercase string form of its name.
    """

    HELP_CENTER = "help_center"
    PRODUCT_DOCS = "product_docs"
    INTERNAL_WIKI = "internal_wiki"
    RELEASE_NOTES = "release_notes"
    SUPPORT_MACROS = "support_macros"
def _utc_now_naive() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Stands in for ``datetime.utcnow()``, which is deprecated since
    Python 3.12, while keeping the naive-UTC values that call produced.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _as_naive_utc(moment: datetime) -> datetime:
    """Convert an aware ``datetime`` to naive UTC; pass naive values through."""
    if moment.utcoffset() is None:
        return moment
    return moment.astimezone(timezone.utc).replace(tzinfo=None)


@dataclass
class KBArticle:
    """A single knowledge-base article plus the metadata used to track it.

    ``created_at`` and ``updated_at`` default to the current time as naive
    UTC datetimes; ``verified_at`` stays ``None`` until a caller sets it.
    """

    id: str
    title: str
    content: str
    source: ContentSource
    source_url: str
    content_hash: str
    embedding: Optional[list[float]] = None
    created_at: datetime = field(default_factory=_utc_now_naive)
    updated_at: datetime = field(default_factory=_utc_now_naive)
    verified_at: Optional[datetime] = None
    category: str = ""
    tags: list[str] = field(default_factory=list)

    @property
    def freshness_days(self) -> int:
        """Whole days since the article was last verified, else last updated.

        Timezone-aware timestamps are converted to UTC before subtracting, so
        mixing aware and naive values does not raise ``TypeError``. A
        reference time in the future (e.g. clock skew between machines)
        yields 0 rather than a negative age.
        """
        reference = _as_naive_utc(self.verified_at or self.updated_at)
        elapsed = _utc_now_naive() - reference
        return max(elapsed.days, 0)
class KBIngestionPipeline:
    """Normalize incoming content into ``KBArticle`` records keyed by id.

    ``embed_fn`` is awaited with the article text and its result is stored
    as the article's embedding.
    """

    def __init__(self, embed_fn):
        self.embed_fn = embed_fn
        self.articles: dict[str, KBArticle] = {}

    def compute_hash(self, content: str) -> str:
        """Return the first 16 hex characters of the SHA-256 of ``content``."""
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    async def ingest(
        self,
        id: str,
        title: str,
        content: str,
        source: ContentSource,
        source_url: str,
        category: str = "",
        tags: Optional[list[str]] = None,
    ) -> KBArticle:
        """Store (or refresh) an article and return the stored record.

        * Nothing changed: the existing object is returned untouched.
        * Only metadata changed (title, source, URL, category, tags): a new
          record is stored that reuses the existing embedding and keeps
          ``created_at`` and ``verified_at``. No embedding call is made.
        * Content changed, or the id is new: the content is re-embedded.
          ``created_at`` is carried over for a known id; ``verified_at``
          starts as ``None`` again because the text differs.

        A replaced record is never mutated in place, so references held
        elsewhere (for example in a version history) stay intact.
        """
        content_hash = self.compute_hash(content)
        # Copy so later mutation of the caller's list cannot alter the
        # stored article.
        tag_list = list(tags) if tags else []

        existing = self.articles.get(id)
        content_unchanged = (
            existing is not None and existing.content_hash == content_hash
        )

        if content_unchanged:
            stored_meta = (
                existing.title,
                existing.source,
                existing.source_url,
                existing.category,
                existing.tags,
            )
            if stored_meta == (title, source, source_url, category, tag_list):
                return existing
            # Same text: the previous embedding is still valid.
            embedding = existing.embedding
        else:
            embedding = await self.embed_fn(content)

        # ``updated_at`` takes its default (the current time) on construction.
        article = KBArticle(
            id=id,
            title=title,
            content=content,
            source=source,
            source_url=source_url,
            content_hash=content_hash,
            embedding=embedding,
            category=category,
            tags=tag_list,
        )
        if existing is not None:
            article.created_at = existing.created_at
            if content_unchanged:
                article.verified_at = existing.verified_at

        self.articles[id] = article
        return article
Freshness Scoring
Every article gets a freshness score that decays over time. Content verified last week scores higher than content untouched for six months. The agent uses freshness scores to weigh answers — preferring recent information when multiple articles match.
import math
class FreshnessScorer:
    """Score articles with an exponential decay on their age in days.

    The score is 1.0 for an age of zero days and halves every
    ``half_life_days``.
    """

    def __init__(self, half_life_days: int = 90):
        """Create a scorer.

        Args:
            half_life_days: Age, in days, at which the score drops to 0.5.

        Raises:
            ValueError: If ``half_life_days`` is not positive.
        """
        if half_life_days <= 0:
            raise ValueError("half_life_days must be positive")
        self.half_life = half_life_days

    def score(self, article: KBArticle) -> float:
        """Returns 0.0-1.0 where 1.0 is perfectly fresh."""
        # Clamp so a negative age can never push the score above 1.0.
        days = max(article.freshness_days, 0)
        # 0.5 ** (age / half-life) is exactly 0.5 at one half-life, which the
        # truncated constant 0.693 (an approximation of ln 2) was not.
        return 0.5 ** (days / self.half_life)

    def get_stale_articles(
        self, articles: list[KBArticle], threshold: float = 0.3
    ) -> list[KBArticle]:
        """Return articles scoring below ``threshold``, least fresh first.

        Articles with equal scores keep their input order.
        """
        # Score each article once and reuse the value for filtering and sorting.
        scored = [(self.score(article), article) for article in articles]
        stale = [pair for pair in scored if pair[0] < threshold]
        stale.sort(key=lambda pair: pair[0])
        return [article for _, article in stale]

    def freshness_report(self, articles: list[KBArticle]) -> dict:
        """Summarize a collection of articles by freshness band.

        Bands: ``fresh`` is a score >= 0.7, ``aging`` is 0.3 <= score < 0.7,
        ``stale`` is a score < 0.3. ``avg_score`` is 0 for an empty list.
        """
        scores = [self.score(a) for a in articles]
        fresh = sum(1 for s in scores if s >= 0.7)
        aging = sum(1 for s in scores if 0.3 <= s < 0.7)
        stale = sum(1 for s in scores if s < 0.3)
        return {
            "total": len(articles),
            "fresh": fresh,
            "aging": aging,
            "stale": stale,
            "avg_score": sum(scores) / len(scores) if scores else 0,
        }
Gap Detection
Gap detection identifies topics customers ask about that the knowledge base does not cover. It combines unanswered question tracking with coverage analysis across product categories.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
@dataclass
class KBGap:
    """A recurring topic that unanswered customer queries cluster around."""

    # Cluster key the queries were grouped under.
    topic: str
    # Number of unanswered queries in the cluster.
    query_count: int
    # A handful of the raw queries, kept as examples.
    sample_queries: list[str]
    # Category proposed for the article that would fill the gap.
    suggested_category: str
    # Mean of the best retrieval similarity recorded for each query.
    avg_best_similarity: float
class GapDetector:
    """Record queries the knowledge base could not answer and group them."""

    def __init__(self, pipeline: KBIngestionPipeline, embed_fn):
        self.unanswered: list[dict] = []
        self.embed_fn = embed_fn
        self.pipeline = pipeline

    def log_unanswered(
        self, query: str, best_similarity: float, category: str
    ):
        """Append one unanswered query, stamped with the current UTC time."""
        entry = {
            "query": query,
            "best_similarity": best_similarity,
            "category": category,
            "timestamp": datetime.utcnow().isoformat(),
        }
        self.unanswered.append(entry)

    def detect_gaps(self, min_count: int = 3) -> list[KBGap]:
        """Return clusters holding at least ``min_count`` queries, largest first.

        Queries are clustered on their first four whitespace-separated
        words, lower-cased. Each gap carries up to five sample queries, the
        category of the first query logged in the cluster, and the mean of
        the cluster's ``best_similarity`` values.
        """
        from collections import defaultdict

        grouped = defaultdict(list)
        for entry in self.unanswered:
            prefix = " ".join(entry["query"].lower().split()[:4])
            grouped[prefix].append(entry)

        found = [
            KBGap(
                topic=prefix,
                query_count=len(entries),
                sample_queries=[e["query"] for e in entries[:5]],
                suggested_category=entries[0]["category"],
                avg_best_similarity=(
                    sum(e["best_similarity"] for e in entries) / len(entries)
                ),
            )
            for prefix, entries in grouped.items()
            if len(entries) >= min_count
        ]
        # sorted() is stable, so equally sized clusters keep first-seen order.
        return sorted(found, key=lambda gap: gap.query_count, reverse=True)

    def coverage_report(self) -> dict:
        """Compare article categories against unanswered-query categories.

        A category is "uncovered" when at least one unanswered query carries
        it and no ingested article does.
        """
        covered = {
            article.category for article in self.pipeline.articles.values()
        }
        asked = {entry["category"] for entry in self.unanswered}
        return {
            "covered_categories": sorted(covered),
            "uncovered_categories": sorted(asked - covered),
            "total_unanswered": len(self.unanswered),
            "gaps": self.detect_gaps(),
        }
Versioning Content
When an article is updated, preserve the previous version so you can audit what the agent was telling customers at any point in time.
class VersionedKB:
    """Keep superseded article versions alongside an ingestion pipeline."""

    def __init__(self, pipeline: KBIngestionPipeline):
        self.pipeline = pipeline
        self.versions: dict[str, list[KBArticle]] = {}

    async def update_article(self, id: str, **kwargs) -> KBArticle:
        """Ingest new data for ``id`` and archive the version it replaced.

        ``kwargs`` are forwarded to ``pipeline.ingest``. The previous version
        is archived only after ingestion succeeds and only if the pipeline
        returned a different object. A no-op update or a failed ingest
        therefore leaves the history untouched.
        """
        previous = self.pipeline.articles.get(id)
        current = await self.pipeline.ingest(id=id, **kwargs)
        if previous is not None and current is not previous:
            self.versions.setdefault(id, []).append(previous)
        return current

    def get_history(self, id: str) -> list[KBArticle]:
        """Return the archived versions of ``id``, oldest first.

        The result is a copy, so mutating it does not alter stored history.
        """
        return list(self.versions.get(id, []))
FAQ
How often should I re-embed knowledge base content?
Re-embed whenever the content text changes. Use content hashing (as shown above) to detect actual changes and avoid unnecessary re-embedding. For most teams, a nightly sync job that checks all sources for updates is sufficient. Real-time webhooks from your CMS are better if available.
What freshness half-life should I use?
It depends on how fast your product changes. SaaS products with monthly releases should use a 60-90 day half-life. Stable enterprise products can use 180 days. The key metric is: what percentage of stale articles contained incorrect information when audited? If more than 10%, shorten the half-life.
How do I prioritize which gaps to fill first?
Rank gaps by query volume multiplied by business impact. A gap that affects 50 customers asking about billing has higher priority than one affecting 100 customers asking about a cosmetic feature. Cross-reference with your ticket escalation data — gaps that lead to human escalation cost the most.
#KnowledgeBase #ContentManagement #RAG #SupportAutomation #AIAgents #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.