Response Caching for AI Agents: Semantic Cache, Exact Cache, and TTL Strategies
Build intelligent caching layers for your AI agents using exact-match caches, semantic similarity caches, and time-based invalidation strategies to reduce costs and latency without serving stale responses.
Why Cache LLM Responses
LLM API calls are expensive and slow. A single GPT-4o call costs $2.50 per million input tokens and $10 per million output tokens, and takes 1-5 seconds. If 30% of your users ask variations of the same question, you are paying for the same computation repeatedly.
Caching stores previous LLM responses and serves them for identical or similar future queries. A well-designed cache can reduce LLM API costs by 20-50% and cut response times from seconds to milliseconds for cache hits.
Exact-Match Cache
The simplest cache: hash the input and store the output. If the exact same input appears again, return the cached output.
import hashlib
import json
import time
from typing import Any
class ExactCache:
def __init__(self, redis_client, default_ttl: int = 3600):
self.redis = redis_client
self.default_ttl = default_ttl
def _make_key(self, model: str, messages: list[dict], **kwargs) -> str:
"""Create a deterministic cache key from the request parameters."""
payload = json.dumps(
{"model": model, "messages": messages, **kwargs},
sort_keys=True,
)
return f"llm:exact:{hashlib.sha256(payload.encode()).hexdigest()}"
async def get(self, model: str, messages: list[dict], **kwargs) -> dict | None:
key = self._make_key(model, messages, **kwargs)
cached = await self.redis.get(key)
if cached:
return json.loads(cached)
return None
async def set(
self, model: str, messages: list[dict], response: dict, ttl: int = None, **kwargs
):
key = self._make_key(model, messages, **kwargs)
await self.redis.set(
key,
json.dumps(response),
ex=ttl or self.default_ttl,
)
# Usage with an LLM client
class CachedLLMClient:
    """Wrapper around an OpenAI-style async client that consults an
    ExactCache before issuing a real completion request."""

    def __init__(self, openai_client, cache: ExactCache):
        self.client = openai_client
        self.cache = cache

    async def complete(self, model: str, messages: list[dict], **kwargs) -> str:
        """Return the completion text, serving from cache when possible."""
        # Fast path: an identical request was answered before.
        hit = await self.cache.get(model, messages, **kwargs)
        if hit:
            return hit["content"]

        # Slow path: ask the model, then remember the answer for next time.
        completion = await self.client.chat.completions.create(
            model=model, messages=messages, **kwargs
        )
        text = completion.choices[0].message.content
        await self.cache.set(model, messages, {"content": text}, **kwargs)
        return text
Exact caching works well for deterministic queries like classification, extraction, and structured data processing where the same input always produces the same desired output.
Semantic Cache: Matching Similar Queries
Users rarely ask the exact same question. They ask "What is your return policy?" and "How do I return an item?" and "Can I send something back?" — all meaning the same thing. A semantic cache uses embedding similarity to match these variations.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
import numpy as np
import json
import hashlib
class SemanticCache:
def __init__(self, embedder, redis_client, similarity_threshold: float = 0.92):
self.embedder = embedder
self.redis = redis_client
self.threshold = similarity_threshold
self._embeddings: list[tuple[str, np.ndarray]] = []
async def _load_index(self):
"""Load cached embeddings from Redis into memory."""
keys = await self.redis.keys("llm:semantic:emb:*")
self._embeddings = []
for key in keys:
data = json.loads(await self.redis.get(key))
self._embeddings.append((
data["cache_key"],
np.array(data["embedding"]),
))
async def get(self, query: str) -> dict | None:
query_embedding = await self.embedder.embed(query)
best_key = None
best_score = 0.0
for cache_key, stored_embedding in self._embeddings:
score = np.dot(query_embedding, stored_embedding) / (
np.linalg.norm(query_embedding) * np.linalg.norm(stored_embedding)
)
if score > best_score:
best_score = score
best_key = cache_key
if best_score >= self.threshold and best_key:
cached = await self.redis.get(f"llm:semantic:resp:{best_key}")
if cached:
return json.loads(cached)
return None
async def set(self, query: str, response: dict, ttl: int = 3600):
embedding = await self.embedder.embed(query)
cache_key = hashlib.sha256(query.encode()).hexdigest()[:16]
# Store the embedding for future similarity lookups
await self.redis.set(
f"llm:semantic:emb:{cache_key}",
json.dumps({"cache_key": cache_key, "embedding": embedding.tolist()}),
ex=ttl,
)
# Store the response
await self.redis.set(
f"llm:semantic:resp:{cache_key}",
json.dumps(response),
ex=ttl,
)
self._embeddings.append((cache_key, embedding))
The similarity threshold is critical. Set it too low (0.80) and you serve wrong answers. Set it too high (0.98) and you rarely get cache hits. Start at 0.92 and tune based on your domain.
TTL Strategies: When to Invalidate
Different types of cached data need different expiration strategies.
from enum import Enum
class CacheTTL(Enum):
    """Cache expiry presets, in seconds, keyed by how volatile the data is."""

    FACTUAL = 86400       # static knowledge: rarely changes (24 hours)
    POLICY = 3600         # company-specific: changes occasionally (1 hour)
    PERSONALIZED = 300    # user-specific: changes frequently (5 minutes)
    LIVE_DATA = 30        # real-time data: changes constantly (30 seconds)
class SmartCache:
    """Two-tier cache front end: exact-match lookup first, semantic
    fallback second, plus a heuristic TTL classifier for callers that
    store new entries."""

    def __init__(self, exact_cache: ExactCache, semantic_cache: SemanticCache):
        self.exact = exact_cache
        self.semantic = semantic_cache

    def classify_ttl(self, messages: list[dict]) -> int:
        """Determine appropriate TTL based on query characteristics.

        Returns a CacheTTL value in seconds; defaults to FACTUAL for an
        empty conversation or an unclassified query.
        """
        if not messages:
            # Guard: messages[-1] would raise IndexError on an empty list.
            return CacheTTL.FACTUAL.value
        last_message = messages[-1]["content"].lower()
        words = last_message.split()
        if any(w in last_message for w in ["price", "stock", "available", "weather"]):
            return CacheTTL.LIVE_DATA.value
        # Whole-word check for "my" avoids false positives on words that
        # merely contain it ("summary", "army"); phrases stay substrings.
        elif "my account" in last_message or "my order" in last_message or "my" in words:
            return CacheTTL.PERSONALIZED.value
        elif any(w in last_message for w in ["policy", "return", "shipping"]):
            return CacheTTL.POLICY.value
        else:
            return CacheTTL.FACTUAL.value

    async def get(self, messages: list[dict]) -> dict | None:
        """Look up `messages`: exact cache first (fastest), then semantic."""
        # NOTE(review): the exact-cache model is hard-coded to "gpt-4o";
        # confirm callers always cache under that model name.
        result = await self.exact.get("gpt-4o", messages)
        if result:
            return result
        # Fall back to semantic cache on the latest user message.
        query = messages[-1]["content"]
        return await self.semantic.get(query)
Hit Rate Optimization
Track and optimize your cache hit rate with structured metrics.
from dataclasses import dataclass, field
@dataclass
class CacheMetrics:
    """Running counters for cache effectiveness reporting."""

    exact_hits: int = 0
    semantic_hits: int = 0
    misses: int = 0

    @property
    def total_requests(self) -> int:
        """Every lookup seen so far, hit or miss."""
        return self.exact_hits + self.semantic_hits + self.misses

    @property
    def hit_rate(self) -> float:
        """Fraction of lookups served from either cache (0.0 when idle)."""
        total = self.total_requests
        if total == 0:
            return 0.0
        hits = self.exact_hits + self.semantic_hits
        return hits / total

    @property
    def cost_savings_pct(self) -> float:
        """Estimated LLM-spend reduction, assuming each hit saves one call."""
        return self.hit_rate * 100

    def report(self) -> str:
        """One-line human-readable summary of the counters."""
        return (
            f"Hit rate: {self.hit_rate:.1%} "
            f"(exact: {self.exact_hits}, semantic: {self.semantic_hits}, "
            f"miss: {self.misses}) | "
            f"Est. cost savings: {self.cost_savings_pct:.0f}%"
        )
FAQ
What similarity threshold should I use for semantic caching?
Start with 0.92 for general-purpose agents. For high-stakes domains like medical or legal, use 0.96 or higher to minimize incorrect cache hits. For casual conversational agents, 0.88-0.90 can work well. Monitor your false-positive rate — cases where the cache serves a response that does not actually answer the user's question — and adjust accordingly.
Should I cache streaming responses?
Yes, but cache the complete response after streaming finishes, not the stream itself. On a cache hit, you can either return the full response instantly or simulate streaming by emitting the cached text in chunks with small delays to maintain a consistent UX.
How do I handle cache invalidation when my knowledge base changes?
Use versioned cache keys that include a content hash or version number. When your knowledge base updates, increment the version. Old cache entries expire naturally via TTL while new queries hit the updated knowledge base. For critical updates, implement active invalidation by scanning and deleting affected cache keys.
#Caching #SemanticSearch #Redis #CostOptimization #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.