Token Budget Management: Setting and Enforcing Per-User and Per-Request Limits
Build a token budget management system with per-user quotas, per-request limits, enforcement middleware, and graceful degradation. Prevent cost overruns while maintaining service quality for your AI agents.
Why Token Budgets Are Essential
Without token budgets, a single bad prompt or a burst of traffic can consume your entire monthly LLM budget in hours. Unlike traditional API rate limiting (which caps request count), token budgets cap the actual cost driver: token consumption. A rate limit of 100 requests per minute does not prevent a single request from consuming 100,000 tokens.
Token budget management gives you three levels of control: per-request limits (prevent individual runaway calls), per-user quotas (fair resource allocation), and system-wide budgets (total spend caps).
Per-Request Token Limits
from dataclasses import dataclass
from typing import Optional
@dataclass
class TokenBudget:
    """Token ceilings that apply to a single LLM request."""

    # Largest prompt (input) size accepted for one request.
    max_input_tokens: int = 8000
    # Largest completion (output) size allowed for one request.
    max_output_tokens: int = 2000
    # Ceiling on input and output tokens combined.
    max_total_tokens: int = 10000
# Per-request token budgets, keyed by subscription tier name.
TIER_BUDGETS = {
    "free": TokenBudget(
        max_input_tokens=2000,
        max_output_tokens=500,
        max_total_tokens=2500,
    ),
    "pro": TokenBudget(
        max_input_tokens=8000,
        max_output_tokens=2000,
        max_total_tokens=10000,
    ),
    "enterprise": TokenBudget(
        max_input_tokens=32000,
        max_output_tokens=4000,
        max_total_tokens=36000,
    ),
}
class TokenBudgetEnforcer:
    """Check a single request against the per-request budget of its tier."""

    def validate_request(
        self,
        estimated_input_tokens: int,
        tier: str = "pro",
    ) -> dict:
        """Decide whether a request of the given input size may proceed.

        Args:
            estimated_input_tokens: Estimated prompt size, in tokens.
            tier: Key into ``TIER_BUDGETS``; unknown tiers fall back to the
                "free" budget.

        Returns:
            On rejection: ``{"allowed": False, "reason": ..., "suggestion": ...}``.
            On success: ``{"allowed": True, "max_output_tokens": ...,
            "remaining_budget": ...}`` where ``max_output_tokens`` never
            exceeds ``remaining_budget``.
        """
        # Unknown tiers get the "free" budget rather than raising.
        budget = TIER_BUDGETS.get(tier, TIER_BUDGETS["free"])

        # A negative estimate would inflate remaining_budget past the total.
        if estimated_input_tokens < 0:
            return {
                "allowed": False,
                "reason": f"Input tokens ({estimated_input_tokens}) "
                          f"must not be negative",
                "suggestion": "Provide a non-negative token estimate",
            }

        if estimated_input_tokens > budget.max_input_tokens:
            return {
                "allowed": False,
                "reason": f"Input tokens ({estimated_input_tokens}) exceed "
                          f"limit ({budget.max_input_tokens})",
                "suggestion": "Reduce context length or upgrade plan",
            }

        remaining = budget.max_total_tokens - estimated_input_tokens

        # Enforce the total limit too: a budget whose max_total_tokens is
        # smaller than its max_input_tokens would otherwise be "allowed"
        # with a zero or negative remaining budget.
        if remaining <= 0:
            return {
                "allowed": False,
                "reason": f"Input tokens ({estimated_input_tokens}) leave no "
                          f"room within total limit ({budget.max_total_tokens})",
                "suggestion": "Reduce context length or upgrade plan",
            }

        return {
            "allowed": True,
            # Never promise more output than the total budget still permits.
            "max_output_tokens": min(budget.max_output_tokens, remaining),
            "remaining_budget": remaining,
        }
Per-User Quota System
Track cumulative token usage per user with rolling windows (daily, monthly) and enforce quotas.
import time
from collections import defaultdict
from typing import Dict
class UserQuotaManager:
    """Track per-user token usage in memory and enforce rolling quotas.

    Usage is stored as a list of ``{"tokens", "timestamp"}`` entries per
    user and summed over a rolling window on every check.

    NOTE: ``record_usage`` only appends; nothing in this class removes old
    entries, so the per-user lists grow for the lifetime of the instance.
    """

    # Rolling window lengths, in seconds.
    _DAILY_WINDOW = 86400
    _MONTHLY_WINDOW = 86400 * 30

    def __init__(self):
        self.usage: Dict[str, list] = defaultdict(list)
        self.quotas: Dict[str, dict] = {}

    def set_quota(self, user_id: str, daily_tokens: int, monthly_tokens: int):
        """Set (or replace) the daily and monthly token quotas for a user."""
        self.quotas[user_id] = {
            "daily": daily_tokens,
            "monthly": monthly_tokens,
        }

    def record_usage(self, user_id: str, tokens: int):
        """Append a usage entry for ``user_id`` stamped with the current time."""
        self.usage[user_id].append({
            "tokens": tokens,
            "timestamp": time.time(),
        })

    def get_usage(self, user_id: str, window_seconds: int) -> int:
        """Return tokens recorded for ``user_id`` in the last ``window_seconds``."""
        cutoff = time.time() - window_seconds
        # .get() avoids creating an empty defaultdict entry for unknown users.
        entries = self.usage.get(user_id, [])
        return sum(e["tokens"] for e in entries if e["timestamp"] > cutoff)

    def check_quota(self, user_id: str, requested_tokens: int) -> dict:
        """Check whether ``requested_tokens`` fits in the user's quotas.

        Users without an explicit quota get 100,000 tokens per day and
        2,000,000 per 30 days. The daily quota is checked first.

        Returns:
            On rejection: ``{"allowed": False, "reason", "used", "limit",
            "resets_in_seconds"}``. On success: ``{"allowed": True,
            "daily_remaining", "monthly_remaining"}``, both already reduced
            by ``requested_tokens``.
        """
        quota = self.quotas.get(user_id, {"daily": 100_000, "monthly": 2_000_000})
        daily_used = self.get_usage(user_id, self._DAILY_WINDOW)
        monthly_used = self.get_usage(user_id, self._MONTHLY_WINDOW)

        if daily_used + requested_tokens > quota["daily"]:
            return {
                "allowed": False,
                "reason": "daily_quota_exceeded",
                "used": daily_used,
                "limit": quota["daily"],
                "resets_in_seconds": self._next_reset(user_id, self._DAILY_WINDOW),
            }

        if monthly_used + requested_tokens > quota["monthly"]:
            return {
                "allowed": False,
                "reason": "monthly_quota_exceeded",
                "used": monthly_used,
                "limit": quota["monthly"],
                # Same shape as the daily rejection so callers can rely on it.
                "resets_in_seconds": self._next_reset(user_id, self._MONTHLY_WINDOW),
            }

        return {
            "allowed": True,
            "daily_remaining": quota["daily"] - daily_used - requested_tokens,
            "monthly_remaining": quota["monthly"] - monthly_used - requested_tokens,
        }

    def _next_reset(self, user_id: str, window: int) -> int:
        """Return seconds until the oldest entry inside ``window`` expires.

        Returns 0 when the user has no entries inside the window (including
        the case where all recorded entries are older than the window).
        """
        # Sample the clock once so the cutoff and the result are consistent.
        now = time.time()
        cutoff = now - window
        oldest_in_window = min(
            (
                e["timestamp"]
                for e in self.usage.get(user_id, [])
                if e["timestamp"] > cutoff
            ),
            # Without a default, min() raises ValueError on an empty window.
            default=None,
        )
        if oldest_in_window is None:
            return 0
        return int(oldest_in_window + window - now)
FastAPI Middleware for Budget Enforcement
from fastapi import HTTPException, Request
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.responses import JSONResponse
class TokenBudgetMiddleware(BaseHTTPMiddleware):
    """Enforce per-user token quotas on requests under ``/api/agent``.

    The user is identified by the ``X-User-ID`` header and the request size
    by ``X-Estimated-Tokens``. After the request is served, the value of the
    ``X-Tokens-Used`` response header (or the estimate, if absent) is
    recorded against the user.

    NOTE(review): both request headers are client-supplied. Confirm that an
    upstream layer authenticates the user and validates the estimate.
    """

    def __init__(self, app, quota_manager: UserQuotaManager):
        super().__init__(app)
        self.quota_manager = quota_manager

    async def dispatch(self, request: Request, call_next):
        """Reject over-quota requests with 429; otherwise serve and record usage."""
        if not request.url.path.startswith("/api/agent"):
            return await call_next(request)

        user_id = request.headers.get("X-User-ID", "anonymous")

        # The estimate is untrusted input: a non-numeric value would raise
        # ValueError, and a negative one would slip under any quota.
        try:
            estimated_tokens = int(request.headers.get("X-Estimated-Tokens", "1000"))
        except ValueError:
            estimated_tokens = -1
        if estimated_tokens < 0:
            return JSONResponse(
                status_code=400,
                content={
                    "detail": {
                        "error": "invalid_token_estimate",
                        "reason": "X-Estimated-Tokens must be a non-negative integer",
                    },
                },
            )

        check = self.quota_manager.check_quota(user_id, estimated_tokens)
        if not check["allowed"]:
            # Return the response directly: an HTTPException raised inside
            # BaseHTTPMiddleware is not routed through FastAPI's exception
            # handlers and would reach the client as a 500.
            headers = {}
            resets_in = check.get("resets_in_seconds")
            if resets_in is not None:
                headers["Retry-After"] = str(max(0, int(resets_in)))
            return JSONResponse(
                status_code=429,
                # Same body shape HTTPException(detail=...) would produce.
                content={
                    "detail": {
                        "error": "token_quota_exceeded",
                        "reason": check["reason"],
                        "used": check.get("used"),
                        "limit": check.get("limit"),
                    },
                },
                headers=headers,
            )

        response = await call_next(request)

        # The request has already been served; fall back to the estimate
        # rather than failing if the downstream header is malformed.
        try:
            actual_tokens = int(response.headers.get("X-Tokens-Used", estimated_tokens))
        except ValueError:
            actual_tokens = estimated_tokens
        self.quota_manager.record_usage(user_id, actual_tokens)
        return response
Graceful Degradation
When a user approaches their quota, degrade gracefully instead of cutting off service entirely.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
class GracefulDegradation:
    """Pick a cheaper model configuration as a user nears their quota."""

    def __init__(self, quota_manager: "UserQuotaManager"):
        self.quota_manager = quota_manager

    @staticmethod
    def _used_fraction(remaining: int, limit: int) -> float:
        """Return the fraction of ``limit`` already consumed (1 if no limit)."""
        return 1 - (remaining / limit) if limit else 1

    def get_degraded_config(self, user_id: str) -> dict:
        """Return the model configuration appropriate for the user's usage.

        The tier is chosen from the higher of the daily and monthly usage
        fractions: below 70% "full", below 90% "reduced", otherwise
        "minimal". ``max_tokens`` never exceeds the tokens still available.

        Returns:
            ``{"model", "max_tokens", "tier"}``, or ``{"model": None,
            "max_tokens": 0, "message": "Quota exceeded"}`` when the quota
            check fails or no tokens remain.
        """
        exhausted = {"model": None, "max_tokens": 0, "message": "Quota exceeded"}

        # Requesting 0 tokens reports the current headroom without reserving any.
        check = self.quota_manager.check_quota(user_id, 0)
        if not check["allowed"]:
            return exhausted

        quota = self.quota_manager.quotas.get(user_id, {})
        remaining = check.get("daily_remaining", 0)
        usage_pct = self._used_fraction(remaining, quota.get("daily", 100_000))

        # A user close to the monthly quota must degrade too, even when the
        # daily quota is barely touched.
        if "monthly_remaining" in check:
            monthly_remaining = check["monthly_remaining"]
            usage_pct = max(
                usage_pct,
                self._used_fraction(monthly_remaining, quota.get("monthly", 2_000_000)),
            )
            remaining = min(remaining, monthly_remaining)

        # Usage exactly at the limit still passes check_quota(…, 0) but
        # leaves nothing to spend.
        if remaining <= 0:
            return exhausted

        if usage_pct < 0.70:
            model, cap, tier = "gpt-4o", 2000, "full"
        elif usage_pct < 0.90:
            model, cap, tier = "gpt-4o-mini", 1000, "reduced"
        else:
            model, cap, tier = "gpt-4o-mini", 500, "minimal"

        # Never offer more output tokens than the user has left.
        return {"model": model, "max_tokens": min(cap, remaining), "tier": tier}
Budget Alerts
class BudgetAlertSystem:
    """Emit a one-time alert when a user's usage crosses a budget threshold."""

    def __init__(self, thresholds: Optional[list[float]] = None):
        """Create the alert system.

        Args:
            thresholds: Usage ratios (``used / limit``) that trigger an
                alert. ``None`` or an empty list selects 50%, 75%, 90%
                and 100%.
        """
        self.thresholds = thresholds or [0.50, 0.75, 0.90, 1.00]
        # Thresholds already reported, per user.
        self.alerted: dict[str, set] = defaultdict(set)

    def check_alerts(self, user_id: str, used: int, limit: int) -> list[str]:
        """Return alert messages for thresholds newly crossed by ``user_id``.

        Every threshold at or below the current ratio is marked as reported.
        When several are crossed in one call, a single message is returned,
        because the message text does not depend on the threshold. A
        non-positive ``limit`` is treated as fully used.
        """
        ratio = used / limit if limit > 0 else 1.0
        already_alerted = self.alerted[user_id]

        newly_crossed = [
            threshold
            for threshold in self.thresholds
            if ratio >= threshold and threshold not in already_alerted
        ]
        if not newly_crossed:
            return []

        already_alerted.update(newly_crossed)
        return [
            f"User {user_id} has used {ratio:.0%} of token budget "
            f"({used:,} / {limit:,} tokens)"
        ]

    def reset(self, user_id: Optional[str] = None) -> None:
        """Forget reported thresholds so alerts can fire again.

        Intended to be called when a quota window rolls over. With no
        ``user_id``, state is cleared for every user.
        """
        if user_id is None:
            self.alerted.clear()
        else:
            self.alerted.pop(user_id, None)
FAQ
How do I estimate token count before sending a request?
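Use the tiktoken library for accurate counts with OpenAI models: len(tiktoken.encoding_for_model("gpt-4o").encode(text)). For a fast approximation without dependencies, divide the word count by 0.75 (English text averages roughly 0.75 words per token, so 750 words is about 1,000 tokens). The approximation is usually within 10–15% of the actual count for English prose; code and non-English text can deviate more, so use a real tokenizer when the limit is tight.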
Should I enforce budgets on the client side or server side?
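Always enforce on the server side — client-side checks are easily bypassed. You can add client-side estimation for a better user experience (showing remaining quota in the UI), but the server must be the authority. The middleware pattern shown above routes every request under /api/agent through budget validation. Note that it reads the user ID and the token estimate from request headers, so in production you should derive the user from the authenticated session and count tokens on the server rather than trusting client-supplied values.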
How do I handle token budgets for multi-turn conversations?
Track cumulative tokens across the conversation, not just per-message. Each turn adds the full conversation history as input tokens plus the new output. Set a conversation-level budget (for example, 50,000 total tokens) and either summarize history or end the conversation when the budget is reached.
#TokenBudget #RateLimiting #CostControls #Middleware #UsageManagement #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.