Claude API in Python: Production Patterns and Best Practices
Production-grade Python patterns for the Claude API. Covers async patterns, connection management, structured outputs, dependency injection, testing with pytest, and deployment strategies for Python-based AI applications.
Setting Up the Python SDK
The Anthropic Python SDK supports both synchronous and asynchronous usage, with built-in retry logic, streaming, and comprehensive type hints.
pip install anthropic
Client Initialization
import os

from anthropic import Anthropic, AsyncAnthropic

# Synchronous client
client = Anthropic()  # Reads ANTHROPIC_API_KEY from environment

# Async client (for FastAPI, aiohttp, etc.)
async_client = AsyncAnthropic()

# Explicit configuration.
# BUG FIX: `import os` was missing, so the os.environ lookup below
# raised NameError when this snippet ran as-is.
client = Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    max_retries=3,
    timeout=120.0,
)
Singleton Pattern for Connection Reuse
from functools import lru_cache
from anthropic import AsyncAnthropic
@lru_cache(maxsize=1)
def get_claude_client() -> AsyncAnthropic:
    """Return the process-wide AsyncAnthropic client.

    lru_cache(maxsize=1) on a zero-argument function acts as a lazy
    singleton: the client (and its HTTP connection pool) is created on
    first use and shared by every caller thereafter.
    """
    shared_client = AsyncAnthropic(
        max_retries=3,
        timeout=120.0,
    )
    return shared_client
Async Patterns with FastAPI
from fastapi import FastAPI, Depends
from pydantic import BaseModel
from anthropic import AsyncAnthropic

# Single shared FastAPI application; the route handlers below attach to it.
app = FastAPI()
class ChatRequest(BaseModel):
    """Request body for the chat endpoints (single-turn: no history)."""
    # The user's message text.
    message: str
    system_prompt: str = "You are a helpful assistant."
    model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096
class ChatResponse(BaseModel):
    """Response body: the completion text plus usage accounting."""
    text: str
    input_tokens: int
    output_tokens: int
    # Model id echoed back from the API response.
    model: str
    # Estimated cost in USD, computed by calculate_cost() from token counts.
    cost_usd: float
def get_client() -> AsyncAnthropic:
    """FastAPI dependency that hands out the cached singleton client."""
    return get_claude_client()
@app.post("/api/chat", response_model=ChatResponse)
async def chat(
    request: ChatRequest,
    client: AsyncAnthropic = Depends(get_client),
):
    """Handle a single-turn chat request, reporting token usage and cost."""
    response = await client.messages.create(
        model=request.model,
        max_tokens=request.max_tokens,
        system=request.system_prompt,
        messages=[{"role": "user", "content": request.message}],
    )
    usage = response.usage
    estimated_cost = calculate_cost(
        response.model, usage.input_tokens, usage.output_tokens
    )
    return ChatResponse(
        text=response.content[0].text,
        input_tokens=usage.input_tokens,
        output_tokens=usage.output_tokens,
        model=response.model,
        cost_usd=estimated_cost,
    )
def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost of a completion from its token counts.

    Rates are (input, output) dollars per million tokens; unknown models
    fall back to the Sonnet rate.
    """
    default_rates = (3.0, 15.0)
    per_million = {
        "claude-sonnet-4-5-20250514": (3.0, 15.0),
        "claude-haiku-4-5-20250514": (1.0, 5.0),
    }
    in_rate, out_rate = per_million.get(model, default_rates)
    return (input_tokens * in_rate + output_tokens * out_rate) / 1_000_000
Structured Output with Pydantic
Use Pydantic models to validate Claude's responses:
from pydantic import BaseModel, Field
from typing import Optional
import json
class SentimentAnalysis(BaseModel):
    """Validated structure for Claude's sentiment-analysis JSON output.

    The Field descriptions double as schema documentation; pydantic
    enforces the types and the 0-1 confidence bound on parse.
    """
    sentiment: str = Field(description="positive, negative, or neutral")
    confidence: float = Field(ge=0, le=1, description="Confidence score 0-1")
    key_phrases: list[str] = Field(description="Phrases that indicate the sentiment")
    summary: str = Field(description="One-sentence summary of the text's tone")
async def analyze_sentiment(text: str) -> SentimentAnalysis:
"""Analyze sentiment with structured, validated output."""
client = get_claude_client()
response = await client.messages.create(
model="claude-haiku-4-5-20250514",
max_tokens=1024,
system="""Analyze the sentiment of the provided text.
Return a JSON object with these exact fields:
- sentiment: "positive", "negative", or "neutral"
- confidence: float between 0 and 1
- key_phrases: array of strings
- summary: one sentence string""",
messages=[{"role": "user", "content": text}],
)
raw = response.content[0].text
# Handle markdown code blocks
if "```json" in raw:
raw = raw.split("```json")[1].split("```")[0]
elif "```" in raw:
raw = raw.split("```")[1].split("```")[0]
data = json.loads(raw.strip())
return SentimentAnalysis(**data)
Streaming with FastAPI
from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json
@app.post("/api/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream a completion to the browser as server-sent events (SSE)."""
    client = get_claude_client()

    async def event_source():
        # Each SSE frame is a `data: <json>` line followed by a blank line.
        async with client.messages.stream(
            model=request.model,
            max_tokens=request.max_tokens,
            system=request.system_prompt,
            messages=[{"role": "user", "content": request.message}],
        ) as stream:
            async for text in stream.text_stream:
                yield f"data: {json.dumps({'text': text})}\n\n"
            # Final frame carries the token-usage accounting.
            final = await stream.get_final_message()
            yield f"data: {json.dumps({'done': True, 'usage': {'input': final.usage.input_tokens, 'output': final.usage.output_tokens}})}\n\n"

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_source(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
Dependency Injection Pattern
from abc import ABC, abstractmethod
from anthropic import AsyncAnthropic
class LLMService(ABC):
    """Abstract interface for LLM services.

    Concrete backends (Claude, mocks, ...) implement these two calls so
    application code can be tested without touching the network.
    """

    @abstractmethod
    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the model's text completion for *messages*."""
        ...

    @abstractmethod
    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ) -> dict:
        """Return a structured (JSON-derived) response."""
        ...
class ClaudeService(LLMService):
    """Claude implementation of the LLM service."""

    def __init__(self, client: AsyncAnthropic, model: str = "claude-sonnet-4-5-20250514"):
        self.client = client
        self.model = model

    @staticmethod
    def _extract_json(raw: str) -> str:
        """Strip markdown code fences from a completion, if present.

        BUG FIX: generate_structured previously called a module-level
        extract_json() that is defined nowhere in this file, raising
        NameError at runtime. The fence-stripping logic now lives here.
        """
        if "```json" in raw:
            raw = raw.split("```json")[1].split("```")[0]
        elif "```" in raw:
            raw = raw.split("```")[1].split("```")[0]
        return raw.strip()

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the text of Claude's reply to *messages*."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            system=system,
            messages=messages,
        )
        return response.content[0].text

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Generate a completion and validate it against *response_model*.

        Returns a *response_model* INSTANCE, not a dict -- the original
        `-> dict` annotation was wrong (the tests access attributes on
        the result), so the misleading annotation has been removed.
        Raises json.JSONDecodeError on malformed JSON.
        """
        import json  # local import keeps this snippet self-contained

        response = await self.generate(messages, system)
        data = json.loads(self._extract_json(response))
        return response_model(**data)
class MockLLMService(LLMService):
    """Mock for testing -- no API calls needed."""

    def __init__(self, responses: dict[str, str] | None = None):
        # FIX: the parameter was annotated `dict[str, str]` while its
        # default was None; the annotation now reflects reality.
        self.responses = responses or {}
        # Record of every generate() call, for assertions in tests.
        self.call_log: list[dict] = []

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the canned response keyed by the last user message."""
        self.call_log.append({"messages": messages, "system": system})
        user_msg = messages[-1]["content"] if messages else ""
        return self.responses.get(user_msg, "Mock response")

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ) -> dict:
        """Parse the canned response as JSON (*response_model* is ignored)."""
        text = await self.generate(messages, system)
        return json.loads(text)
Testing with Pytest
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
@pytest.fixture
def mock_claude():
    """Fixture: a mocked AsyncAnthropic client with one canned reply."""
    canned = MagicMock()
    canned.content = [MagicMock(type="text", text="Test response")]
    canned.usage.input_tokens = 10
    canned.usage.output_tokens = 5
    canned.model = "claude-sonnet-4-5-20250514"
    canned.stop_reason = "end_turn"

    client = AsyncMock(spec=AsyncAnthropic)
    client.messages.create = AsyncMock(return_value=canned)
    return client
@pytest.mark.asyncio
async def test_chat_endpoint(mock_claude):
    """generate() surfaces the mocked completion text via one API call."""
    service = ClaudeService(client=mock_claude)
    reply = await service.generate(
        messages=[{"role": "user", "content": "Hello"}]
    )
    mock_claude.messages.create.assert_called_once()
    assert reply == "Test response"
@pytest.mark.asyncio
async def test_structured_output(mock_claude):
    """generate_structured() parses Claude's JSON into the pydantic model."""
    payload = {
        "sentiment": "positive",
        "confidence": 0.95,
        "key_phrases": ["great", "love it"],
        "summary": "Very positive sentiment."
    }
    mock_claude.messages.create.return_value.content[0].text = json.dumps(payload)
    service = ClaudeService(client=mock_claude)
    analysis = await service.generate_structured(
        messages=[{"role": "user", "content": "I love this product!"}],
        response_model=SentimentAnalysis,
    )
    assert analysis.sentiment == "positive"
    assert analysis.confidence == 0.95
@pytest.mark.asyncio
async def test_error_handling(mock_claude):
    """A RateLimitError raised by the SDK propagates to the caller."""
    from anthropic import RateLimitError

    fake_http_response = MagicMock(status_code=429, headers={"retry-after": "30"})
    mock_claude.messages.create.side_effect = RateLimitError(
        message="Rate limit exceeded",
        response=fake_http_response,
        body={"error": {"message": "Rate limit exceeded"}},
    )
    service = ClaudeService(client=mock_claude)
    with pytest.raises(RateLimitError):
        await service.generate(messages=[{"role": "user", "content": "test"}])
Concurrent Request Management
import asyncio
from asyncio import Semaphore
class ConcurrentClaude:
    """Manage concurrent Claude API calls with a semaphore.

    Allows at most *max_concurrent* in-flight requests and keeps a
    running USD cost total across all completed calls.
    """

    def __init__(self, client: AsyncAnthropic, max_concurrent: int = 10):
        self.client = client
        # Gate: at most max_concurrent messages.create() calls at once.
        self.semaphore = Semaphore(max_concurrent)
        # Accumulated spend in USD. All access happens on one event loop,
        # so plain `+=` is safe without a lock.
        self.total_cost = 0.0

    async def call(self, messages: list[dict], **kwargs) -> str:
        """Make one semaphore-gated call; returns the completion text."""
        async with self.semaphore:
            response = await self.client.messages.create(
                messages=messages,
                model=kwargs.get("model", "claude-sonnet-4-5-20250514"),
                max_tokens=kwargs.get("max_tokens", 4096),
            )
            self.total_cost += calculate_cost(
                response.model,
                response.usage.input_tokens,
                response.usage.output_tokens,
            )
            return response.content[0].text

    async def batch_call(self, tasks: list[dict]) -> list[str | BaseException]:
        """Process multiple tasks concurrently within the semaphore limit.

        FIX: because asyncio.gather runs with return_exceptions=True, a
        failed task contributes its exception object to the result list
        instead of raising; the original `list[str]` annotation hid that.
        Callers should isinstance-check each entry.
        """
        coros = [
            self.call(task["messages"], **task.get("kwargs", {}))
            for task in tasks
        ]
        return await asyncio.gather(*coros, return_exceptions=True)
# Usage
# NOTE(review): this snippet uses top-level `await`, so it only runs inside
# an async function (or a REPL/notebook with top-level-await support), and
# `documents` must be defined by the surrounding application.
concurrent = ConcurrentClaude(get_claude_client(), max_concurrent=5)
results = await concurrent.batch_call([
    {"messages": [{"role": "user", "content": f"Summarize: {doc}"}]}
    for doc in documents
])
# With return_exceptions=True in batch_call, entries of `results` may be
# exception objects rather than strings.
print(f"Total cost: ${concurrent.total_cost:.4f}")
Production Configuration
from pydantic_settings import BaseSettings
class ClaudeSettings(BaseSettings):
    """Runtime configuration loaded from CLAUDE_-prefixed environment variables."""
    # Required -- no default, so startup fails fast if
    # CLAUDE_ANTHROPIC_API_KEY is unset.
    anthropic_api_key: str
    default_model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096
    max_retries: int = 3
    timeout_seconds: float = 120.0
    max_concurrent_requests: int = 10
    # Spending level at which the application should raise a cost alert.
    cost_alert_threshold_usd: float = 100.0

    class Config:
        # Maps fields to env vars such as CLAUDE_DEFAULT_MODEL.
        env_prefix = "CLAUDE_"

# Instantiated at import time: reads the environment once and raises a
# validation error immediately if required settings are missing.
settings = ClaudeSettings()
These patterns form a solid foundation for any Python application that integrates the Claude API. The key principles: use async everywhere, validate structured outputs, inject dependencies for testability, and track costs from day one.
NYC News
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.