Skip to content
Agentic AI
Agentic AI · 6 min read · 8 views

Claude API in Python: Production Patterns and Best Practices

Production-grade Python patterns for the Claude API. Covers async patterns, connection management, structured outputs, dependency injection, testing with pytest, and deployment strategies for Python-based AI applications.

Setting Up the Python SDK

The Anthropic Python SDK supports both synchronous and asynchronous usage, with built-in retry logic, streaming, and comprehensive type hints.

pip install anthropic

Client Initialization

import os

from anthropic import Anthropic, AsyncAnthropic

# Synchronous client
client = Anthropic()  # Reads ANTHROPIC_API_KEY from environment

# Async client (for FastAPI, aiohttp, etc.)
async_client = AsyncAnthropic()

# Explicit configuration.
# Fix: the original snippet used os.environ without importing os.
client = Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],  # KeyError here = fail fast on missing key
    max_retries=3,    # SDK-level automatic retries
    timeout=120.0,    # seconds; generous for long generations
)

Singleton Pattern for Connection Reuse

from functools import lru_cache
from anthropic import AsyncAnthropic

@lru_cache(maxsize=1)
def get_claude_client() -> AsyncAnthropic:
    """Return the process-wide async Claude client.

    ``lru_cache(maxsize=1)`` turns this factory into a lazy singleton, so
    every caller shares one client and its HTTP connection pool.
    """
    shared_client = AsyncAnthropic(
        timeout=120.0,
        max_retries=3,
    )
    return shared_client

Async Patterns with FastAPI

from fastapi import FastAPI, Depends
from pydantic import BaseModel
from anthropic import AsyncAnthropic

app = FastAPI()  # module-level ASGI application instance

class ChatRequest(BaseModel):
    """Incoming chat payload; defaults cover the common single-turn case."""
    message: str
    system_prompt: str = "You are a helpful assistant."
    model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096

class ChatResponse(BaseModel):
    """Chat reply plus token usage and the computed request cost in USD."""
    text: str
    input_tokens: int
    output_tokens: int
    model: str
    cost_usd: float

def get_client() -> AsyncAnthropic:
    """FastAPI dependency wrapping the singleton client factory."""
    shared = get_claude_client()
    return shared

@app.post("/api/chat", response_model=ChatResponse)
async def chat(
    request: ChatRequest,
    client: AsyncAnthropic = Depends(get_client),
):
    """Single-turn chat: forward the prompt to Claude, report usage and cost."""
    result = await client.messages.create(
        model=request.model,
        max_tokens=request.max_tokens,
        system=request.system_prompt,
        messages=[{"role": "user", "content": request.message}],
    )

    usage = result.usage
    # With no tools configured, the first content block is the text reply.
    reply_text = result.content[0].text
    request_cost = calculate_cost(result.model, usage.input_tokens, usage.output_tokens)

    return ChatResponse(
        text=reply_text,
        input_tokens=usage.input_tokens,
        output_tokens=usage.output_tokens,
        model=result.model,
        cost_usd=request_cost,
    )

def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of a single request.

    Rates are (input, output) dollars per million tokens; unrecognized
    model IDs fall back to Sonnet pricing.
    """
    per_million = {
        "claude-sonnet-4-5-20250514": (3.0, 15.0),
        "claude-haiku-4-5-20250514": (1.0, 5.0),
    }
    in_rate, out_rate = per_million.get(model, (3.0, 15.0))
    total = input_tokens * in_rate + output_tokens * out_rate
    return total / 1_000_000

Structured Output with Pydantic

Use Pydantic models to validate Claude's responses:

flowchart TD
    START["Claude API in Python: Production Patterns and Bes…"] --> A
    A["Setting Up the Python SDK"]
    A --> B
    B["Async Patterns with FastAPI"]
    B --> C
    C["Structured Output with Pydantic"]
    C --> D
    D["Streaming with FastAPI"]
    D --> E
    E["Dependency Injection Pattern"]
    E --> F
    F["Testing with Pytest"]
    F --> G
    G["Concurrent Request Management"]
    G --> H
    H["Production Configuration"]
    H --> DONE["Key Takeaways"]
    style START fill:#4f46e5,stroke:#4338ca,color:#fff
    style DONE fill:#059669,stroke:#047857,color:#fff
from pydantic import BaseModel, Field
from typing import Optional
import json

class SentimentAnalysis(BaseModel):
    """Validated shape of the sentiment JSON requested from Claude."""
    sentiment: str = Field(description="positive, negative, or neutral")
    confidence: float = Field(ge=0, le=1, description="Confidence score 0-1")
    key_phrases: list[str] = Field(description="Phrases that indicate the sentiment")
    summary: str = Field(description="One-sentence summary of the text's tone")

async def analyze_sentiment(text: str) -> SentimentAnalysis:
    """Analyze sentiment with structured, validated output."""
    client = get_claude_client()

    response = await client.messages.create(
        model="claude-haiku-4-5-20250514",
        max_tokens=1024,
        system="""Analyze the sentiment of the provided text.
Return a JSON object with these exact fields:
- sentiment: "positive", "negative", or "neutral"
- confidence: float between 0 and 1
- key_phrases: array of strings
- summary: one sentence string""",
        messages=[{"role": "user", "content": text}],
    )

    raw = response.content[0].text

    # Models often wrap JSON in a markdown fence; unwrap it if present.
    _, fence, after = raw.partition("```json")
    if fence:
        raw = after.partition("```")[0]
    else:
        _, fence, after = raw.partition("```")
        if fence:
            raw = after.partition("```")[0]

    payload = json.loads(raw.strip())
    return SentimentAnalysis(**payload)

Streaming with FastAPI

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json

@app.post("/api/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream Claude's reply to the browser as Server-Sent Events.

    Fix: ``stream.get_final_message()`` must be awaited while the stream
    is still open. The original awaited it after the ``async with`` block
    had exited, i.e. after the SDK closed the underlying HTTP response.
    """
    client = get_claude_client()

    async def generate():
        async with client.messages.stream(
            model=request.model,
            max_tokens=request.max_tokens,
            system=request.system_prompt,
            messages=[{"role": "user", "content": request.message}],
        ) as stream:
            async for text in stream.text_stream:
                yield f"data: {json.dumps({'text': text})}\n\n"

            # Send final usage data -- inside the context manager so the
            # stream is still open when we ask for the final message.
            final = await stream.get_final_message()
            yield f"data: {json.dumps({'done': True, 'usage': {'input': final.usage.input_tokens, 'output': final.usage.output_tokens}})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",  # SSE must not be cached
            "X-Accel-Buffering": "no",    # stop nginx from buffering the stream
        },
    )

Dependency Injection Pattern

from abc import ABC, abstractmethod
from anthropic import AsyncAnthropic

class LLMService(ABC):
    """Abstract interface for LLM services."""

    @abstractmethod
    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the model's text reply for *messages*."""
        pass

    @abstractmethod
    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ) -> dict:
        """Return the model's reply parsed into structured data.

        NOTE(review): ClaudeService below returns a *response_model*
        instance, not a plain dict -- the ``-> dict`` annotation looks
        stale; confirm the intended contract and tighten it.
        """
        pass

class ClaudeService(LLMService):
    """Claude implementation of the LLM service."""

    def __init__(self, client: AsyncAnthropic, model: str = "claude-sonnet-4-5-20250514"):
        self.client = client  # shared AsyncAnthropic instance
        self.model = model    # default model for every call

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return Claude's text reply for *messages*."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            system=system,
            messages=messages,
        )
        # Assumes the first content block is text (no tools configured).
        return response.content[0].text

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Generate a reply and parse it into a *response_model* instance.

        Fix: the original called an undefined ``extract_json`` helper;
        the markdown-fence stripping is now done by ``_strip_fences``.
        Raises json.JSONDecodeError if the reply is not valid JSON.
        """
        response = await self.generate(messages, system)
        data = json.loads(self._strip_fences(response))
        return response_model(**data)

    @staticmethod
    def _strip_fences(raw: str) -> str:
        """Strip an optional markdown code fence around a JSON payload."""
        if "```json" in raw:
            raw = raw.split("```json")[1].split("```")[0]
        elif "```" in raw:
            raw = raw.split("```")[1].split("```")[0]
        return raw.strip()

class MockLLMService(LLMService):
    """Mock for testing -- no API calls needed.

    Fixes: the ``responses`` annotation claimed ``dict[str, str]`` while
    defaulting to None, and ``generate_structured`` returned a bare dict
    where ClaudeService returns a *response_model* instance -- the mock
    now honors the same contract so tests exercise realistic behavior.
    """

    def __init__(self, responses: dict[str, str] | None = None):
        # Map of user-message -> canned reply; unknown messages get a
        # generic fallback. call_log records every call for assertions.
        self.responses = responses or {}
        self.call_log: list[dict] = []

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the canned reply keyed by the last user message."""
        self.call_log.append({"messages": messages, "system": system})
        user_msg = messages[-1]["content"] if messages else ""
        return self.responses.get(user_msg, "Mock response")

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Parse the canned JSON reply into a *response_model* instance."""
        text = await self.generate(messages, system)
        return response_model(**json.loads(text))

Testing with Pytest

import pytest
from unittest.mock import AsyncMock, MagicMock, patch

@pytest.fixture
def mock_claude():
    """Fixture that provides a mocked Claude client.

    spec=AsyncAnthropic makes attribute access fail for names the real
    client does not expose, keeping the mock honest.
    """
    client = AsyncMock(spec=AsyncAnthropic)

    # Configure the mock response to mirror a real Message object:
    # one text content block plus usage, model, and stop_reason fields.
    mock_response = MagicMock()
    mock_response.content = [MagicMock(type="text", text="Test response")]
    mock_response.usage.input_tokens = 10
    mock_response.usage.output_tokens = 5
    mock_response.model = "claude-sonnet-4-5-20250514"
    mock_response.stop_reason = "end_turn"

    # messages.create must be an AsyncMock so `await` works on it.
    client.messages.create = AsyncMock(return_value=mock_response)
    return client

@pytest.mark.asyncio
async def test_chat_endpoint(mock_claude):
    """generate() returns the mocked text and calls the API exactly once."""
    service = ClaudeService(client=mock_claude)
    result = await service.generate(
        messages=[{"role": "user", "content": "Hello"}]
    )
    assert result == "Test response"
    mock_claude.messages.create.assert_called_once()

@pytest.mark.asyncio
async def test_structured_output(mock_claude):
    """generate_structured() parses JSON into a validated model instance."""
    # Override the fixture's canned text with a valid SentimentAnalysis payload.
    mock_claude.messages.create.return_value.content[0].text = json.dumps({
        "sentiment": "positive",
        "confidence": 0.95,
        "key_phrases": ["great", "love it"],
        "summary": "Very positive sentiment."
    })

    service = ClaudeService(client=mock_claude)
    result = await service.generate_structured(
        messages=[{"role": "user", "content": "I love this product!"}],
        response_model=SentimentAnalysis,
    )
    assert result.sentiment == "positive"
    assert result.confidence == 0.95

@pytest.mark.asyncio
async def test_error_handling(mock_claude):
    """API errors raised by the client propagate unchanged to the caller."""
    from anthropic import RateLimitError
    # NOTE(review): RateLimitError's constructor kwargs (message/response/
    # body) vary across anthropic SDK versions -- confirm they match the
    # pinned release.
    mock_claude.messages.create.side_effect = RateLimitError(
        message="Rate limit exceeded",
        response=MagicMock(status_code=429, headers={"retry-after": "30"}),
        body={"error": {"message": "Rate limit exceeded"}},
    )

    service = ClaudeService(client=mock_claude)
    with pytest.raises(RateLimitError):
        await service.generate(messages=[{"role": "user", "content": "test"}])

Concurrent Request Management

import asyncio
from asyncio import Semaphore

class ConcurrentClaude:
    """Manage concurrent Claude API calls with a semaphore."""

    def __init__(self, client: AsyncAnthropic, max_concurrent: int = 10):
        self.client = client
        self.semaphore = Semaphore(max_concurrent)  # caps in-flight requests
        self.total_cost = 0.0                       # running USD total

    async def call(self, messages: list[dict], **kwargs) -> str:
        """Make one API call while holding a semaphore slot.

        Fix: the original silently dropped every keyword argument except
        ``model`` and ``max_tokens`` (so ``system``, ``temperature``, etc.
        never reached the API); all kwargs are now forwarded, with the
        same defaults as before.
        """
        params = {
            "model": "claude-sonnet-4-5-20250514",
            "max_tokens": 4096,
            **kwargs,
        }
        async with self.semaphore:
            response = await self.client.messages.create(
                messages=messages,
                **params,
            )
            # asyncio runs this coroutine on one thread, so the
            # read-modify-write on total_cost is not a data race here.
            self.total_cost += calculate_cost(
                response.model,
                response.usage.input_tokens,
                response.usage.output_tokens,
            )
            return response.content[0].text

    async def batch_call(self, tasks: list[dict]) -> list:
        """Process multiple tasks concurrently within the semaphore limit.

        With return_exceptions=True a failed task contributes its exception
        object to the result list instead of aborting the batch -- hence
        the loose ``list`` annotation (str or BaseException entries); the
        original's ``list[str]`` was inaccurate.
        """
        coros = [
            self.call(task["messages"], **task.get("kwargs", {}))
            for task in tasks
        ]
        return await asyncio.gather(*coros, return_exceptions=True)

# Usage
async def summarize_documents(documents: list[str]) -> list:
    """Example driver: summarize *documents* concurrently and print the cost.

    Fix: the original snippet used ``await`` at module top level, which is
    a SyntaxError in a regular Python module. Run this coroutine with
    ``asyncio.run(summarize_documents(docs))``.
    """
    concurrent = ConcurrentClaude(get_claude_client(), max_concurrent=5)
    results = await concurrent.batch_call([
        {"messages": [{"role": "user", "content": f"Summarize: {doc}"}]}
        for doc in documents
    ])
    print(f"Total cost: ${concurrent.total_cost:.4f}")
    return results

Production Configuration

from pydantic_settings import BaseSettings

class ClaudeSettings(BaseSettings):
    """App configuration read from CLAUDE_*-prefixed environment variables."""
    anthropic_api_key: str  # required: no default, so startup fails if unset
    default_model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096
    max_retries: int = 3
    timeout_seconds: float = 120.0
    max_concurrent_requests: int = 10
    cost_alert_threshold_usd: float = 100.0

    class Config:
        # NOTE(review): `class Config` is the pydantic v1 style; on
        # pydantic-settings v2 the idiom is
        # model_config = SettingsConfigDict(env_prefix="CLAUDE_") -- confirm
        # which major version this project pins.
        env_prefix = "CLAUDE_"

# Import-time load: raises a validation error if required vars are missing.
settings = ClaudeSettings()

These patterns form a solid foundation for any Python application that integrates the Claude API. The key principles: use async everywhere, validate structured outputs, inject dependencies for testability, and track costs from day one.

Share
C

Written by

CallSphere Team

Expert insights on AI voice agents and customer communication automation.

Try CallSphere AI Voice Agents

See how AI voice agents work for your industry. Live demo available -- no signup required.