Skip to content
Back to Blog
Agentic AI · 6 min read

Claude API in Python: Production Patterns and Best Practices

Production-grade Python patterns for the Claude API. Covers async patterns, connection management, structured outputs, dependency injection, testing with pytest, and deployment strategies for Python-based AI applications.

Setting Up the Python SDK

The Anthropic Python SDK supports both synchronous and asynchronous usage, with built-in retry logic, streaming, and comprehensive type hints.

pip install anthropic

Client Initialization

import os

from anthropic import Anthropic, AsyncAnthropic

# Synchronous client
client = Anthropic()  # Reads ANTHROPIC_API_KEY from environment

# Async client (for FastAPI, aiohttp, etc.)
async_client = AsyncAnthropic()

# Explicit configuration. Fix: `os` was used below without being imported,
# so this snippet raised NameError when run standalone. os.environ[...] also
# fails fast with KeyError if the key is unset, instead of sending
# unauthenticated requests.
client = Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],
    max_retries=3,
    timeout=120.0,
)

Singleton Pattern for Connection Reuse

from functools import lru_cache
from anthropic import AsyncAnthropic

@lru_cache(maxsize=1)
def get_claude_client() -> AsyncAnthropic:
    """Process-wide AsyncAnthropic instance.

    Cached with ``lru_cache(maxsize=1)`` so every caller shares one client
    -- and therefore one HTTP connection pool -- instead of opening fresh
    connections per request.
    """
    shared_client = AsyncAnthropic(
        max_retries=3,
        timeout=120.0,
    )
    return shared_client

Async Patterns with FastAPI

from fastapi import FastAPI, Depends
from pydantic import BaseModel
from anthropic import AsyncAnthropic

app = FastAPI()

class ChatRequest(BaseModel):
    """Request body for the chat endpoints."""

    message: str  # user text, forwarded as the single "user" turn
    system_prompt: str = "You are a helpful assistant."
    model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096  # cap on tokens Claude may generate

class ChatResponse(BaseModel):
    """Response body: generated text plus usage and cost accounting."""

    text: str
    input_tokens: int   # from response.usage
    output_tokens: int  # from response.usage
    model: str          # model the API actually used
    cost_usd: float     # estimate from calculate_cost()

def get_client() -> AsyncAnthropic:
    """FastAPI dependency resolving to the shared singleton client."""
    shared = get_claude_client()
    return shared

@app.post("/api/chat", response_model=ChatResponse)
async def chat(
    request: ChatRequest,
    client: AsyncAnthropic = Depends(get_client),
):
    """Single-turn chat endpoint.

    Sends the user's message to Claude and returns the generated text
    together with token usage and an estimated USD cost.
    """
    response = await client.messages.create(
        model=request.model,
        max_tokens=request.max_tokens,
        system=request.system_prompt,
        messages=[{"role": "user", "content": request.message}],
    )

    # Fix: join all text blocks instead of assuming content[0] is text.
    # The first content block is not guaranteed to be a text block, in
    # which case `response.content[0].text` would raise AttributeError.
    text = "".join(
        block.text for block in response.content if block.type == "text"
    )
    cost = calculate_cost(
        response.model,
        response.usage.input_tokens,
        response.usage.output_tokens,
    )

    return ChatResponse(
        text=text,
        input_tokens=response.usage.input_tokens,
        output_tokens=response.usage.output_tokens,
        model=response.model,
        cost_usd=cost,
    )

def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Estimate the USD cost of one API call.

    Rates are (input, output) dollars per million tokens; unknown models
    fall back to Sonnet pricing.
    """
    pricing = {
        "claude-sonnet-4-5-20250514": (3.0, 15.0),
        "claude-haiku-4-5-20250514": (1.0, 5.0),
    }
    in_rate, out_rate = pricing.get(model, (3.0, 15.0))
    total = input_tokens * in_rate + output_tokens * out_rate
    return total / 1_000_000

Structured Output with Pydantic

Use Pydantic models to validate Claude's responses:

from pydantic import BaseModel, Field
from typing import Optional
import json

class SentimentAnalysis(BaseModel):
    """Validated shape for Claude's sentiment-analysis JSON output."""

    sentiment: str = Field(description="positive, negative, or neutral")
    confidence: float = Field(ge=0, le=1, description="Confidence score 0-1")
    key_phrases: list[str] = Field(description="Phrases that indicate the sentiment")
    summary: str = Field(description="One-sentence summary of the text's tone")

async def analyze_sentiment(text: str) -> SentimentAnalysis:
    """Classify the sentiment of *text* into a validated model.

    Prompts Claude for a JSON object, unwraps any markdown code fence
    around the reply, then parses and validates it with Pydantic.
    """
    client = get_claude_client()

    response = await client.messages.create(
        model="claude-haiku-4-5-20250514",
        max_tokens=1024,
        system="""Analyze the sentiment of the provided text.
Return a JSON object with these exact fields:
- sentiment: "positive", "negative", or "neutral"
- confidence: float between 0 and 1
- key_phrases: array of strings
- summary: one sentence string""",
        messages=[{"role": "user", "content": text}],
    )

    payload = response.content[0].text

    # Claude sometimes wraps JSON in a markdown fence; unwrap it.
    if "```json" in payload:
        payload = payload.partition("```json")[2].partition("```")[0]
    elif "```" in payload:
        payload = payload.partition("```")[2].partition("```")[0]

    parsed = json.loads(payload.strip())
    return SentimentAnalysis(**parsed)

Streaming with FastAPI

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json

@app.post("/api/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream a chat completion as server-sent events (SSE).

    Emits one ``data:`` event per text delta, then a final event
    carrying token-usage totals.
    """
    client = get_claude_client()

    async def generate():
        async with client.messages.stream(
            model=request.model,
            max_tokens=request.max_tokens,
            system=request.system_prompt,
            messages=[{"role": "user", "content": request.message}],
        ) as stream:
            async for text in stream.text_stream:
                yield f"data: {json.dumps({'text': text})}\n\n"

            # Fix: fetch the accumulated message while the stream is still
            # open. The original awaited get_final_message() after the
            # `async with` block had already exited and closed the
            # underlying response.
            final = await stream.get_final_message()

        # Send final usage data
        yield f"data: {json.dumps({'done': True, 'usage': {'input': final.usage.input_tokens, 'output': final.usage.output_tokens}})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",  # stop nginx from buffering SSE events
        },
    )

Dependency Injection Pattern

from abc import ABC, abstractmethod
from anthropic import AsyncAnthropic

class LLMService(ABC):
    """Abstract interface for LLM services."""

    @abstractmethod
    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the model's text reply for *messages*."""
        pass

    @abstractmethod
    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ) -> dict:
        """Return the model's reply parsed via *response_model*.

        NOTE(review): the concrete ClaudeService below returns a
        *response_model* instance, not a plain dict -- this ``-> dict``
        annotation looks inaccurate; confirm before relying on it.
        """
        pass

class ClaudeService(LLMService):
    """Claude implementation of the LLM service."""

    def __init__(self, client: AsyncAnthropic, model: str = "claude-sonnet-4-5-20250514"):
        self.client = client  # shared AsyncAnthropic instance
        self.model = model    # default model for all calls

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return Claude's text reply for *messages*."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            system=system,
            messages=messages,
        )
        return response.content[0].text

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Return Claude's reply parsed and validated as *response_model*.

        Raises ``json.JSONDecodeError`` on non-JSON replies, plus whatever
        *response_model* raises for invalid fields.
        """
        response = await self.generate(messages, system)
        # Fix: the original called an undefined ``extract_json`` helper,
        # which raised NameError at runtime. Strip markdown fences with a
        # local helper instead.
        data = json.loads(self._strip_code_fence(response))
        return response_model(**data)

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """Return *raw* with any surrounding markdown code fence removed."""
        if "```json" in raw:
            raw = raw.split("```json")[1].split("```")[0]
        elif "```" in raw:
            raw = raw.split("```")[1].split("```")[0]
        return raw.strip()

class MockLLMService(LLMService):
    """Mock for testing -- no API calls needed."""

    def __init__(self, responses: dict[str, str] | None = None):
        # Fix: parameter was annotated ``dict[str, str]`` while defaulting
        # to None; ``| None`` makes the annotation honest.
        self.responses = responses or {}  # user-message -> canned reply
        self.call_log: list[dict] = []    # records every call for assertions

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the canned reply for the last user message."""
        self.call_log.append({"messages": messages, "system": system})
        user_msg = messages[-1]["content"] if messages else ""
        return self.responses.get(user_msg, "Mock response")

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Parse the canned reply as JSON and validate via *response_model*.

        Fix: the original returned the raw dict, while ClaudeService returns
        a *response_model* instance -- a mock that diverges from the real
        implementation makes tests pass against behavior production code
        doesn't have. Construct the model here too.
        """
        text = await self.generate(messages, system)
        data = json.loads(text)
        return response_model(**data)

Testing with Pytest

import pytest
from unittest.mock import AsyncMock, MagicMock, patch

@pytest.fixture
def mock_claude():
    """Fixture providing an AsyncAnthropic stand-in with a canned reply."""
    fake_client = AsyncMock(spec=AsyncAnthropic)

    # Canned reply mirroring the shape of a real Messages API response.
    canned = MagicMock()
    canned.content = [MagicMock(type="text", text="Test response")]
    canned.usage.input_tokens = 10
    canned.usage.output_tokens = 5
    canned.model = "claude-sonnet-4-5-20250514"
    canned.stop_reason = "end_turn"

    fake_client.messages.create = AsyncMock(return_value=canned)
    return fake_client

@pytest.mark.asyncio
async def test_chat_endpoint(mock_claude):
    """generate() returns the mocked text and hits the API exactly once."""
    service = ClaudeService(client=mock_claude)

    reply = await service.generate(messages=[{"role": "user", "content": "Hello"}])

    assert reply == "Test response"
    mock_claude.messages.create.assert_called_once()

@pytest.mark.asyncio
async def test_structured_output(mock_claude):
    """generate_structured() parses JSON into a validated model instance."""
    payload = {
        "sentiment": "positive",
        "confidence": 0.95,
        "key_phrases": ["great", "love it"],
        "summary": "Very positive sentiment."
    }
    mock_claude.messages.create.return_value.content[0].text = json.dumps(payload)

    service = ClaudeService(client=mock_claude)
    result = await service.generate_structured(
        messages=[{"role": "user", "content": "I love this product!"}],
        response_model=SentimentAnalysis,
    )

    assert result.sentiment == "positive"
    assert result.confidence == 0.95

@pytest.mark.asyncio
async def test_error_handling(mock_claude):
    """API errors should propagate unchanged from ClaudeService.generate()."""
    from anthropic import RateLimitError
    # NOTE(review): RateLimitError's constructor arguments (message /
    # response / body) are SDK-version dependent -- confirm against the
    # pinned anthropic version.
    mock_claude.messages.create.side_effect = RateLimitError(
        message="Rate limit exceeded",
        response=MagicMock(status_code=429, headers={"retry-after": "30"}),
        body={"error": {"message": "Rate limit exceeded"}},
    )

    service = ClaudeService(client=mock_claude)
    # The service adds no retry/translation layer, so the error surfaces as-is.
    with pytest.raises(RateLimitError):
        await service.generate(messages=[{"role": "user", "content": "test"}])

Concurrent Request Management

import asyncio
from asyncio import Semaphore

class ConcurrentClaude:
    """Manage concurrent Claude API calls with a semaphore."""

    def __init__(self, client: AsyncAnthropic, max_concurrent: int = 10):
        self.client = client
        # Caps in-flight API calls so a large batch cannot blow through
        # rate limits or exhaust connections.
        self.semaphore = Semaphore(max_concurrent)
        self.total_cost = 0.0  # running USD total across all calls

    async def call(self, messages: list[dict], **kwargs) -> str:
        """Send one request, honoring the concurrency limit.

        All keyword arguments are forwarded to ``messages.create`` (with
        defaults for ``model`` and ``max_tokens``).
        """
        # Fix: the original forwarded only model/max_tokens and silently
        # dropped every other kwarg, so callers could not set ``system``,
        # ``temperature``, etc.
        params = {
            "model": "claude-sonnet-4-5-20250514",
            "max_tokens": 4096,
            **kwargs,
        }
        async with self.semaphore:
            response = await self.client.messages.create(
                messages=messages,
                **params,
            )
            # Safe without a lock: no await between read and write, and
            # asyncio runs on a single thread.
            self.total_cost += calculate_cost(
                response.model,
                response.usage.input_tokens,
                response.usage.output_tokens,
            )
            return response.content[0].text

    async def batch_call(self, tasks: list[dict]) -> list:
        """Process multiple tasks concurrently within the semaphore limit.

        Because of ``return_exceptions=True`` the result list may contain
        Exception instances alongside strings (the original ``list[str]``
        annotation was inaccurate); callers should check each entry.
        """
        coros = [
            self.call(task["messages"], **task.get("kwargs", {}))
            for task in tasks
        ]
        return await asyncio.gather(*coros, return_exceptions=True)

# Usage (illustrative -- assumes an enclosing async context and an
# iterable ``documents``; top-level ``await`` only works inside async
# code or an async REPL)
concurrent = ConcurrentClaude(get_claude_client(), max_concurrent=5)
results = await concurrent.batch_call([
    {"messages": [{"role": "user", "content": f"Summarize: {doc}"}]}
    for doc in documents
])
# NOTE: results may contain Exception entries (gather return_exceptions=True)
print(f"Total cost: ${concurrent.total_cost:.4f}")

Production Configuration

from pydantic_settings import BaseSettings

class ClaudeSettings(BaseSettings):
    """Application settings loaded from CLAUDE_-prefixed environment variables."""

    anthropic_api_key: str  # required: no default, so loading fails fast if unset
    default_model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096
    max_retries: int = 3
    timeout_seconds: float = 120.0
    max_concurrent_requests: int = 10
    cost_alert_threshold_usd: float = 100.0

    class Config:
        # NOTE(review): inner ``Config`` is the pydantic v1 style; with
        # pydantic-settings v2 the preferred spelling is
        # ``model_config = SettingsConfigDict(env_prefix="CLAUDE_")`` --
        # confirm the pinned pydantic version before changing.
        env_prefix = "CLAUDE_"

# Module-level singleton; constructed at import time, so missing required
# settings raise immediately on startup.
settings = ClaudeSettings()

These patterns form a solid foundation for any Python application that integrates the Claude API. The key principles: use async everywhere, validate structured outputs, inject dependencies for testability, and track costs from day one.

Share this article
N

NYC News

Expert insights on AI voice agents and customer communication automation.

Try CallSphere AI Voice Agents

See how AI voice agents work for your industry. Live demo available -- no signup required.