Skip to content
Agentic AI
Agentic AI · 6 min read · 8 views

Claude API in Python: Production Patterns and Best Practices

Production-grade Python patterns for the Claude API. Covers async patterns, connection management, structured outputs, dependency injection, testing with pytest, and deployment strategies for Python-based AI applications.

Setting Up the Python SDK

The Anthropic Python SDK supports both synchronous and asynchronous usage, with built-in retry logic, streaming, and comprehensive type hints.

pip install anthropic

Client Initialization

import os

from anthropic import Anthropic, AsyncAnthropic

# Synchronous client
client = Anthropic()  # Reads ANTHROPIC_API_KEY from environment

# Async client (for FastAPI, aiohttp, etc.)
async_client = AsyncAnthropic()

# Explicit configuration.
# Fix: the original snippet used os.environ without importing os.
client = Anthropic(
    api_key=os.environ["ANTHROPIC_API_KEY"],  # KeyError here = fail fast on missing key
    max_retries=3,    # SDK-level automatic retries
    timeout=120.0,    # seconds; generous for long generations
)

Singleton Pattern for Connection Reuse

from functools import lru_cache
from anthropic import AsyncAnthropic

@lru_cache(maxsize=1)
def get_claude_client() -> AsyncAnthropic:
    """Return the process-wide async Claude client.

    ``lru_cache(maxsize=1)`` turns this factory into a lazy singleton, so
    every caller shares one client and its HTTP connection pool.
    """
    shared_client = AsyncAnthropic(
        timeout=120.0,
        max_retries=3,
    )
    return shared_client

Async Patterns with FastAPI

from fastapi import FastAPI, Depends
from pydantic import BaseModel
from anthropic import AsyncAnthropic

app = FastAPI()  # module-level ASGI application instance

class ChatRequest(BaseModel):
    """Incoming chat payload; defaults cover the common single-turn case."""
    message: str
    system_prompt: str = "You are a helpful assistant."
    model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096

class ChatResponse(BaseModel):
    """Chat reply plus token usage and the computed request cost in USD."""
    text: str
    input_tokens: int
    output_tokens: int
    model: str
    cost_usd: float

def get_client() -> AsyncAnthropic:
    """FastAPI dependency wrapping the singleton client factory."""
    shared = get_claude_client()
    return shared

@app.post("/api/chat", response_model=ChatResponse)
async def chat(
    request: ChatRequest,
    client: AsyncAnthropic = Depends(get_client),
):
    """Single-turn chat: forward the prompt to Claude, report usage and cost."""
    result = await client.messages.create(
        model=request.model,
        max_tokens=request.max_tokens,
        system=request.system_prompt,
        messages=[{"role": "user", "content": request.message}],
    )

    usage = result.usage
    # With no tools configured, the first content block is the text reply.
    reply_text = result.content[0].text
    request_cost = calculate_cost(result.model, usage.input_tokens, usage.output_tokens)

    return ChatResponse(
        text=reply_text,
        input_tokens=usage.input_tokens,
        output_tokens=usage.output_tokens,
        model=result.model,
        cost_usd=request_cost,
    )

def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    """Return the USD cost of a single request.

    Rates are (input, output) dollars per million tokens; unrecognized
    model IDs fall back to Sonnet pricing.
    """
    per_million = {
        "claude-sonnet-4-5-20250514": (3.0, 15.0),
        "claude-haiku-4-5-20250514": (1.0, 5.0),
    }
    in_rate, out_rate = per_million.get(model, (3.0, 15.0))
    total = input_tokens * in_rate + output_tokens * out_rate
    return total / 1_000_000

Structured Output with Pydantic

Use Pydantic models to validate Claude's responses:

flowchart TD
    START["Claude API in Python: Production Patterns and Bes…"] --> A
    A["Setting Up the Python SDK"]
    A --> B
    B["Async Patterns with FastAPI"]
    B --> C
    C["Structured Output with Pydantic"]
    C --> D
    D["Streaming with FastAPI"]
    D --> E
    E["Dependency Injection Pattern"]
    E --> F
    F["Testing with Pytest"]
    F --> G
    G["Concurrent Request Management"]
    G --> H
    H["Production Configuration"]
    H --> DONE["Key Takeaways"]
    style START fill:#4f46e5,stroke:#4338ca,color:#fff
    style DONE fill:#059669,stroke:#047857,color:#fff
from pydantic import BaseModel, Field
from typing import Optional
import json

class SentimentAnalysis(BaseModel):
    """Validated shape of the sentiment JSON requested from Claude."""
    sentiment: str = Field(description="positive, negative, or neutral")
    confidence: float = Field(ge=0, le=1, description="Confidence score 0-1")
    key_phrases: list[str] = Field(description="Phrases that indicate the sentiment")
    summary: str = Field(description="One-sentence summary of the text's tone")

async def analyze_sentiment(text: str) -> SentimentAnalysis:
    """Analyze sentiment with structured, validated output."""
    client = get_claude_client()

    response = await client.messages.create(
        model="claude-haiku-4-5-20250514",
        max_tokens=1024,
        system="""Analyze the sentiment of the provided text.
Return a JSON object with these exact fields:
- sentiment: "positive", "negative", or "neutral"
- confidence: float between 0 and 1
- key_phrases: array of strings
- summary: one sentence string""",
        messages=[{"role": "user", "content": text}],
    )

    raw = response.content[0].text

    # Models often wrap JSON in a markdown fence; unwrap it if present.
    _, fence, after = raw.partition("```json")
    if fence:
        raw = after.partition("```")[0]
    else:
        _, fence, after = raw.partition("```")
        if fence:
            raw = after.partition("```")[0]

    payload = json.loads(raw.strip())
    return SentimentAnalysis(**payload)

Streaming with FastAPI

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
import json

@app.post("/api/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream Claude's reply to the browser as Server-Sent Events.

    Fix: ``stream.get_final_message()`` must be awaited while the stream
    is still open. The original awaited it after the ``async with`` block
    had exited, i.e. after the SDK closed the underlying HTTP response.
    """
    client = get_claude_client()

    async def generate():
        async with client.messages.stream(
            model=request.model,
            max_tokens=request.max_tokens,
            system=request.system_prompt,
            messages=[{"role": "user", "content": request.message}],
        ) as stream:
            async for text in stream.text_stream:
                yield f"data: {json.dumps({'text': text})}\n\n"

            # Send final usage data -- inside the context manager so the
            # stream is still open when we ask for the final message.
            final = await stream.get_final_message()
            yield f"data: {json.dumps({'done': True, 'usage': {'input': final.usage.input_tokens, 'output': final.usage.output_tokens}})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",  # SSE must not be cached
            "X-Accel-Buffering": "no",    # stop nginx from buffering the stream
        },
    )

Dependency Injection Pattern

from abc import ABC, abstractmethod
from anthropic import AsyncAnthropic

class LLMService(ABC):
    """Abstract interface for LLM services."""

    @abstractmethod
    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the model's text reply for *messages*."""
        pass

    @abstractmethod
    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ) -> dict:
        """Return the model's reply parsed into structured data.

        NOTE(review): ClaudeService below returns a *response_model*
        instance, not a plain dict -- the ``-> dict`` annotation looks
        stale; confirm the intended contract and tighten it.
        """
        pass

class ClaudeService(LLMService):
    """Claude implementation of the LLM service."""

    def __init__(self, client: AsyncAnthropic, model: str = "claude-sonnet-4-5-20250514"):
        self.client = client  # shared AsyncAnthropic instance
        self.model = model    # default model for every call

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return Claude's text reply for *messages*."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            system=system,
            messages=messages,
        )
        # Assumes the first content block is text (no tools configured).
        return response.content[0].text

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Generate a reply and parse it into a *response_model* instance.

        Fix: the original called an undefined ``extract_json`` helper;
        the markdown-fence stripping is now done by ``_strip_fences``.
        Raises json.JSONDecodeError if the reply is not valid JSON.
        """
        response = await self.generate(messages, system)
        data = json.loads(self._strip_fences(response))
        return response_model(**data)

    @staticmethod
    def _strip_fences(raw: str) -> str:
        """Strip an optional markdown code fence around a JSON payload."""
        if "```json" in raw:
            raw = raw.split("```json")[1].split("```")[0]
        elif "```" in raw:
            raw = raw.split("```")[1].split("```")[0]
        return raw.strip()

class MockLLMService(LLMService):
    """Mock for testing -- no API calls needed.

    Fixes: the ``responses`` annotation claimed ``dict[str, str]`` while
    defaulting to None, and ``generate_structured`` returned a bare dict
    where ClaudeService returns a *response_model* instance -- the mock
    now honors the same contract so tests exercise realistic behavior.
    """

    def __init__(self, responses: dict[str, str] | None = None):
        # Map of user-message -> canned reply; unknown messages get a
        # generic fallback. call_log records every call for assertions.
        self.responses = responses or {}
        self.call_log: list[dict] = []

    async def generate(
        self, messages: list[dict], system: str = "", max_tokens: int = 4096,
    ) -> str:
        """Return the canned reply keyed by the last user message."""
        self.call_log.append({"messages": messages, "system": system})
        user_msg = messages[-1]["content"] if messages else ""
        return self.responses.get(user_msg, "Mock response")

    async def generate_structured(
        self, messages: list[dict], response_model: type, system: str = "",
    ):
        """Parse the canned JSON reply into a *response_model* instance."""
        text = await self.generate(messages, system)
        return response_model(**json.loads(text))

Testing with Pytest

import pytest
from unittest.mock import AsyncMock, MagicMock, patch

@pytest.fixture
def mock_claude():
    """Fixture that provides a mocked Claude client.

    spec=AsyncAnthropic makes attribute access fail for names the real
    client does not expose, keeping the mock honest.
    """
    client = AsyncMock(spec=AsyncAnthropic)

    # Configure the mock response to mirror a real Message object:
    # one text content block plus usage, model, and stop_reason fields.
    mock_response = MagicMock()
    mock_response.content = [MagicMock(type="text", text="Test response")]
    mock_response.usage.input_tokens = 10
    mock_response.usage.output_tokens = 5
    mock_response.model = "claude-sonnet-4-5-20250514"
    mock_response.stop_reason = "end_turn"

    # messages.create must be an AsyncMock so `await` works on it.
    client.messages.create = AsyncMock(return_value=mock_response)
    return client

@pytest.mark.asyncio
async def test_chat_endpoint(mock_claude):
    """generate() returns the mocked text and calls the API exactly once."""
    service = ClaudeService(client=mock_claude)
    result = await service.generate(
        messages=[{"role": "user", "content": "Hello"}]
    )
    assert result == "Test response"
    mock_claude.messages.create.assert_called_once()

@pytest.mark.asyncio
async def test_structured_output(mock_claude):
    """generate_structured() parses JSON into a validated model instance."""
    # Override the fixture's canned text with a valid SentimentAnalysis payload.
    mock_claude.messages.create.return_value.content[0].text = json.dumps({
        "sentiment": "positive",
        "confidence": 0.95,
        "key_phrases": ["great", "love it"],
        "summary": "Very positive sentiment."
    })

    service = ClaudeService(client=mock_claude)
    result = await service.generate_structured(
        messages=[{"role": "user", "content": "I love this product!"}],
        response_model=SentimentAnalysis,
    )
    assert result.sentiment == "positive"
    assert result.confidence == 0.95

@pytest.mark.asyncio
async def test_error_handling(mock_claude):
    """API errors raised by the client propagate unchanged to the caller."""
    from anthropic import RateLimitError
    # NOTE(review): RateLimitError's constructor kwargs (message/response/
    # body) vary across anthropic SDK versions -- confirm they match the
    # pinned release.
    mock_claude.messages.create.side_effect = RateLimitError(
        message="Rate limit exceeded",
        response=MagicMock(status_code=429, headers={"retry-after": "30"}),
        body={"error": {"message": "Rate limit exceeded"}},
    )

    service = ClaudeService(client=mock_claude)
    with pytest.raises(RateLimitError):
        await service.generate(messages=[{"role": "user", "content": "test"}])

Concurrent Request Management

import asyncio
from asyncio import Semaphore

class ConcurrentClaude:
    """Manage concurrent Claude API calls with a semaphore."""

    def __init__(self, client: AsyncAnthropic, max_concurrent: int = 10):
        self.client = client
        self.semaphore = Semaphore(max_concurrent)  # caps in-flight requests
        self.total_cost = 0.0                       # running USD total

    async def call(self, messages: list[dict], **kwargs) -> str:
        """Make one API call while holding a semaphore slot.

        Fix: the original silently dropped every keyword argument except
        ``model`` and ``max_tokens`` (so ``system``, ``temperature``, etc.
        never reached the API); all kwargs are now forwarded, with the
        same defaults as before.
        """
        params = {
            "model": "claude-sonnet-4-5-20250514",
            "max_tokens": 4096,
            **kwargs,
        }
        async with self.semaphore:
            response = await self.client.messages.create(
                messages=messages,
                **params,
            )
            # asyncio runs this coroutine on one thread, so the
            # read-modify-write on total_cost is not a data race here.
            self.total_cost += calculate_cost(
                response.model,
                response.usage.input_tokens,
                response.usage.output_tokens,
            )
            return response.content[0].text

    async def batch_call(self, tasks: list[dict]) -> list:
        """Process multiple tasks concurrently within the semaphore limit.

        With return_exceptions=True a failed task contributes its exception
        object to the result list instead of aborting the batch -- hence
        the loose ``list`` annotation (str or BaseException entries); the
        original's ``list[str]`` was inaccurate.
        """
        coros = [
            self.call(task["messages"], **task.get("kwargs", {}))
            for task in tasks
        ]
        return await asyncio.gather(*coros, return_exceptions=True)

# Usage
async def summarize_documents(documents: list[str]) -> list:
    """Example driver: summarize *documents* concurrently and print the cost.

    Fix: the original snippet used ``await`` at module top level, which is
    a SyntaxError in a regular Python module. Run this coroutine with
    ``asyncio.run(summarize_documents(docs))``.
    """
    concurrent = ConcurrentClaude(get_claude_client(), max_concurrent=5)
    results = await concurrent.batch_call([
        {"messages": [{"role": "user", "content": f"Summarize: {doc}"}]}
        for doc in documents
    ])
    print(f"Total cost: ${concurrent.total_cost:.4f}")
    return results

Production Configuration

from pydantic_settings import BaseSettings

class ClaudeSettings(BaseSettings):
    """App configuration read from CLAUDE_*-prefixed environment variables."""
    anthropic_api_key: str  # required: no default, so startup fails if unset
    default_model: str = "claude-sonnet-4-5-20250514"
    max_tokens: int = 4096
    max_retries: int = 3
    timeout_seconds: float = 120.0
    max_concurrent_requests: int = 10
    cost_alert_threshold_usd: float = 100.0

    class Config:
        # NOTE(review): `class Config` is the pydantic v1 style; on
        # pydantic-settings v2 the idiom is
        # model_config = SettingsConfigDict(env_prefix="CLAUDE_") -- confirm
        # which major version this project pins.
        env_prefix = "CLAUDE_"

# Import-time load: raises a validation error if required vars are missing.
settings = ClaudeSettings()

These patterns form a solid foundation for any Python application that integrates the Claude API. The key principles: use async everywhere, validate structured outputs, inject dependencies for testability, and track costs from day one.

Share
C

Written by

CallSphere Team

Expert insights on AI voice agents and customer communication automation.

Try CallSphere AI Voice Agents

See how AI voice agents work for your industry. Live demo available -- no signup required.