Claude's Vision Capabilities

Claude can process images alongside text, enabling a wide range of applications: document OCR, chart analysis, UI screenshot review, product image classification, medical image triage, and more. Vision is available across all Claude models (Opus, Sonnet, Haiku) through the same API.

Unlike dedicated OCR tools or computer vision models that only extract specific features, Claude understands images holistically. It can read text, interpret charts, describe visual layouts, identify objects, and reason about relationships between visual elements -- all in a single API call.

Sending Images to Claude

Base64-Encoded Images

import base64
from pathlib import Path
from anthropic import Anthropic

# Shared synchronous client used by the synchronous examples in this file.
# NOTE(review): constructed with no arguments — presumably credentials come
# from the environment; confirm against the SDK documentation.
client = Anthropic()

def analyze_image(image_path: str, prompt: str) -> str:
    """Ask Claude about a single image stored on disk.

    Args:
        image_path: Location of the image file. Its extension selects the
            media type declared to the API.
        prompt: Instruction or question sent together with the image.

    Returns:
        The text of the first content block in the reply.
    """
    source_file = Path(image_path)
    encoded = base64.b64encode(source_file.read_bytes()).decode("utf-8")

    # Pick the media type from the extension; anything unrecognised is
    # declared as JPEG.
    extension = source_file.suffix.lower()
    if extension == ".png":
        media_type = "image/png"
    elif extension == ".gif":
        media_type = "image/gif"
    elif extension == ".webp":
        media_type = "image/webp"
    else:
        media_type = "image/jpeg"

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": media_type,
            "data": encoded,
        },
    }
    text_block = {"type": "text", "text": prompt}

    reply = client.messages.create(
        model="claude-sonnet-4-5-20250514",
        max_tokens=4096,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )
    return reply.content[0].text

URL-Based Images

# Reference the image by URL ("type": "url") instead of embedding base64 data
# in the request body.
response = client.messages.create(
    model="claude-sonnet-4-5-20250514",
    max_tokens=4096,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "url",
                    "url": "https://example.com/chart.png",
                },
            },
            {
                "type": "text",
                "text": "Analyze this chart and summarize the key trends.",
            },
        ],
    }],
)

Multi-Image Analysis

Claude can process multiple images in a single request, enabling comparison and cross-reference tasks:

flowchart TD
    START["Claude Vision API: Analyzing Images and Documents…"] --> A
    A["Claude's Vision Capabilities"]
    A --> B
    B["Sending Images to Claude"]
    B --> C
    C["Multi-Image Analysis"]
    C --> D
    D["Document Processing at Scale"]
    D --> E
    E["Image Token Costs"]
    E --> F
    F["Batch Image Processing"]
    F --> G
    G["Prompt Engineering for Vision"]
    G --> H
    H["Limitations to Know"]
    H --> DONE["Key Takeaways"]
    style START fill:#4f46e5,stroke:#4338ca,color:#fff
    style DONE fill:#059669,stroke:#047857,color:#fff

def compare_images(image_paths: list[str], prompt: str) -> str:
    """Send multiple images to Claude for comparison.

    Each image is preceded by an "Image N:" text label (numbered from 1), and
    the prompt is appended after the last image.

    Args:
        image_paths: Paths of local image files, in the order they are labelled.
        prompt: Instruction or question about the images.

    Returns:
        The text of the first content block in the reply.
    """
    # Declare each file's actual format instead of labelling everything as
    # PNG. Unrecognised extensions keep the previous "image/png" label.
    media_types = {
        ".jpg": "image/jpeg", ".jpeg": "image/jpeg",
        ".png": "image/png", ".gif": "image/gif",
        ".webp": "image/webp",
    }

    content = []
    for index, path in enumerate(image_paths, start=1):
        image_file = Path(path)
        media_type = media_types.get(image_file.suffix.lower(), "image/png")
        image_data = base64.standard_b64encode(image_file.read_bytes()).decode()
        content.append({
            "type": "text",
            "text": f"Image {index}:",
        })
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": image_data,
            },
        })

    content.append({"type": "text", "text": prompt})

    response = client.messages.create(
        model="claude-sonnet-4-5-20250514",
        max_tokens=4096,
        messages=[{"role": "user", "content": content}],
    )
    return response.content[0].text

# Example: Compare two versions of a UI design
# Both paths are relative, so the files are read from the current working
# directory; the returned comparison text is kept in `result`.
result = compare_images(
    ["design_v1.png", "design_v2.png"],
    "Compare these two UI designs. What changed? Which is better for usability?"
)

Document Processing at Scale

PDF Processing Pipeline

import fitz  # PyMuPDF
import asyncio
from anthropic import AsyncAnthropic

# Asynchronous client awaited by the per-page PDF processing coroutine below.
async_client = AsyncAnthropic()

def pdf_to_images(pdf_path: str, dpi: int = 200) -> list[str]:
    """Convert PDF pages to base64 images.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution applied to every page.

    Returns:
        One base64-encoded PNG string per page, in page order.
    """
    # Zoom factor relative to the 72 DPI baseline; it is the same for every
    # page, so the matrix is built once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    # The context manager closes the document even when rendering a page
    # raises, which the explicit close() at the end did not guarantee.
    with fitz.open(pdf_path) as doc:
        return [
            base64.standard_b64encode(
                page.get_pixmap(matrix=mat).tobytes("png")
            ).decode()
            for page in doc
        ]

async def process_pdf_page(page_image: str, page_num: int, prompt: str) -> dict:
    """Send one rendered PDF page to Claude and collect the reply.

    Args:
        page_image: Base64-encoded image data, declared to the API as PNG.
        page_num: Page identifier echoed back in the result.
        prompt: Instruction sent together with the page image.

    Returns:
        A dict with "page" (the given page_num), "content" (text of the first
        reply block) and "tokens" (input plus output tokens for the call).
    """
    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": page_image,
        },
    }
    text_block = {"type": "text", "text": prompt}

    reply = await async_client.messages.create(
        model="claude-sonnet-4-5-20250514",
        max_tokens=4096,
        messages=[{"role": "user", "content": [image_block, text_block]}],
    )

    page_text = reply.content[0].text
    tokens_used = reply.usage.input_tokens + reply.usage.output_tokens
    return {"page": page_num, "content": page_text, "tokens": tokens_used}

async def process_pdf(pdf_path: str, prompt: str, max_concurrent: int = 5) -> list[dict]:
    """Run the same prompt over every page of a PDF with bounded concurrency.

    Args:
        pdf_path: Path of the PDF to render and analyse.
        prompt: Instruction applied to each page.
        max_concurrent: Upper bound on page requests in flight at once.

    Returns:
        The per-page result dicts, ordered by their "page" value
        (pages are numbered from 0).
    """
    page_images = pdf_to_images(pdf_path)
    limiter = asyncio.Semaphore(max_concurrent)

    async def run_page(index, encoded_page):
        # Hold a semaphore slot for the duration of the request.
        async with limiter:
            return await process_pdf_page(encoded_page, index, prompt)

    def page_of(entry):
        return entry["page"]

    page_results = await asyncio.gather(
        *(run_page(index, encoded) for index, encoded in enumerate(page_images))
    )
    return sorted(page_results, key=page_of)

Invoice Processing Example

# Prompt asking for the invoice fields as a single JSON object with a fixed
# set of keys; fields that are not visible or unclear are requested as null.
INVOICE_PROMPT = """Extract all information from this invoice and return it as JSON:
{
  "vendor_name": "...",
  "vendor_address": "...",
  "invoice_number": "...",
  "invoice_date": "YYYY-MM-DD",
  "due_date": "YYYY-MM-DD",
  "line_items": [
    {"description": "...", "quantity": N, "unit_price": N.NN, "total": N.NN}
  ],
  "subtotal": N.NN,
  "tax": N.NN,
  "total": N.NN,
  "currency": "USD",
  "payment_terms": "..."
}

If any field is not visible or unclear, set it to null."""

async def process_invoice(image_path: str) -> dict:
    """Extract structured invoice data from an image file.

    Sends the image with INVOICE_PROMPT and parses the reply as JSON.

    Args:
        image_path: Path of the invoice image on disk.

    Returns:
        The parsed JSON value from the reply.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    import json  # local import: json is not imported at the top of the visible file

    # analyze_image is a blocking synchronous call; run it in a worker thread
    # so the event loop can keep serving other coroutines meanwhile.
    result = await asyncio.to_thread(analyze_image, image_path, INVOICE_PROMPT)

    # NOTE(review): extract_json is not defined in the visible source —
    # presumably it isolates the JSON object from any surrounding reply text;
    # confirm it exists before relying on this function.
    return json.loads(extract_json(result))

Image Token Costs

Image tokens are calculated based on image dimensions. Claude resizes images to fit within its processing limits:

See AI Voice Agents Handle Real Calls

Book a free demo or calculate how much you can save with AI voice automation.

Try Live Demo ROI Calculator

flowchart TD
    ROOT["Claude Vision API: Analyzing Images and Docu…"] 
    ROOT --> P0["Sending Images to Claude"]
    P0 --> P0C0["Base64-Encoded Images"]
    P0 --> P0C1["URL-Based Images"]
    ROOT --> P1["Document Processing at Scale"]
    P1 --> P1C0["PDF Processing Pipeline"]
    P1 --> P1C1["Invoice Processing Example"]
    ROOT --> P2["Image Token Costs"]
    P2 --> P2C0["Optimizing Image Costs"]
    ROOT --> P3["Prompt Engineering for Vision"]
    P3 --> P3C0["Be Specific About What to Look For"]
    P3 --> P3C1["Use Structured Output Requests"]
    P3 --> P3C2["Provide Context When Available"]
    style ROOT fill:#4f46e5,stroke:#4338ca,color:#fff
    style P0 fill:#e0e7ff,stroke:#6366f1,color:#1e293b
    style P1 fill:#e0e7ff,stroke:#6366f1,color:#1e293b
    style P2 fill:#e0e7ff,stroke:#6366f1,color:#1e293b
    style P3 fill:#e0e7ff,stroke:#6366f1,color:#1e293b

Image Size	Approximate Tokens	Cost (Sonnet Input)
200x200 px	~200	$0.0006
800x600 px	~800	$0.0024
1920x1080 px	~1,600	$0.0048
4000x3000 px	~3,000	$0.0090

Optimizing Image Costs

from PIL import Image
import io

def optimize_image(image_path: str, max_dimension: int = 1568) -> str:
    """Resize image to reduce token costs while preserving readability.

    Images whose longer side exceeds max_dimension are scaled down with the
    aspect ratio preserved; smaller images are only re-encoded.

    Args:
        image_path: Path of the image file to load.
        max_dimension: Largest allowed width or height, in pixels.

    Returns:
        The (possibly resized) image as a base64-encoded PNG string.
    """
    # The context manager releases the underlying file handle, which the
    # bare Image.open() call left open.
    with Image.open(image_path) as original:
        img = original

        # Calculate resize ratio
        ratio = min(max_dimension / original.width, max_dimension / original.height)
        if ratio < 1:
            # Clamp to 1 px: int() truncation could otherwise produce a
            # zero-sized side for extreme aspect ratios, which resize rejects.
            new_size = (
                max(1, int(original.width * ratio)),
                max(1, int(original.height * ratio)),
            )
            img = original.resize(new_size, Image.LANCZOS)

        # PNG cannot store CMYK pixels (common in print-oriented JPEGs), so
        # convert those to RGB before encoding.
        if img.mode == "CMYK":
            img = img.convert("RGB")

        # Convert to PNG bytes
        buffer = io.BytesIO()
        img.save(buffer, format="PNG", optimize=True)

    return base64.standard_b64encode(buffer.getvalue()).decode()

The maximum image dimension Claude accepts is 8,000 pixels on any side, but images are internally resized to a maximum of 1,568 pixels on the long side. Sending larger images just wastes upload bandwidth -- they get resized before processing.

Batch Image Processing

For processing hundreds or thousands of images, use the Batch API:

def prepare_image_batch(
    image_paths: list[str],
    prompt: str,
    model: str = "claude-haiku-4-5-20250514",
) -> list[dict]:
    """Build one batch request entry per image.

    Args:
        image_paths: Paths of the images to include, in batch order.
        prompt: Instruction attached to every image.
        model: Model identifier placed in each request's params.

    Returns:
        A list of request dicts, each with a "custom_id" of the form
        "img-<index>" (0-based) and the message "params" for that image.
    """

    def build_request(position: int, path: str) -> dict:
        # Each image goes through optimize_image, whose output is sent as PNG.
        encoded_png = optimize_image(path)
        image_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": encoded_png,
            },
        }
        user_message = {
            "role": "user",
            "content": [image_block, {"type": "text", "text": prompt}],
        }
        return {
            "custom_id": f"img-{position}",
            "params": {
                "model": model,
                "max_tokens": 1024,
                "messages": [user_message],
            },
        }

    return [
        build_request(position, path)
        for position, path in enumerate(image_paths)
    ]

# Process 1,000 product images for classification
# NOTE(review): product_images is not defined in the visible source —
# presumably a list of image file paths; confirm before running.
batch_requests = prepare_image_batch(
    product_images,
    "Classify this product image. Return JSON: {category, subcategory, color, condition}",
)
# Submit all prepared requests as a single batch.
batch = client.messages.batches.create(requests=batch_requests)

Prompt Engineering for Vision

Be Specific About What to Look For

Bad: "Describe this image."
Good: "This is a screenshot of a web form. List every input field, its label, current value, and any validation errors shown."

Use Structured Output Requests

Bad: "What does this chart show?"
Good: "Extract the data from this bar chart. Return a JSON array of {label: string, value: number} objects for each bar."

Provide Context When Available

# Better results when you provide context
# The message places a context sentence before the image and the extraction
# instruction after it.
# NOTE(review): form_image is not defined in the visible source — presumably
# a base64-encoded PNG string; confirm before running.
response = client.messages.create(
    model="claude-sonnet-4-5-20250514",
    max_tokens=4096,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "This is a medical insurance claim form from Aetna."},
            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": form_image}},
            {"type": "text", "text": "Extract all fields. Pay special attention to the diagnosis codes (ICD-10) and procedure codes (CPT)."},
        ],
    }],
)

Limitations to Know

No pixel-perfect coordinate extraction: Claude understands spatial relationships but does not return exact pixel coordinates
Handwriting recognition: Works reasonably well for clear handwriting but struggles with messy or stylized handwriting
Small text: Text smaller than approximately 12pt at 72 DPI may not be reliably readable. Increase image resolution if you need to read fine print
Rotated content: Claude can handle slight rotations but may struggle with 90-degree or upside-down text. Pre-process images to correct orientation

Claude Vision API: Analyzing Images and Documents at Scale