Skip to content
Back to Blog
Agentic AI · 6 min read

Claude Vision API: Analyzing Images and Documents at Scale

Complete guide to using Claude's vision capabilities for image analysis, document processing, and OCR at scale. Covers image formats, multi-image analysis, PDF processing, prompt engineering for vision tasks, and cost optimization.

Claude's Vision Capabilities

Claude can process images alongside text, enabling a wide range of applications: document OCR, chart analysis, UI screenshot review, product image classification, medical image triage, and more. Vision is available across all Claude models (Opus, Sonnet, Haiku) with the same API interface.

Unlike dedicated OCR tools or computer vision models that only extract specific features, Claude understands images holistically. It can read text, interpret charts, describe visual layouts, identify objects, and reason about relationships between visual elements -- all in a single API call.

Sending Images to Claude

Base64-Encoded Images

import base64
from pathlib import Path
from anthropic import Anthropic

# Module-level synchronous client used by the `client.messages.create(...)` calls in this file.
# NOTE(review): constructed with no arguments -- presumably credentials come from the environment; confirm.
client = Anthropic()

def analyze_image(
    image_path: str,
    prompt: str,
    model: str = "claude-sonnet-4-5-20250929",
) -> str:
    """Send a local image to Claude and return the text of the reply.

    Args:
        image_path: Path to the image file on disk. The media type is chosen
            from the file extension (.jpg/.jpeg, .png, .gif, .webp).
        prompt: Instruction sent as a text block after the image.
        model: Model ID to query.

    Returns:
        The text of the first content block in the response.
    """
    path = Path(image_path)
    base64_image = base64.standard_b64encode(path.read_bytes()).decode("utf-8")

    # Extension -> media type declared in the request's image source.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    # Unrecognized extensions keep the original best-effort fallback to JPEG.
    media_type = media_types.get(path.suffix.lower(), "image/jpeg")

    response = client.messages.create(
        model=model,
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": base64_image,
                    },
                },
                {
                    "type": "text",
                    "text": prompt,
                },
            ],
        }],
    )
    return response.content[0].text

URL-Based Images

# Reference the image by URL instead of embedding base64 data in the request.
response = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=4096,
    messages=[{
        "role": "user",
        "content": [
            {
                "type": "image",
                "source": {
                    "type": "url",
                    "url": "https://example.com/chart.png",
                },
            },
            {
                "type": "text",
                "text": "Analyze this chart and summarize the key trends.",
            },
        ],
    }],
)

Multi-Image Analysis

Claude can process multiple images in a single request, enabling comparison and cross-reference tasks:

def compare_images(image_paths: list[str], prompt: str) -> str:
    """Send several local images to Claude in one request and return the reply text.

    Each image is preceded by a short "Image N:" text block (1-based) so the
    prompt can refer to images by number; the prompt itself is sent last.

    Args:
        image_paths: Paths to the image files, in the order they should be numbered.
        prompt: Instruction sent as the final text block.

    Returns:
        The text of the first content block in the response.
    """
    # Declare each file's real media type rather than labelling everything PNG.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }

    content = []
    for index, image_path in enumerate(image_paths, start=1):
        path = Path(image_path)
        encoded = base64.standard_b64encode(path.read_bytes()).decode()
        content.append({
            "type": "text",
            "text": f"Image {index}:",
        })
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                # Unknown extensions keep the previous behavior (treated as PNG).
                "media_type": media_types.get(path.suffix.lower(), "image/png"),
                "data": encoded,
            },
        })

    content.append({"type": "text", "text": prompt})

    response = client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=4096,
        messages=[{"role": "user", "content": content}],
    )
    return response.content[0].text

# Example: ask for a comparison of two revisions of a UI design
result = compare_images(
    image_paths=["design_v1.png", "design_v2.png"],
    prompt="Compare these two UI designs. What changed? Which is better for usability?",
)

Document Processing at Scale

PDF Processing Pipeline

import fitz  # PyMuPDF
import asyncio
from anthropic import AsyncAnthropic

# Module-level async client; awaited by process_pdf_page.
# NOTE(review): constructed with no arguments -- presumably credentials come from the environment; confirm.
async_client = AsyncAnthropic()

def pdf_to_images(pdf_path: str, dpi: int = 200) -> list[str]:
    """Render every page of a PDF to PNG and return the pages base64-encoded.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Rendering resolution; the page is scaled by ``dpi / 72`` on both axes.

    Returns:
        One base64 string (PNG bytes) per page, in page order.
    """
    doc = fitz.open(pdf_path)
    try:
        # The scale matrix is the same for every page, so build it once.
        zoom = fitz.Matrix(dpi / 72, dpi / 72)
        return [
            base64.standard_b64encode(
                page.get_pixmap(matrix=zoom).tobytes("png")
            ).decode()
            for page in doc
        ]
    finally:
        # Close the document even if rendering a page raises.
        doc.close()

async def process_pdf_page(page_image: str, page_num: int, prompt: str) -> dict:
    """Send one rendered PDF page to Claude and summarize the response.

    Args:
        page_image: Base64-encoded PNG of the page.
        page_num: Page identifier echoed back in the result.
        prompt: Instruction sent as a text block after the image.

    Returns:
        A dict with ``page`` (the given ``page_num``), ``content`` (text of the
        first response block) and ``tokens`` (input plus output token usage).
    """
    response = await async_client.messages.create(
        model="claude-sonnet-4-5-20250929",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": page_image,
                    },
                },
                {"type": "text", "text": prompt},
            ],
        }],
    )
    usage = response.usage
    return {
        "page": page_num,
        "content": response.content[0].text,
        "tokens": usage.input_tokens + usage.output_tokens,
    }

async def process_pdf(pdf_path: str, prompt: str, max_concurrent: int = 5) -> list[dict]:
    """Run the prompt against every page of a PDF, several pages at a time.

    Pages are rendered up front, then each page is processed by
    ``process_pdf_page`` with at most ``max_concurrent`` calls in flight.
    The per-page result dicts are returned ordered by their ``"page"`` value.
    """
    page_images = pdf_to_images(pdf_path)
    limiter = asyncio.Semaphore(max_concurrent)

    async def run_page(page_index, encoded_page):
        # Hold a semaphore slot for the duration of the API call.
        async with limiter:
            return await process_pdf_page(encoded_page, page_index, prompt)

    page_results = await asyncio.gather(
        *(run_page(index, encoded) for index, encoded in enumerate(page_images))
    )
    return sorted(page_results, key=lambda entry: entry["page"])

Invoice Processing Example

# Prompt for invoice extraction: spells out the JSON shape the model should
# return and tells it to use null for any field it cannot read.
# Passed to analyze_image by process_invoice.
INVOICE_PROMPT = """Extract all information from this invoice and return it as JSON:
{
  "vendor_name": "...",
  "vendor_address": "...",
  "invoice_number": "...",
  "invoice_date": "YYYY-MM-DD",
  "due_date": "YYYY-MM-DD",
  "line_items": [
    {"description": "...", "quantity": N, "unit_price": N.NN, "total": N.NN}
  ],
  "subtotal": N.NN,
  "tax": N.NN,
  "total": N.NN,
  "currency": "USD",
  "payment_terms": "..."
}

If any field is not visible or unclear, set it to null."""

def extract_json(text: str) -> str:
    """Return the first complete JSON object embedded in ``text``.

    Model replies often wrap JSON in prose or a Markdown code fence. This
    scans for each ``{`` and returns the first span that parses as JSON.

    Raises:
        ValueError: If no parseable JSON object is found.
    """
    import json

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the JSON value and reports where.
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return text[start:end]
    raise ValueError("no JSON object found in model response")


async def process_invoice(image_path: str) -> dict:
    """Extract invoice fields from an image and return them as a dict.

    Args:
        image_path: Path to the invoice image on disk.

    Returns:
        The parsed JSON object produced for ``INVOICE_PROMPT``.

    Raises:
        ValueError: If the reply contains no parseable JSON object.
    """
    import asyncio
    import json

    # analyze_image is synchronous; run it in a worker thread so this
    # coroutine does not block the event loop while the request is in flight.
    result = await asyncio.to_thread(analyze_image, image_path, INVOICE_PROMPT)
    return json.loads(extract_json(result))

Image Token Costs

Image tokens are calculated based on image dimensions. Claude resizes images to fit within its processing limits:

| Image Size | Approximate Tokens | Cost (Sonnet Input) |
| --- | --- | --- |
| 200x200 px | ~200 | $0.0006 |
| 800x600 px | ~800 | $0.0024 |
| 1920x1080 px | ~1,600 | $0.0048 |
| 4000x3000 px | ~3,000 | $0.0090 |

Optimizing Image Costs

from PIL import Image
import io

def optimize_image(image_path: str, max_dimension: int = 1568) -> str:
    """Downscale an image to fit ``max_dimension`` and return it as base64 PNG.

    Images already within the limit are not resized, only re-encoded as PNG.

    Args:
        image_path: Path to the source image.
        max_dimension: Maximum allowed width and height, in pixels.

    Returns:
        Base64-encoded PNG bytes of the (possibly resized) image.

    Raises:
        ValueError: If ``max_dimension`` is less than 1.
    """
    if max_dimension < 1:
        raise ValueError("max_dimension must be at least 1")

    # Modes the PNG encoder can write directly; anything else (e.g. CMYK)
    # is converted to RGB before saving.
    png_modes = {"1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA"}

    # The context manager releases the underlying file handle when done.
    with Image.open(image_path) as source:
        output = source

        # Scale factor that brings the longer side down to max_dimension.
        ratio = min(max_dimension / source.width, max_dimension / source.height)
        if ratio < 1:
            # Clamp to 1px so extreme aspect ratios never round down to zero.
            new_size = (
                max(1, int(source.width * ratio)),
                max(1, int(source.height * ratio)),
            )
            output = source.resize(new_size, Image.LANCZOS)

        if output.mode not in png_modes:
            output = output.convert("RGB")

        buffer = io.BytesIO()
        output.save(buffer, format="PNG", optimize=True)

    return base64.standard_b64encode(buffer.getvalue()).decode()

The maximum image dimension Claude accepts is 8,000 pixels on any side, but images are internally resized to a maximum of 1,568 pixels on the long side. Sending larger images just wastes upload bandwidth -- they get resized before processing.

Batch Image Processing

For processing hundreds or thousands of images, use the Batch API:

def prepare_image_batch(
    image_paths: list[str],
    prompt: str,
    model: str = "claude-haiku-4-5-20251001",
) -> list[dict]:
    """Build one batch request entry per image.

    Each image is passed through ``optimize_image`` and paired with the same
    prompt. Entries are identified by ``custom_id`` values ``img-0``,
    ``img-1``, ... following the order of ``image_paths``.

    Args:
        image_paths: Paths of the images to include.
        prompt: Instruction sent with every image.
        model: Model ID used for every request in the batch.

    Returns:
        A list of dicts, each with ``custom_id`` and ``params`` keys.
    """
    requests = []
    for i, path in enumerate(image_paths):
        optimized = optimize_image(path)
        requests.append({
            "custom_id": f"img-{i}",
            "params": {
                "model": model,
                "max_tokens": 1024,
                "messages": [{
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                # optimize_image always re-encodes as PNG.
                                "media_type": "image/png",
                                "data": optimized,
                            },
                        },
                        {"type": "text", "text": prompt},
                    ],
                }],
            },
        })
    return requests

# Classify 1,000 product images: build the request list, then submit it as one batch
batch_requests = prepare_image_batch(
    image_paths=product_images,
    prompt="Classify this product image. Return JSON: {category, subcategory, color, condition}",
)
batch = client.messages.batches.create(requests=batch_requests)

Prompt Engineering for Vision

Be Specific About What to Look For

Bad: "Describe this image."
Good: "This is a screenshot of a web form. List every input field, its label, current value, and any validation errors shown."

Use Structured Output Requests

Bad: "What does this chart show?"
Good: "Extract the data from this bar chart. Return a JSON array of {label: string, value: number} objects for each bar."

Provide Context When Available

# Better results when you provide context: describe the document before the
# image, then give the extraction instructions after it.
response = client.messages.create(
    model="claude-sonnet-4-5-20250929",
    max_tokens=4096,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "This is a medical insurance claim form from Aetna."},
            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": form_image}},
            {"type": "text", "text": "Extract all fields. Pay special attention to the diagnosis codes (ICD-10) and procedure codes (CPT)."},
        ],
    }],
)

Limitations to Know

  • No pixel-perfect coordinate extraction: Claude understands spatial relationships but does not return exact pixel coordinates
  • Handwriting recognition: Works reasonably well for clear handwriting but struggles with messy or stylized handwriting
  • Small text: Text smaller than approximately 12pt at 72 DPI may not be reliably readable. Increase image resolution if you need to read fine print
  • Rotated content: Claude can handle slight rotations but may struggle with 90-degree or upside-down text. Pre-process images to correct orientation
Share this article
N

NYC News

Expert insights on AI voice agents and customer communication automation.

Try CallSphere AI Voice Agents

See how AI voice agents work for your industry. Live demo available -- no signup required.