Multi-Modal AI in Production: Vision, Audio, and Text Combined
A practical guide to building production multi-modal AI systems that process images, audio, and text in unified pipelines. Covers architecture patterns, model selection, preprocessing, and real-world deployment strategies for multi-modal applications.
The Multi-Modal Convergence
In 2026, the distinction between "vision models," "language models," and "audio models" is dissolving. Frontier LLMs natively process images, PDFs, and increasingly audio within the same context window. This convergence enables applications that were architecturally complex just two years ago: a single API call can now analyze a screenshot, read a chart, listen to audio, and generate a text response that references all three inputs.
This guide covers the practical patterns for building production systems that leverage multi-modal capabilities.
Multi-Modal Capabilities by Provider
| Capability | Claude (Anthropic) | GPT-4o (OpenAI) | Gemini 2.0 (Google) |
|---|---|---|---|
| Image input | Yes | Yes | Yes |
| PDF input | Yes (native) | Via vision | Yes (native) |
| Audio input | No | Yes (native) | Yes (native) |
| Video input | No | Via frame extraction | Yes (native) |
| Image generation | No | Yes (DALL-E) | Yes (Imagen) |
| Audio output | No | Yes (TTS) | Yes (TTS) |
| Multi-image | Yes (up to 20) | Yes | Yes |
Pattern 1: Document Understanding Pipeline
The most common multi-modal production use case is processing documents that contain text, tables, charts, and images.
import anthropic
import base64
from pathlib import Path
# Use the async client: the coroutines below `await client.messages.create(...)`,
# which the synchronous `anthropic.Anthropic()` client does not support.
client = anthropic.AsyncAnthropic()
async def analyze_document(file_path: str, question: str) -> str:
    """Analyze a PDF or image document with a specific question.

    Args:
        file_path: Path to a ``.pdf`` or image file
            (``.png``/``.jpg``/``.jpeg``/``.webp``/``.gif``).
        question: Question to ask the model about the document.

    Returns:
        The model's text answer.
    """
    path = Path(file_path)
    suffix = path.suffix.lower()  # case-insensitive: ".PDF" is still a PDF
    # PDFs and images are both sent base64-encoded; only the content-block
    # type and media type differ, so build that one block and share the rest.
    with open(path, "rb") as f:
        encoded = base64.standard_b64encode(f.read()).decode("utf-8")
    if suffix == ".pdf":
        # Claude natively processes PDFs via the "document" content block.
        media_block = {
            "type": "document",
            "source": {
                "type": "base64",
                "media_type": "application/pdf",
                "data": encoded,
            },
        }
    else:
        # Unknown image suffixes fall back to PNG, matching prior behavior.
        media_type = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".webp": "image/webp",
            ".gif": "image/gif",
        }.get(suffix, "image/png")
        media_block = {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": media_type,
                "data": encoded,
            },
        }
    response = await client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=4096,
        messages=[{
            "role": "user",
            "content": [media_block, {"type": "text", "text": question}],
        }],
    )
    return response.content[0].text
Structured Data Extraction from Documents
from pydantic import BaseModel
class InvoiceData(BaseModel):
    """Structured fields extracted from an invoice image.

    The JSON schema of this model (``model_json_schema()``) is passed to the
    API as a tool input schema, forcing the model's output into this shape.
    """
    vendor_name: str
    invoice_number: str
    date: str  # as printed on the invoice; format is not normalized here
    line_items: list[dict]  # one dict per line item; keys chosen by the model
    subtotal: float
    tax: float
    total: float
    payment_terms: str
async def extract_invoice_data(image_path: str) -> InvoiceData:
    """Extract structured data from an invoice image.

    Forces tool choice so the model must emit JSON matching the
    ``InvoiceData`` schema.

    Args:
        image_path: Path to the invoice image
            (``.png``/``.jpg``/``.jpeg``/``.webp``/``.gif``).

    Returns:
        Parsed ``InvoiceData`` instance.
    """
    # Derive the media type from the file extension; the previous version
    # hardcoded image/png, which mislabeled JPEG/WebP/GIF invoices.
    media_type = {
        ".png": "image/png",
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".webp": "image/webp",
        ".gif": "image/gif",
    }.get(Path(image_path).suffix.lower(), "image/png")
    with open(image_path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")
    response = await client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2048,
        tools=[{
            "name": "extract_invoice",
            "description": "Extract invoice data from the image",
            "input_schema": InvoiceData.model_json_schema(),
        }],
        # Forced tool choice guarantees a tool_use block in the response.
        tool_choice={"type": "tool", "name": "extract_invoice"},
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_data,
                    },
                },
                {
                    "type": "text",
                    "text": "Extract all invoice data from this image.",
                },
            ],
        }],
    )
    tool_block = next(b for b in response.content if b.type == "tool_use")
    return InvoiceData(**tool_block.input)
Pattern 2: Vision-Language Agent
A vision-language agent processes screenshots, photos, or camera feeds as part of its reasoning loop:
class VisionAgent:
    """Agent that can see and reason about visual inputs.

    Keeps a running conversation so follow-up calls retain prior visual and
    textual context across turns.
    """

    def __init__(self, client, tools: list):
        self.client = client      # async Anthropic-style client
        self.tools = tools        # tool definitions passed on every call
        self.conversation = []    # accumulated message history

    async def process_with_image(
        self, text: str, image_path: str = None, image_url: str = None
    ) -> str:
        """Send text (plus an optional image by path or URL) and run the
        agentic tool loop until the model produces a final text answer.

        Args:
            text: The user prompt.
            image_path: Optional local image file to attach (base64).
            image_url: Optional image URL to attach; ignored when
                ``image_path`` is also given.

        Returns:
            The model's final text response.

        Raises:
            RuntimeError: If the model stops without ending its turn or
                requesting any tool (e.g. it hit max_tokens).
        """
        content = []
        if image_path:
            # Derive the media type from the extension instead of assuming
            # PNG (the previous version mislabeled JPEG/WebP/GIF inputs).
            media_type = {
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".webp": "image/webp",
                ".gif": "image/gif",
            }.get(Path(image_path).suffix.lower(), "image/png")
            with open(image_path, "rb") as f:
                data = base64.standard_b64encode(f.read()).decode("utf-8")
            content.append({
                "type": "image",
                "source": {"type": "base64", "media_type": media_type, "data": data},
            })
        elif image_url:
            content.append({
                "type": "image",
                "source": {"type": "url", "url": image_url},
            })
        content.append({"type": "text", "text": text})
        self.conversation.append({"role": "user", "content": content})
        # Agentic loop: call the model, execute any requested tools, feed the
        # results back, and repeat until it ends its turn with text.
        while True:
            response = await self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4096,
                system="You are a visual analysis agent. Use tools when needed.",
                messages=self.conversation,
                tools=self.tools,
            )
            self.conversation.append({
                "role": "assistant", "content": response.content
            })
            if response.stop_reason == "end_turn":
                return next(
                    b.text for b in response.content if b.type == "text"
                )
            tool_results = []
            for block in response.content:
                if block.type == "tool_use":
                    # _execute_tool is expected to be provided by a subclass
                    # or mixed in elsewhere — it is not defined in this class.
                    result = await self._execute_tool(block.name, block.input)
                    tool_results.append({
                        "type": "tool_result",
                        "tool_use_id": block.id,
                        "content": str(result),
                    })
            if not tool_results:
                # e.g. stop_reason == "max_tokens": previously this appended
                # an empty user message, which the API rejects (or the loop
                # would never terminate). Fail loudly instead.
                raise RuntimeError(
                    f"Model stopped without tool use: {response.stop_reason}"
                )
            self.conversation.append({"role": "user", "content": tool_results})
Pattern 3: Audio Processing Pipeline
For applications that need to process voice input, transcribe it, and generate responses:
from openai import OpenAI
# NOTE(review): this synchronous client rebinds the module-level `client`
# name used by the earlier Anthropic examples — these snippets are intended
# as separate examples; confirm before combining them in one module.
client = OpenAI()
async def voice_to_action(audio_file_path: str) -> dict:
    """Process voice input: transcribe, understand, and act.

    Args:
        audio_file_path: Path to the recorded audio file.

    Returns:
        Dict with the transcript, classified intent, text response, and the
        path of the generated spoken response.

    Raises:
        ValueError: If the classified intent has no registered handler.
    """
    # Step 1: Transcribe with Whisper (synchronous call; the async client
    # would be needed to avoid blocking the event loop here).
    with open(audio_file_path, "rb") as f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"],
        )
    # Step 2: Understand intent with LLM
    intent = await classify_intent(transcript.text)
    # Step 3: Process based on intent
    if intent.action == "schedule_meeting":
        result = await schedule_meeting(intent.parameters)
    elif intent.action == "search_knowledge":
        result = await search_and_answer(intent.parameters)
    elif intent.action == "create_task":
        result = await create_task(intent.parameters)
    else:
        # Previously an unrecognized action left `result` unbound and the
        # function crashed with NameError; fail with a clear message instead.
        raise ValueError(f"Unsupported intent action: {intent.action}")
    # Step 4: Generate spoken response
    speech = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=result.text_response,
    )
    speech.stream_to_file("response.mp3")
    return {
        "transcript": transcript.text,
        "intent": intent,
        "response": result.text_response,
        "audio_response": "response.mp3",
    }
Multi-Modal RAG
Adding visual understanding to RAG pipelines enables retrieval from documents with charts, diagrams, and screenshots:
class MultiModalRAG:
    """RAG pipeline that handles text, images, and mixed documents.

    Maintains two indexes: one for text chunks and one for page images with
    generated descriptions, and queries both at answer time.
    """

    def __init__(self, text_index, image_index, llm_client):
        self.text_index = text_index    # vector index over text chunks
        self.image_index = image_index  # vector index over page images
        self.llm = llm_client           # async Anthropic-style client

    async def index_document(self, doc_path: str):
        """Index a document, extracting both text and visual elements.

        PDFs get each page rendered to an image and described; the PDF text
        is also chunked and indexed. Non-PDF inputs are skipped entirely.
        """
        # Case-insensitive check — ".PDF" files were previously skipped.
        if doc_path.lower().endswith(".pdf"):
            # Extract pages as images for visual content
            pages = convert_pdf_to_images(doc_path)
            for i, page_img in enumerate(pages):
                # Generate description of visual elements
                description = await self.describe_page(page_img)
                # Store image embedding + text description
                await self.image_index.upsert(
                    id=f"{doc_path}_page_{i}",
                    image=page_img,
                    metadata={"description": description, "page": i}
                )
            # Also extract and index text
            text = extract_text_from_pdf(doc_path)
            chunks = chunk_text(text)
            for chunk in chunks:
                await self.text_index.upsert(
                    id=chunk.id,
                    text=chunk.text,
                    embedding=embed(chunk.text),
                )

    async def query(self, question: str) -> str:
        """Query with both text and visual retrieval.

        Retrieves top text chunks and top page images, builds a single
        multi-modal prompt from them, and asks the LLM to answer.
        """
        # Retrieve relevant text chunks
        text_results = await self.text_index.search(question, top_k=5)
        # Retrieve relevant images/pages
        image_results = await self.image_index.search(question, top_k=3)
        # Build multi-modal context: images first, then a text block that
        # carries both the retrieved text and the question.
        content = []
        for img_result in image_results:
            content.append({
                "type": "image",
                "source": {"type": "base64", "media_type": "image/png",
                           "data": img_result.image_b64},
            })
        text_context = "\n\n".join([r.text for r in text_results])
        content.append({
            "type": "text",
            "text": f"Text context:\n{text_context}\n\nQuestion: {question}"
        })
        response = await self.llm.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2048,
            system="Answer based on the provided documents, images, and context.",
            messages=[{"role": "user", "content": content}],
        )
        return response.content[0].text
Performance and Cost Optimization
Image Optimization
Vision API costs scale with image resolution. Optimize images before sending:
from PIL import Image
import io
def optimize_image_for_api(image_path: str, max_pixels: int = 1568 * 1568) -> bytes:
    """Resize image to stay within API limits while preserving quality.

    Args:
        image_path: Path to the source image.
        max_pixels: Maximum total pixel count; larger images are downscaled
            proportionally (default matches the 1568x1568 vision limit).

    Returns:
        PNG-encoded image bytes.
    """
    # Context manager releases the underlying file handle — Image.open is
    # lazy and previously the file was never closed.
    with Image.open(image_path) as img:
        width, height = img.size
        total_pixels = width * height
        if total_pixels > max_pixels:
            # Uniform scale factor preserves the aspect ratio.
            scale = (max_pixels / total_pixels) ** 0.5
            img = img.resize(
                (int(width * scale), int(height * scale)), Image.LANCZOS
            )
        buffer = io.BytesIO()
        img.save(buffer, format="PNG", optimize=True)
    return buffer.getvalue()
Batch Processing
For high-volume document processing, use batch APIs to reduce costs:
async def batch_process_documents(documents: list[str]) -> list[dict]:
    """Process multiple documents in a batch for 50% cost savings.

    Builds one batch entry per PDF (base64-embedded) and submits them all
    in a single batch request, then polls for the results.
    """
    batch_requests = []
    for index, document_path in enumerate(documents):
        encoded = base64.standard_b64encode(
            Path(document_path).read_bytes()
        ).decode("utf-8")
        document_block = {
            "type": "document",
            "source": {
                "type": "base64",
                "media_type": "application/pdf",
                "data": encoded,
            },
        }
        prompt_block = {"type": "text", "text": "Extract key information."}
        batch_requests.append({
            "custom_id": f"doc_{index}",
            "params": {
                "model": "claude-sonnet-4-20250514",
                "max_tokens": 2048,
                "messages": [
                    {"role": "user", "content": [document_block, prompt_block]}
                ],
            },
        })
    batch = await client.messages.batches.create(requests=batch_requests)
    return await poll_batch_results(batch.id)
Key Takeaways
Multi-modal AI in production is no longer experimental -- it is the standard approach for document processing, visual analysis, and audio-enabled applications. The key architectural patterns are: unified document pipelines that handle text and images together, vision-language agents that use screenshots as part of their reasoning, audio pipelines that chain transcription with language understanding, and multi-modal RAG that retrieves from both text and visual indexes. Optimize by resizing images, using batch APIs for volume, and caching results for repeated analyses.
NYC News
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.