Building a Video Analysis Agent: Frame Extraction, Scene Detection, and Summarization
Learn how to build a video analysis agent in Python that extracts key frames, detects scene changes, performs temporal analysis, and generates structured summaries using vision language models.
Why Video Analysis Is Hard for AI Agents
Video is the most information-dense modality. A 10-minute video at 30 fps contains 18,000 frames. Sending all of them to a vision model is impractical — it would be slow, expensive, and most frames are redundant. The key challenge is selecting the right frames that capture meaningful changes, then reasoning over them temporally.
This guide builds a video analysis agent that intelligently samples frames, detects scene boundaries, and produces structured summaries.
Dependencies
pip install opencv-python-headless numpy openai pillow scenedetect
Frame Extraction Strategies
The simplest approach samples frames at regular intervals, but this misses important moments and wastes tokens on static segments. A smarter strategy combines uniform sampling with scene-change detection:
import cv2
import numpy as np
from dataclasses import dataclass
from PIL import Image
@dataclass
class ExtractedFrame:
    """One frame sampled from a video, plus where it came from.

    Attributes:
        frame_number: Zero-based index of the frame in the source video.
        timestamp_seconds: Position of the frame on the video timeline.
        image: Decoded frame as an RGB PIL image.
        is_scene_change: True when this frame marks a detected scene boundary.
    """
    frame_number: int
    timestamp_seconds: float
    image: Image.Image
    is_scene_change: bool = False
def extract_uniform_frames(
    video_path: str, interval_seconds: float = 2.0
) -> list[ExtractedFrame]:
    """Extract frames at regular time intervals.

    Args:
        video_path: Path to a video file readable by OpenCV.
        interval_seconds: Spacing between sampled frames on the timeline.

    Returns:
        Frames in chronological order, one roughly every
        ``interval_seconds`` seconds.

    Raises:
        ValueError: If the video's FPS cannot be read (unreadable or
            missing file), which would otherwise cause a division by
            zero below.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            # cv2 returns 0.0 for unreadable files; fail loudly instead of
            # dividing by zero when computing timestamps.
            raise ValueError(f"Could not read FPS from video: {video_path}")
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp to 1 so range() never gets a zero step when
        # interval_seconds is smaller than one frame duration.
        interval_frames = max(1, int(fps * interval_seconds))
        frames: list[ExtractedFrame] = []
        for frame_num in range(0, total_frames, interval_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes to BGR; PIL expects RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(ExtractedFrame(
                frame_number=frame_num,
                timestamp_seconds=frame_num / fps,
                image=Image.fromarray(rgb),
            ))
        return frames
    finally:
        # Release the capture even if decoding raises mid-loop.
        cap.release()
Scene Change Detection
Scene detection identifies frames where the visual content changes significantly. This ensures the agent captures transitions between topics, slides, or camera angles:
from scenedetect import open_video, SceneManager
from scenedetect.detectors import ContentDetector
def detect_scene_changes(
    video_path: str, threshold: float = 27.0
) -> list[float]:
    """Return timestamps (seconds) where scene changes occur.

    Args:
        video_path: Path to the video file to scan.
        threshold: ContentDetector sensitivity; lower values detect
            more (subtler) scene changes.

    Returns:
        Start time in seconds of each detected scene, in order.
    """
    video = open_video(video_path)
    manager = SceneManager()
    manager.add_detector(ContentDetector(threshold=threshold))
    manager.detect_scenes(video)
    # Each scene is a (start, end) timecode pair; keep only the starts.
    return [start.get_seconds() for start, _end in manager.get_scene_list()]
def extract_key_frames(
    video_path: str,
    uniform_interval: float = 3.0,
    max_frames: int = 50,
    dedupe_window: float = 1.0,
) -> list[ExtractedFrame]:
    """Combine uniform sampling with scene-change frames.

    Args:
        video_path: Path to a video file readable by OpenCV.
        uniform_interval: Spacing (seconds) for the uniform samples.
        max_frames: Upper bound on the number of frames returned; note
            that truncation keeps the earliest frames by timestamp.
        dedupe_window: Uniform frames closer than this (seconds) to a
            scene-change frame are dropped as redundant.

    Returns:
        Chronologically sorted frames, scene-change frames included,
        capped at ``max_frames``.
    """
    uniform = extract_uniform_frames(video_path, uniform_interval)
    scene_timestamps = detect_scene_changes(video_path)

    # Extract one frame at each detected scene boundary.
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        scene_frames: list[ExtractedFrame] = []
        for ts in scene_timestamps:
            frame_num = int(ts * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if ret:
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                scene_frames.append(ExtractedFrame(
                    frame_number=frame_num,
                    timestamp_seconds=ts,
                    image=Image.fromarray(rgb),
                    is_scene_change=True,
                ))
    finally:
        # Release the capture even if a read/convert raises.
        cap.release()

    # Merge, dropping uniform frames that duplicate a nearby scene change.
    all_frames = scene_frames + [
        f for f in uniform
        if not any(
            abs(f.timestamp_seconds - sf.timestamp_seconds) < dedupe_window
            for sf in scene_frames
        )
    ]
    all_frames.sort(key=lambda f: f.timestamp_seconds)
    return all_frames[:max_frames]
Temporal Analysis with Vision Models
Send selected frames to GPT-4o with temporal context so the model understands the sequence:
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
import openai
import base64
import io
async def analyze_frames(
    frames: list[ExtractedFrame],
    query: str,
    client: openai.AsyncOpenAI,
) -> str:
    """Analyze a sequence of video frames with GPT-4o.

    Builds a single multimodal user message: an instruction header,
    then for each frame a timestamp caption followed by the frame as a
    base64-encoded JPEG data URL.

    Args:
        frames: Frames to analyze, assumed chronological.
        query: Question the model should answer about the video.
        client: Configured async OpenAI client.

    Returns:
        The model's text answer.
    """
    header = (
        f"These are {len(frames)} key frames extracted from a "
        f"video, shown in chronological order. "
        f"Scene-change frames are marked.\n\n"
        f"Question: {query}"
    )
    content_parts: list[dict] = [{"type": "text", "text": header}]

    for frame in frames:
        # Caption each image so the model can reason about ordering.
        marker = " [SCENE CHANGE]" if frame.is_scene_change else ""
        caption = f"Frame at {frame.timestamp_seconds:.1f}s{marker}:"
        content_parts.append({"type": "text", "text": caption})

        # Re-encode as JPEG and inline as a data URL.
        buffer = io.BytesIO()
        frame.image.save(buffer, format="JPEG", quality=80)
        encoded = base64.b64encode(buffer.getvalue()).decode()
        content_parts.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}",
                # "low" detail keeps token cost manageable for many frames.
                "detail": "low",
            },
        })

    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": content_parts}],
        max_tokens=2000,
    )
    return response.choices[0].message.content
The Video Analysis Agent
Bring all components together into a coherent agent:
class VideoAnalysisAgent:
    """Agent that loads a video once and answers questions about it.

    Call :meth:`load_video` first; :meth:`summarize` and :meth:`ask`
    raise ``ValueError`` if no frames have been loaded, instead of
    silently sending an image-free request to the API.
    """

    def __init__(self) -> None:
        self.client = openai.AsyncOpenAI()
        # Key frames extracted from the most recently loaded video.
        self.frames: list[ExtractedFrame] = []

    def load_video(self, video_path: str) -> dict:
        """Extract key frames and return basic stats about the extraction.

        Returns:
            Dict with ``total_frames`` (frames kept),
            ``duration_covered`` (timestamp of the last kept frame, or 0
            for an empty video), and ``scene_changes`` (count of
            scene-boundary frames).
        """
        self.frames = extract_key_frames(video_path)
        return {
            "total_frames": len(self.frames),
            "duration_covered": (
                self.frames[-1].timestamp_seconds
                if self.frames else 0
            ),
            "scene_changes": sum(
                1 for f in self.frames if f.is_scene_change
            ),
        }

    def _require_frames(self) -> None:
        """Raise if no video has been loaded (or it yielded no frames)."""
        if not self.frames:
            raise ValueError(
                "No frames available; call load_video() before analyzing."
            )

    async def summarize(self) -> str:
        """Return a chronological summary of the loaded video."""
        self._require_frames()
        return await analyze_frames(
            self.frames,
            "Provide a detailed chronological summary of this video.",
            self.client,
        )

    async def ask(self, question: str) -> str:
        """Answer a free-form question about the loaded video."""
        self._require_frames()
        return await analyze_frames(
            self.frames, question, self.client
        )
FAQ
How many frames should I extract from a video for analysis?
A good rule of thumb is 1 frame per 2-3 seconds for short videos (under 5 minutes) and 1 frame per 5-10 seconds for longer content, capped at around 50 frames total. Scene-change frames should always be included regardless of the interval. GPT-4o can process up to about 50 images in a single request while maintaining reasonable latency and cost.
Can this agent process audio from the video as well?
Yes. Extract the audio track using ffmpeg (ffmpeg -i video.mp4 -vn audio.wav), transcribe it with Whisper, and include the timestamped transcript alongside the frame analysis. The agent can then correlate what is shown visually with what is spoken, providing much richer summaries.
How do I handle very long videos like full webinars or lectures?
For long videos, use a two-pass approach. First, extract frames with a wide interval (every 10-15 seconds) and generate a high-level outline. Second, identify the segments relevant to the user's question and re-extract those segments with finer granularity (every 1-2 seconds). This mimics how humans scan through a long video before focusing on specific sections.
#VideoAnalysis #SceneDetection #FrameExtraction #ComputerVision #Python #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.