Accessibility Auditing with GPT Vision: Automated WCAG Compliance Checking
Use GPT Vision to perform automated accessibility audits that detect visual WCAG violations including contrast issues, missing labels, touch target sizes, and reading order problems — generating actionable compliance reports.
What Automated Tools Miss
Existing accessibility checkers like axe-core and Lighthouse do an excellent job catching DOM-level violations: missing alt attributes, empty links, improper ARIA roles. But they cannot evaluate visual aspects of accessibility that WCAG explicitly requires. Does the text have sufficient contrast against its actual background? Are touch targets visually large enough? Is the visual reading order logical? Does the color scheme rely solely on color to convey information?
GPT Vision fills this gap by analyzing the rendered page the way a human accessibility evaluator would — assessing the actual visual output rather than just the code.
Building the Accessibility Analyzer
from pydantic import BaseModel
from openai import OpenAI
class AccessibilityIssue(BaseModel):
    """A single visual WCAG violation detected in a screenshot audit."""

    wcag_criterion: str  # e.g., "1.4.3 Contrast (Minimum)"
    severity: str  # A, AA, AAA violation level
    description: str  # plain-language explanation of the violation
    location: str  # where on the page
    recommendation: str  # suggested fix, actionable by a developer
    confidence: str  # high, medium, low — how clearly visible the issue is
class AccessibilityAudit(BaseModel):
    """Structured result of a visual accessibility audit for one page."""

    page_url: str  # URL (or label) of the audited page
    overall_score: str  # pass, partial, fail
    issues: list[AccessibilityIssue]  # all violations found
    positive_findings: list[str]  # things done well
    summary: str  # short narrative overview of the audit
# Module-level API client; the OpenAI SDK reads OPENAI_API_KEY from the
# environment by default.
client = OpenAI()
def audit_accessibility(
    screenshot_b64: str, page_url: str
) -> AccessibilityAudit:
    """Run a visual accessibility audit on a screenshot.

    Args:
        screenshot_b64: Base64-encoded PNG of the rendered page.
        page_url: URL (or label) embedded in the prompt and the report.

    Returns:
        An AccessibilityAudit parsed from the model's structured output.
    """
    # `client.chat.completions.parse` is the current structured-outputs
    # entry point; the older `client.beta.chat.completions.parse`
    # namespace is deprecated in recent openai-python releases. Behavior
    # (response_format=<pydantic model>) is unchanged.
    response = client.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a WCAG 2.2 accessibility expert. Analyze "
                    "this web page screenshot for visual accessibility "
                    "issues. Check for:\n"
                    "- Text contrast against backgrounds (WCAG 1.4.3)\n"
                    "- Touch/click target sizes (WCAG 2.5.5)\n"
                    "- Color-only information conveyance (WCAG 1.4.1)\n"
                    "- Text readability and sizing (WCAG 1.4.4)\n"
                    "- Visual focus indicators (WCAG 2.4.7)\n"
                    "- Content reflow and spacing (WCAG 1.4.12)\n"
                    "- Visual heading hierarchy (WCAG 1.3.1)\n"
                    "- Image of text usage (WCAG 1.4.5)\n\n"
                    "Only flag issues you can actually observe in the "
                    "visual rendering. Rate confidence as high only when "
                    "the violation is clearly visible."
                ),
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            f"Audit this page ({page_url}) for visual "
                            "accessibility issues."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # "high" detail so small text and thin focus
                            # rings are actually resolvable by the model.
                            "url": f"data:image/png;base64,{screenshot_b64}",
                            "detail": "high",
                        },
                    },
                ],
            },
        ],
        response_format=AccessibilityAudit,
    )
    return response.choices[0].message.parsed
Contrast Checking with Vision and Verification
GPT-4V can flag likely contrast issues, but for precise WCAG contrast-ratio validation, combine vision detection with programmatic verification.
from PIL import Image
import io
import math
def get_pixel_color(
    screenshot_bytes: bytes, x: int, y: int
) -> tuple[int, int, int]:
    """Return the (R, G, B) color of one pixel in an encoded screenshot.

    The image is normalized to RGB mode first: for palette ("P") or
    grayscale ("L") images, getpixel() returns a bare int, which would
    make the previous `[:3]` slice raise TypeError. convert("RGB") also
    drops any alpha channel, matching the old behavior for RGBA input.
    """
    img = Image.open(io.BytesIO(screenshot_bytes))
    return img.convert("RGB").getpixel((x, y))
def relative_luminance(r: int, g: int, b: int) -> float:
    """Relative luminance of an sRGB color, per WCAG 2.0.

    Each 0-255 channel is linearized (sRGB gamma removed) and the
    results are combined with the Rec. 709 luminance coefficients.
    """
    def _linear(channel: int) -> float:
        # Undo the sRGB transfer curve for a single channel.
        v = channel / 255.0
        return v / 12.92 if v <= 0.03928 else ((v + 0.055) / 1.055) ** 2.4

    weights = (0.2126, 0.7152, 0.0722)
    return sum(w * _linear(c) for w, c in zip(weights, (r, g, b)))


def contrast_ratio(
    fg: tuple[int, int, int], bg: tuple[int, int, int]
) -> float:
    """WCAG contrast ratio (1.0 to 21.0) between two sRGB colors."""
    lums = sorted(
        (relative_luminance(*fg), relative_luminance(*bg)), reverse=True
    )
    # The 0.05 flare term keeps the ratio finite for pure black.
    return (lums[0] + 0.05) / (lums[1] + 0.05)
def check_wcag_contrast(
    ratio: float, text_size: str = "normal"
) -> dict:
    """Evaluate a contrast ratio against the WCAG AA/AAA thresholds.

    Large text (18pt+, or 14pt+ bold) gets the relaxed thresholds
    (3.0 / 4.5); normal text requires 4.5 / 7.0.
    """
    if text_size == "large":
        aa_min, aaa_min = 3.0, 4.5
    else:
        aa_min, aaa_min = 4.5, 7.0
    return {
        "AA": ratio >= aa_min,
        "AAA": ratio >= aaa_min,
        "ratio": round(ratio, 2),
    }
Multi-Viewport Accessibility Testing
Accessibility issues often appear at specific viewport sizes. Test across breakpoints.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
# Breakpoints to audit. Responsive layouts often pass at one width and
# fail at another (overlapping elements, shrunken touch targets), so
# each viewport gets its own screenshot and audit.
VIEWPORTS = [
    {"name": "mobile", "width": 375, "height": 812},
    {"name": "tablet", "width": 768, "height": 1024},
    {"name": "desktop", "width": 1280, "height": 720},
    {"name": "large", "width": 1920, "height": 1080},
]
async def multi_viewport_audit(
    url: str,
) -> dict[str, AccessibilityAudit]:
    """Run accessibility audits at multiple viewport sizes.

    Args:
        url: The page to load and audit at each VIEWPORTS entry.

    Returns:
        Mapping of viewport name -> AccessibilityAudit, so issues that
        only appear at a given breakpoint can be attributed to it.
    """
    results: dict[str, AccessibilityAudit] = {}
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            for vp in VIEWPORTS:
                page = await browser.new_page(
                    viewport={"width": vp["width"], "height": vp["height"]}
                )
                try:
                    await page.goto(url, wait_until="networkidle")
                    screenshot = await page.screenshot(type="png")
                    b64 = base64.b64encode(screenshot).decode()
                    # NOTE(review): audit_accessibility is a blocking HTTP
                    # call inside an async function; consider wrapping it
                    # in asyncio.to_thread() if this runs on a
                    # latency-sensitive event loop.
                    audit = audit_accessibility(b64, f"{url} ({vp['name']})")
                    results[vp["name"]] = audit
                finally:
                    # Close the page even when goto/screenshot/audit
                    # raises, so one failing viewport doesn't leak pages.
                    await page.close()
        finally:
            # Previously the browser leaked on any exception in the loop.
            await browser.close()
    return results
Generating Compliance Reports
def generate_a11y_report(
    audits: dict[str, AccessibilityAudit]
) -> str:
    """Generate a Markdown WCAG compliance report from per-viewport audits."""
    # Flatten every audit into (viewport, issue) pairs in one pass.
    flattened = [
        (viewport, issue)
        for viewport, audit in audits.items()
        for issue in audit.issues
    ]
    # Bucket the pairs by WCAG criterion for grouped reporting.
    grouped: dict[str, list] = {}
    for viewport, issue in flattened:
        grouped.setdefault(issue.wcag_criterion, []).append((viewport, issue))
    # Level A and AA failures are the ones that block conformance.
    level_a_aa = sum(
        1 for _, issue in flattened if issue.severity in ("A", "AA")
    )
    lines = [
        "# WCAG Visual Accessibility Audit Report",
        "",
        f"**Total issues found:** {len(flattened)}",
        f"**Level A/AA violations:** {level_a_aa}",
        "",
    ]
    for criterion in sorted(grouped):
        lines.append(f"## {criterion}")
        for viewport, issue in grouped[criterion]:
            lines.append(
                f"- **[{viewport}]** {issue.description} "
                f"(Confidence: {issue.confidence})"
            )
            lines.append(f" - Fix: {issue.recommendation}")
        lines.append("")
    return "\n".join(lines)
Combining with axe-core
The strongest accessibility testing combines GPT Vision's visual analysis with axe-core's DOM analysis for comprehensive coverage.
async def comprehensive_audit(page, url: str) -> dict:
    """Combine axe-core DOM audit with GPT Vision visual audit.

    Args:
        page: A Playwright page already navigated to the target URL.
        url: The URL being audited (passed through to the vision report).

    Returns:
        dict with axe-core DOM violations, GPT Vision visual issues,
        and the vision audit's overall score.
    """
    # Inject axe-core with Playwright's dedicated helper: it resolves
    # only once the script has loaded. The previous page.evaluate()
    # approach passed a multi-statement string, which evaluate() treats
    # as a single expression and can reject.
    await page.add_script_tag(
        url="https://cdnjs.cloudflare.com/ajax/libs/axe-core/4.9.1/axe.min.js"
    )
    # Defensive wait in case script evaluation was deferred by the page.
    await page.wait_for_function("typeof axe !== 'undefined'")
    axe_results = await page.evaluate("axe.run()")
    # Run the vision audit on the same rendering axe-core just scanned.
    screenshot = await page.screenshot(type="png")
    b64 = base64.b64encode(screenshot).decode()
    vision_audit = audit_accessibility(b64, url)
    return {
        "dom_violations": axe_results["violations"],
        "visual_issues": vision_audit.issues,
        "combined_score": vision_audit.overall_score,
    }
FAQ
Can GPT Vision accurately measure contrast ratios?
GPT-4V provides qualitative contrast assessments — it can identify text that appears hard to read against its background. For precise contrast ratio measurements meeting WCAG's exact thresholds (4.5:1 for normal text, 3:1 for large text), use the pixel-sampling approach shown above. GPT Vision excels at finding suspect areas; programmatic checks provide exact ratios.
Which WCAG criteria can GPT Vision check that traditional tools cannot?
GPT Vision uniquely evaluates visual reading order, meaningful use of whitespace, visual hierarchy consistency, image-of-text detection, color-only information conveyance, and whether focus indicators are visually sufficient. Traditional DOM-based tools cannot assess these because they require understanding the rendered visual output.
How often should visual accessibility audits run?
Run them on every significant UI change in your CI pipeline. For ongoing monitoring, weekly runs catch gradual drift. Mobile viewport tests are especially important because responsive layouts often break accessibility at specific breakpoints where elements overlap or touch targets shrink below minimum sizes.
#Accessibility #WCAG #GPTVision #A11yTesting #ComplianceAudit #VisualAccessibility #WebAccessibility #AgenticAI
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available — no signup required.