Testing Multilingual AI Agents: Evaluation Across Languages and Cultures
Build comprehensive multilingual test suites that validate AI agent quality across languages with automated quality checks, native speaker reviews, and regression detection.
The Multilingual Testing Gap
Most AI agent test suites are written entirely in English. The agent is tested in the language its developers speak and then shipped to global markets, so quality issues in other languages surface only when users complain. This reactive approach is expensive and damaging to brand trust.
A proper multilingual test strategy treats every supported language as a first-class citizen with its own test suite, quality benchmarks, and regression tracking. The goal is to catch localization bugs before they reach users.
Multilingual Test Case Structure
Define test cases that are language-parameterized so the same scenario runs across all supported languages.
from dataclasses import dataclass, field
from typing import List, Optional, Dict
from enum import Enum
class TestCategory(Enum):
    """Axis of agent quality that a multilingual test case exercises."""

    FUNCTIONAL = "functional"  # does the agent accomplish the task correctly?
    LINGUISTIC = "linguistic"  # grammar, fluency, natural phrasing
    CULTURAL = "cultural"      # formality level, culturally appropriate content
    FORMATTING = "formatting"  # locale formatting: dates, numbers, currency
@dataclass
class MultilingualTestCase:
    """One test scenario, parameterized over every supported language.

    The scenario is described once; inputs and expectations are stored per
    language, keyed by language code (e.g. "en", "ja"), so the same test
    can run against the agent in each supported language.
    """

    test_id: str                              # stable identifier, e.g. "TC001_greeting"
    category: TestCategory                    # quality axis this case exercises
    description: str                          # human-readable intent of the test
    input_messages: Dict[str, str]            # lang -> user message sent to the agent
    expected_behaviors: Dict[str, List[str]]  # lang -> behaviors the response must exhibit
    prohibited_content: Dict[str, List[str]]  # lang -> substrings that must not appear in the response
    metadata: dict = field(default_factory=dict)  # free-form extra information
# Example test case: culturally appropriate greetings in five languages.
# Each dict is keyed by language code; built separately for readability.
_greeting_inputs = {
    "en": "Hello",
    "ja": "こんにちは",
    "ar": "مرحبا",
    "de": "Hallo",
    "es": "Hola",
}

_greeting_expectations = {
    "en": ["responds in English", "uses professional greeting"],
    "ja": ["responds in Japanese", "uses polite form (desu/masu)"],
    "ar": ["responds in Arabic", "uses formal greeting"],
    "de": ["responds in German", "uses Sie form"],
    "es": ["responds in Spanish", "uses appropriate formality"],
}

# NOTE(review): "tu-form" is a Romance-language concept, not Japanese —
# confirm this is the intended prohibited marker for "ja".
_greeting_prohibitions = {
    "ja": ["informal slang", "tu-form"],
    "ar": ["casual abbreviations"],
}

test_greeting = MultilingualTestCase(
    test_id="TC001_greeting",
    category=TestCategory.CULTURAL,
    description="Agent should greet user appropriately for their culture",
    input_messages=_greeting_inputs,
    expected_behaviors=_greeting_expectations,
    prohibited_content=_greeting_prohibitions,
)
Automated Test Runner
Build a runner that executes test cases against the agent across all languages and collects structured results.
import asyncio
import time
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import List
@dataclass
class TestResult:
    """Structured outcome of running one test case in one language."""

    test_id: str               # id of the MultilingualTestCase that produced this result
    language: str              # language code the test ran in
    passed: bool               # True only when checks_failed is empty
    agent_response: str        # raw response text returned by the agent
    checks_passed: List[str]   # descriptions of checks that succeeded
    checks_failed: List[str]   # descriptions of checks that failed ("MISSING:"/"PROHIBITED:")
    execution_time_ms: float   # duration of the agent call only (not evaluation), in ms
    timestamp: str             # ISO-8601 timestamp of result creation
class MultilingualTestRunner:
    """Executes MultilingualTestCase scenarios against a live agent endpoint.

    Each (test case, language) pair yields a structured TestResult so suite
    results can be aggregated, reported, and compared against baselines.
    """

    def __init__(self, agent_endpoint: str, evaluator):
        self.agent_endpoint = agent_endpoint  # URL the agent accepts POSTs on
        self.evaluator = evaluator            # exposes async check_behavior(response, behavior, lang)

    async def run_test(self, test_case: MultilingualTestCase, lang: str) -> TestResult:
        """Run one test case in one language and evaluate the response.

        Falls back to the English input when the case has no message for
        `lang`, so a partially translated case still exercises the agent.
        """
        input_msg = test_case.input_messages.get(lang, test_case.input_messages.get("en", ""))
        # Monotonic clock for latency: immune to system clock adjustments,
        # unlike the original subtraction of two wall-clock datetimes.
        start = time.perf_counter()
        response = await self._call_agent(input_msg, lang)
        elapsed = (time.perf_counter() - start) * 1000

        expected = test_case.expected_behaviors.get(lang, [])
        prohibited = test_case.prohibited_content.get(lang, [])
        checks_passed = []
        checks_failed = []
        # Positive checks: each expected behavior is judged by the evaluator.
        for behavior in expected:
            if await self.evaluator.check_behavior(response, behavior, lang):
                checks_passed.append(behavior)
            else:
                checks_failed.append(f"MISSING: {behavior}")
        # Negative checks: case-insensitive substring match against the response.
        for banned in prohibited:
            if banned.lower() in response.lower():
                checks_failed.append(f"PROHIBITED: {banned}")
            else:
                checks_passed.append(f"correctly avoids: {banned}")

        return TestResult(
            test_id=test_case.test_id,
            language=lang,
            passed=len(checks_failed) == 0,
            agent_response=response,
            checks_passed=checks_passed,
            checks_failed=checks_failed,
            execution_time_ms=elapsed,
            # Timezone-aware UTC: datetime.utcnow() is deprecated (3.12+)
            # and produces naive timestamps that are easy to misinterpret.
            timestamp=datetime.now(timezone.utc).isoformat(),
        )

    async def run_suite(
        self, test_cases: List[MultilingualTestCase], languages: List[str]
    ) -> List[TestResult]:
        """Run every (test case, language) combination concurrently.

        Languages a case has no input for are skipped (no English fallback
        here), so suite-level numbers reflect deliberate coverage only.
        """
        tasks = [
            self.run_test(tc, lang)
            for tc in test_cases
            for lang in languages
            if lang in tc.input_messages
        ]
        return await asyncio.gather(*tasks)

    async def _call_agent(self, message: str, lang: str) -> str:
        """POST the message to the agent endpoint and return its reply text."""
        import httpx  # local import keeps the hard dependency out of module import

        async with httpx.AsyncClient() as client:
            resp = await client.post(
                self.agent_endpoint,
                json={"message": message, "language": lang},
                timeout=30.0,
            )
            # Fail loudly on HTTP errors instead of trying to JSON-parse an
            # error body and silently returning "".
            resp.raise_for_status()
            return resp.json().get("response", "")
LLM-Based Quality Evaluation
Use an LLM to evaluate whether agent responses meet linguistic and behavioral expectations.
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
from openai import AsyncOpenAI
class LLMQualityEvaluator:
    """LLM-as-judge evaluator for linguistic and behavioral expectations."""

    def __init__(self, client: AsyncOpenAI):
        self.client = client  # shared async OpenAI client

    async def check_behavior(self, response: str, expected_behavior: str, lang: str) -> bool:
        """Return True when the judge says the response exhibits the behavior.

        The judge is instructed to answer only YES or NO; the check is
        lenient about trailing punctuation ("YES.") that models sometimes
        add anyway, and tolerates a None message content.
        """
        result = await self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a multilingual QA evaluator. Given an AI agent response "
                        f"in {lang}, determine if it exhibits the expected behavior. "
                        "Reply with only YES or NO."
                    ),
                },
                {
                    "role": "user",
                    "content": f"Response: {response}\n\nExpected behavior: {expected_behavior}",
                },
            ],
            temperature=0.0,
        )
        # content can be None (e.g. refusals); `or ""` avoids AttributeError.
        answer = (result.choices[0].message.content or "").strip().upper()
        return answer.startswith("YES")

    async def evaluate_fluency(self, text: str, lang: str) -> float:
        """Rate linguistic fluency from 0.0 to 1.0.

        Unparseable judge output scores 0.0; numeric output is clamped into
        [0.0, 1.0] since models occasionally stray outside the asked range.
        """
        result = await self.client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": (
                        f"Rate the fluency and naturalness of this {lang} text "
                        "on a scale from 0.0 to 1.0. Consider grammar, vocabulary, "
                        "and natural phrasing. Return only the numeric score."
                    ),
                },
                {"role": "user", "content": text},
            ],
            temperature=0.0,
        )
        raw = (result.choices[0].message.content or "").strip()
        try:
            score = float(raw)
        except ValueError:
            return 0.0  # judge returned prose instead of a number
        return max(0.0, min(1.0, score))
Regression Detection
Track quality scores over time to detect regressions when prompts change or models are updated.
from typing import Dict, List
from dataclasses import dataclass
@dataclass
class QualityBaseline:
    """Snapshot of per-language quality metrics used as a comparison point."""

    language: str       # language code this baseline covers
    pass_rate: float    # fraction of tests passed, 0.0 to 1.0
    avg_fluency: float  # mean fluency score (not populated by set_baseline yet)
    test_count: int     # number of results the baseline was computed from


class RegressionDetector:
    """Compares new test runs against stored per-language baselines.

    A language is flagged as regressed when its pass rate drops by more
    than `threshold` (absolute) below the recorded baseline.
    """

    def __init__(self, threshold: float = 0.05):
        self.threshold = threshold  # max tolerated absolute pass-rate drop
        self._baselines: Dict[str, QualityBaseline] = {}

    # TestResult annotations are quoted (forward references) so this module
    # does not require TestResult to be imported at definition time.
    def set_baseline(self, lang: str, results: "List[TestResult]") -> None:
        """Record the current pass rate for `lang` as its baseline."""
        passed = sum(1 for r in results if r.passed)
        self._baselines[lang] = QualityBaseline(
            language=lang,
            pass_rate=passed / len(results) if results else 0.0,
            avg_fluency=0.0,  # fluency aggregation is left to the caller for now
            test_count=len(results),
        )

    def check_regression(self, lang: str, new_results: "List[TestResult]") -> dict:
        """Compare new results for `lang` against its recorded baseline.

        Returns a report dict. Status is "no_baseline" when set_baseline was
        never called for this language, "no_results" when new_results is
        empty (previously a ZeroDivisionError), otherwise "ok"/"regression".
        """
        baseline = self._baselines.get(lang)
        if not baseline:
            return {"status": "no_baseline", "language": lang}
        if not new_results:
            # Guard: the original divided by len(new_results) and crashed on
            # an empty run; report the condition explicitly instead.
            return {"status": "no_results", "language": lang}
        new_pass_rate = sum(1 for r in new_results if r.passed) / len(new_results)
        delta = baseline.pass_rate - new_pass_rate
        regressed = delta > self.threshold
        return {
            "language": lang,
            "baseline_pass_rate": baseline.pass_rate,
            "current_pass_rate": new_pass_rate,
            "delta": delta,
            "regressed": regressed,
            "status": "regression" if regressed else "ok",
        }
FAQ
How many test cases do I need per language?
Aim for at least 30-50 test cases per language covering functional scenarios (correct answers), linguistic quality (grammar, fluency), cultural appropriateness (formality, prohibited content), and formatting (dates, numbers, currency). High-traffic languages should have more comprehensive suites.
Can I use machine translation to generate test inputs?
For functional tests, machine-translated inputs work reasonably well since you are testing the agent's ability to understand intent. For linguistic and cultural tests, use native speakers to write inputs that include natural phrasing, colloquialisms, and edge cases that machine translation would not produce.
How often should I run multilingual test suites?
Run the full suite on every model update or system prompt change. Run a smoke subset (top 10 critical tests per language) on every deployment. Schedule weekly full runs even without code changes, because underlying model behavior can drift with provider updates.
#MultilingualTesting #QualityAssurance #AIEvaluation #TestAutomation #Localization #AgenticAI #LearnAI #AIEngineering
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.