Building a Web Scraping Agent with Playwright: Dynamic Content and JavaScript-Rendered Pages
Build a production-grade web scraping AI agent using Playwright that handles SPAs, infinite scroll, pagination, dynamic content loading, and basic anti-detection strategies.
Why Traditional Scraping Fails on Modern Websites
Traditional HTTP-based scraping with requests and BeautifulSoup sends a GET request and parses the HTML response. This works for static sites, but modern web applications render content with JavaScript — the initial HTML is often just a shell that loads data via API calls and renders it in the browser. SPAs built with React, Vue, or Angular deliver virtually no content in the initial HTML response.
Playwright solves this by running a real browser that executes JavaScript, renders the DOM, and waits for dynamic content to load. For AI agents that need to scrape data from modern websites, Playwright is one of the most reliable tools available.
Basic Page Scraping
Start with the fundamentals — navigating to a page and extracting content:
from playwright.sync_api import sync_playwright
def scrape_page(url: str) -> dict:
    """Scrape title, headings, paragraphs, and links from a single page.

    Launches headless Chromium, waits for network activity to settle so
    JavaScript-rendered content is present, then extracts text from the DOM.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        Dict with keys "title", "url", "headings", "paragraphs", "links".
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            # "networkidle" waits until no network requests for 500ms,
            # which lets SPA frameworks finish rendering.
            page.goto(url, wait_until="networkidle")
            data = {
                "title": page.title(),
                "url": page.url,
                "headings": [],
                "paragraphs": [],
                "links": [],
            }
            # Extract all headings. text_content() may return None for
            # elements with no text, so default to "" before strip().
            for heading in page.locator("h1, h2, h3").all():
                data["headings"].append({
                    "tag": heading.evaluate("el => el.tagName"),
                    "text": (heading.text_content() or "").strip(),
                })
            # Extract paragraphs
            for p_tag in page.locator("p").all():
                text = (p_tag.text_content() or "").strip()
                if len(text) > 20:  # Skip empty/short paragraphs
                    data["paragraphs"].append(text)
            # Extract links
            for link in page.locator("a[href]").all():
                data["links"].append({
                    "text": (link.text_content() or "").strip(),
                    "href": link.get_attribute("href"),
                })
        finally:
            # Close even if extraction raises, so the browser never leaks.
            browser.close()
        return data
# Demo: scrape a live page (performs real network I/O when executed).
result = scrape_page("https://example.com")
print(f"Title: {result['title']}")
print(f"Headings: {len(result['headings'])}")
print(f"Links: {len(result['links'])}")
Handling Infinite Scroll
Many modern sites use infinite scroll instead of pagination. Your scraping agent must scroll down to trigger content loading:
from playwright.sync_api import sync_playwright
def scrape_infinite_scroll(url: str, max_scrolls: int = 10) -> list:
    """Scroll an infinite-scroll page to load content, then extract items.

    Repeatedly scrolls to the bottom to trigger lazy loading, stopping when
    the document height stops growing or max_scrolls is reached.

    Args:
        url: Page to scrape.
        max_scrolls: Upper bound on scroll iterations (guards against pages
            that load content forever).

    Returns:
        List of {"title", "description"} dicts, one per ".item-card".
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(url, wait_until="networkidle")
            previous_height = 0
            for scroll_count in range(max_scrolls):
                # Get current scroll height; unchanged height means no new
                # content appeared after the previous scroll.
                current_height = page.evaluate("document.body.scrollHeight")
                if current_height == previous_height:
                    print(f"No new content after scroll {scroll_count}")
                    break
                # Scroll to bottom
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                # Wait for new content to load
                page.wait_for_timeout(2000)
                page.wait_for_load_state("networkidle")
                previous_height = current_height
                print(f"Scroll {scroll_count + 1}: height = {current_height}")
            # Extract all items after scrolling. Use .first so cards that
            # happen to contain several h3/p elements don't trip Playwright's
            # strict mode; guard against None from text_content().
            items = []
            for item in page.locator(".item-card").all():
                items.append({
                    "title": (item.locator("h3").first.text_content() or "").strip(),
                    "description": (item.locator("p").first.text_content() or "").strip(),
                })
            print(f"Total items scraped: {len(items)}")
        finally:
            # Close even on error so the browser process never leaks.
            browser.close()
        return items
Handling Pagination
For sites with traditional next/previous pagination:
See AI Voice Agents Handle Real Calls
Book a free demo or calculate how much you can save with AI voice automation.
from playwright.sync_api import sync_playwright
def scrape_paginated_site(base_url: str, max_pages: int = 5) -> list:
    """Scrape a site with classic next/previous pagination.

    Args:
        base_url: URL of the first results page.
        max_pages: Maximum number of pages to visit.

    Returns:
        List of {"title", "link", "page"} dicts across all visited pages.
    """
    all_items = []
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()
            page.goto(base_url, wait_until="networkidle")
            for page_num in range(max_pages):
                # Extract data from current page
                items = page.locator(".result-item").all()
                for item in items:
                    all_items.append({
                        "title": (item.locator(".title").first.text_content() or "").strip(),
                        "link": item.locator("a").first.get_attribute("href"),
                        "page": page_num + 1,
                    })
                print(f"Page {page_num + 1}: scraped {len(items)} items")
                # Try to find and click the next page button
                next_button = page.locator(
                    'a:has-text("Next"), button:has-text("Next"), '
                    '[aria-label="Next page"]'
                )
                # The combined selector can match several elements; calling
                # is_enabled()/click() on a multi-match locator raises a
                # strict-mode violation, so operate on .first explicitly.
                if next_button.count() == 0 or not next_button.first.is_enabled():
                    print("No more pages")
                    break
                next_button.first.click()
                page.wait_for_load_state("networkidle")
        finally:
            # Guarantee cleanup even if a selector or navigation fails.
            browser.close()
    return all_items
Waiting for Dynamic Content
JavaScript-rendered content requires explicit waiting strategies:
# NOTE: these snippets assume `page` (a Playwright Page) is already in scope.
# Wait for a specific element to appear (up to 15 seconds)
page.wait_for_selector(".data-loaded", timeout=15000)
# Wait for a loading spinner to disappear
page.wait_for_selector(".loading-spinner", state="hidden")
# Wait for a minimum number of items (nth is 0-based: index 9 = 10th item)
page.locator(".result-item").nth(9).wait_for(state="visible")
# Wait for a JavaScript condition evaluated in the page context
page.wait_for_function(
    "document.querySelectorAll('.result-item').length >= 10"
)
# Combine waits for robust content detection
def wait_for_content(page, selector, min_count=1, timeout=15000):
    """Wait until at least min_count elements matching selector exist.

    Args:
        page: Playwright Page to poll.
        selector: CSS selector for the target elements.
        min_count: Minimum number of matches required before returning.
        timeout: Maximum wait in milliseconds; a TimeoutError is raised
            if the condition is not met in time.
    """
    # Pass the selector and count via `arg` instead of interpolating them
    # into the JS source: an f-string breaks (or allows JS injection) when
    # the selector contains quotes or backslashes.
    page.wait_for_function(
        "([sel, n]) => document.querySelectorAll(sel).length >= n",
        arg=[selector, min_count],
        timeout=timeout,
    )
Anti-Detection Strategies
Websites may block automated browsers. These techniques help your agent avoid basic detection:
from playwright.sync_api import sync_playwright
def create_stealth_browser():
    """Start Playwright and return a browser/context tuned to look less
    like an automated browser.

    Returns:
        Tuple of (playwright, browser, context). The caller owns all three
        handles and is responsible for closing them when finished.
    """
    playwright = sync_playwright().start()

    launch_flags = ["--disable-blink-features=AutomationControlled"]
    browser = playwright.chromium.launch(headless=True, args=launch_flags)

    desktop_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    context = browser.new_context(
        viewport={"width": 1920, "height": 1080},
        user_agent=desktop_user_agent,
        locale="en-US",
        timezone_id="America/New_York",
    )

    # Hide the navigator.webdriver flag before any page script can read it.
    context.add_init_script(
        """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )
    return playwright, browser, context
# Caller owns the returned handles; close context/browser and stop `p`
# when finished (nothing here does it automatically).
p, browser, context = create_stealth_browser()
page = context.new_page()
page.goto("https://example.com")
# Add random delays between actions
import random
import time
def human_delay(min_ms=500, max_ms=2000):
    """Sleep for a random duration between min_ms and max_ms milliseconds."""
    delay_seconds = random.uniform(min_ms, max_ms) / 1000.0
    time.sleep(delay_seconds)
Complete Web Scraping Agent
Here is a production-ready scraping agent class:
import json
import random
import time
from dataclasses import dataclass
from playwright.sync_api import sync_playwright, Page
@dataclass
class ScrapedItem:
    """One scraped record: title, link URL, body text, and scrape metadata."""
    # Visible title text of the item.
    title: str
    # href of the item's primary link ("" when absent).
    url: str
    # Main body/description text.
    content: str
    # Free-form extras; currently holds "scraped_at" (epoch seconds).
    metadata: dict
class ScrapingAgent:
    """Paginated scraping agent driven by a selector configuration dict.

    Config keys:
        item_selector: CSS selector for each record container (required).
        title_sel:     selector for the title inside a record (default "h3").
        content_sel:   selector for the body text (default "p").
        next_sel:      selector for the next-page control
                       (default 'a:has-text("Next")').
    """

    def __init__(self, headless: bool = True):
        self.headless = headless
        # Accumulates across scrape() calls; clear it for a fresh run.
        self.items: list[ScrapedItem] = []

    def _human_delay(self):
        """Pause 0.5-1.5 s to mimic human pacing between page actions."""
        time.sleep(random.uniform(0.5, 1.5))

    def _extract_items(self, page: Page, config: dict) -> list:
        """Build ScrapedItem records from the current page.

        Malformed cards are skipped (logged) rather than aborting the page.
        """
        items = []
        for el in page.locator(config["item_selector"]).all():
            try:
                # .first avoids strict-mode violations when a card contains
                # several matches; `or ""` guards None from text_content().
                item = ScrapedItem(
                    title=(el.locator(
                        config.get("title_sel", "h3")
                    ).first.text_content() or "").strip(),
                    url=el.locator("a").first.get_attribute("href") or "",
                    content=(el.locator(
                        config.get("content_sel", "p")
                    ).first.text_content() or "").strip(),
                    metadata={"scraped_at": time.time()},
                )
                items.append(item)
            except Exception as e:
                print(f" Skipping item: {e}")
        return items

    def scrape(self, url: str, config: dict, max_pages: int = 3):
        """Scrape up to max_pages pages starting at url.

        Args:
            url: First page to load.
            config: Selector configuration (see class docstring).
            max_pages: Maximum pages to visit.

        Returns:
            The accumulated list of ScrapedItem records (self.items).
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.headless)
            try:
                context = browser.new_context(
                    viewport={"width": 1920, "height": 1080}
                )
                page = context.new_page()
                # Navigate once; subsequent pages are reached by clicking
                # the next-page control, which triggers its own navigation.
                page.goto(url, wait_until="networkidle")
                for page_num in range(max_pages):
                    new_items = self._extract_items(page, config)
                    self.items.extend(new_items)
                    print(f"Page {page_num + 1}: {len(new_items)} items")
                    self._human_delay()
                    next_btn = page.locator(config.get(
                        "next_sel", 'a:has-text("Next")'
                    ))
                    if next_btn.count() == 0:
                        break
                    next_btn.first.click()
                    page.wait_for_load_state("networkidle")
                context.close()
            finally:
                # Ensure the browser process is reaped even if a selector
                # or navigation step raises mid-scrape.
                browser.close()
        return self.items
# Usage
# The config maps the generic agent onto this site's specific DOM structure.
agent = ScrapingAgent()
results = agent.scrape(
    "https://example.com/listings",
    config={
        "item_selector": ".listing-card",
        "title_sel": ".listing-title",
        "content_sel": ".listing-description",
        "next_sel": ".pagination .next",
    },
    max_pages=5,
)
FAQ
How do I scrape content from pages that require login?
Use Playwright's storage state feature. First, manually log in and save the authentication state with context.storage_state(path="auth.json"). In subsequent runs, load the saved state with browser.new_context(storage_state="auth.json"). The context will have all cookies and local storage from the authenticated session. This avoids logging in on every run.
How do I handle pages that load content in response to scroll events?
Use a scroll-and-wait loop. After each scroll action (page.evaluate("window.scrollBy(0, 500)")), wait for new elements to appear using page.wait_for_function() with a count check. Set a maximum scroll count to prevent infinite loops on pages that continuously load content. Monitor the scroll height — if it stops increasing, all content has loaded.
What are the legal considerations for web scraping?
Web scraping legality varies by jurisdiction. In general, scraping publicly accessible data is more defensible than scraping behind login walls. Always check a site's robots.txt file and terms of service. Rate-limit your requests to avoid impacting the site's performance. Do not scrape personal data without consent under GDPR or similar regulations. When in doubt, consult a legal professional.
#WebScraping #Playwright #DynamicContent #SPAScraping #AIAgents #InfiniteScroll #DataExtraction
CallSphere Team
Expert insights on AI voice agents and customer communication automation.
Try CallSphere AI Voice Agents
See how AI voice agents work for your industry. Live demo available -- no signup required.