#!/usr/bin/env python3 """ Capture HAR files for search engine result pages using Playwright. This script starts a fresh browser context per search engine, navigates to the configured search URL, and writes one HAR file per engine. It can use Tor if you pass --proxy socks5://HOST:PORT. """ from __future__ import annotations import argparse from datetime import datetime from pathlib import Path from urllib.parse import quote_plus from playwright.sync_api import sync_playwright SEARCH_ENGINES = { "google": "https://www.google.com/search?q={query}", "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web", "bing": "https://www.bing.com/search?q={query}", "brave": "https://search.brave.com/search?q={query}", } def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Capture search result HAR files with Playwright." ) parser.add_argument( "--query", required=True, help="Search query to use, for example: 'migraine symptoms'", ) parser.add_argument( "--engines", nargs="+", default=list(SEARCH_ENGINES), choices=sorted(SEARCH_ENGINES), help="Search engines to capture. Default: all", ) parser.add_argument( "--output-dir", type=Path, default=Path("data"), help="Directory where HAR files are written. Default: ../data/har_capture", ) parser.add_argument( "--proxy", default="", help="Optional proxy, for example: socks5://127.0.0.1:9050", ) parser.add_argument( "--browser", choices=["firefox", "chromium"], default="firefox", help="Browser engine to use. Default: firefox", ) parser.add_argument( "--timeout-ms", type=int, default=45000, help="Navigation timeout in milliseconds. Default: 45000", ) parser.add_argument( "--wait-until", choices=["load", "domcontentloaded", "networkidle"], default="networkidle", help="Navigation wait condition. Default: networkidle", ) parser.add_argument( "--headed", action="store_true", help="Show the browser window instead of running headless.", ) return parser.parse_args() def safe_filename_part(value: str) -> str: keep = [] for char in value.lower(): if char.isalnum(): keep.append(char) elif char in {" ", "-", "_"}: keep.append("_") cleaned = "".join(keep).strip("_") return cleaned[:80] or "query" def main() -> None: args = parse_args() args.output_dir.mkdir(parents=True, exist_ok=True) encoded_query = quote_plus(args.query) query_part = safe_filename_part(args.query) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") with sync_playwright() as playwright: browser_launcher = getattr(playwright, args.browser) launch_options = {"headless": not args.headed} if args.proxy: launch_options["proxy"] = {"server": args.proxy} browser = browser_launcher.launch(**launch_options) try: for engine in args.engines: search_url = SEARCH_ENGINES[engine].format(query=encoded_query) har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har" context = browser.new_context( record_har_path=str(har_path), record_har_content="embed", ) page = context.new_page() page.set_default_timeout(args.timeout_ms) page.goto(search_url, wait_until=args.wait_until, timeout=args.timeout_ms) context.close() print(f"{engine}: {har_path}") finally: browser.close() if __name__ == "__main__": main()