#!/usr/bin/env python3
"""
Capture HAR files for search engine result pages using Playwright.

This script starts a fresh browser context per search engine, navigates to the
configured search URL, and writes one HAR file per engine.

It can use Tor if you pass --proxy socks5://HOST:PORT.
"""

from __future__ import annotations

import argparse
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus

from playwright.sync_api import sync_playwright


# Search URL template per supported engine. Each template has a single
# ``{query}`` placeholder that is filled with the URL-encoded search query.
# Insertion order matters: it is the default capture order for --engines.
SEARCH_ENGINES = dict(
    google="https://www.google.com/search?q={query}",
    duckduckgo="https://duckduckgo.com/?q={query}&ia=web",
    bing="https://www.bing.com/search?q={query}",
    brave="https://search.brave.com/search?q={query}",
)


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for a HAR capture run.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default), argparse reads ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave as before.

    Returns:
        Namespace with the attributes ``query``, ``engines``, ``output_dir``,
        ``proxy``, ``browser``, ``timeout_ms``, ``wait_until`` and ``headed``.
    """
    parser = argparse.ArgumentParser(
        description="Capture search result HAR files with Playwright."
    )
    parser.add_argument(
        "--query",
        required=True,
        help="Search query to use, for example: 'migraine symptoms'",
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=list(SEARCH_ENGINES),
        choices=sorted(SEARCH_ENGINES),
        help="Search engines to capture. Default: all",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data"),
        # The help text must name the real default above; it previously
        # advertised a different directory than the one actually used.
        help="Directory where HAR files are written. Default: data",
    )
    parser.add_argument(
        "--proxy",
        default="",
        help="Optional proxy, for example: socks5://127.0.0.1:9050",
    )
    parser.add_argument(
        "--browser",
        choices=["firefox", "chromium"],
        default="firefox",
        help="Browser engine to use. Default: firefox",
    )
    parser.add_argument(
        "--timeout-ms",
        type=int,
        default=45000,
        help="Navigation timeout in milliseconds. Default: 45000",
    )
    parser.add_argument(
        "--wait-until",
        choices=["load", "domcontentloaded", "networkidle"],
        default="networkidle",
        help="Navigation wait condition. Default: networkidle",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show the browser window instead of running headless.",
    )
    return parser.parse_args(argv)


def safe_filename_part(value: str) -> str:
    """Turn free text into a short token that is safe to embed in a filename.

    The text is lower-cased; alphanumeric characters (per ``str.isalnum``)
    are kept, spaces, hyphens and underscores each become ``_``, and every
    other character is dropped. The result is limited to 80 characters and
    never starts or ends with an underscore.

    Args:
        value: Arbitrary text, for example the raw search query.

    Returns:
        The sanitized token, or ``"query"`` when nothing usable remains.
    """
    separators = {" ", "-", "_"}
    cleaned = "".join(
        char if char.isalnum() else "_"
        for char in value.lower()
        if char.isalnum() or char in separators
    ).strip("_")
    # Truncation can cut right after a separator, so strip the tail again;
    # otherwise the token could end in "_" despite the strip above.
    return cleaned[:80].rstrip("_") or "query"


def main() -> None:
    """Capture one HAR file per requested search engine.

    Parses the command line, creates the output directory, launches the
    selected browser (optionally through a proxy) and, for every engine,
    opens a fresh browser context that records a HAR while the search URL
    is loaded. The path of each written HAR is printed to stdout.

    A Playwright error for one engine (for example a navigation timeout) is
    reported on stderr and the remaining engines are still captured.

    Raises:
        SystemExit: After all engines were attempted, if at least one of
            them failed, so the process exits with a non-zero status.
    """
    # Imported locally so this function brings its own extra dependencies.
    import sys

    from playwright.sync_api import Error as PlaywrightError

    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    encoded_query = quote_plus(args.query)
    query_part = safe_filename_part(args.query)
    # One timestamp for the whole run so all files of a run share a prefix.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    failures = []

    with sync_playwright() as playwright:
        browser_launcher = getattr(playwright, args.browser)
        launch_options = {"headless": not args.headed}

        if args.proxy:
            launch_options["proxy"] = {"server": args.proxy}

        browser = browser_launcher.launch(**launch_options)

        try:
            for engine in args.engines:
                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"

                try:
                    context = browser.new_context(
                        record_har_path=str(har_path),
                        record_har_content="embed",
                    )
                    try:
                        page = context.new_page()
                        page.set_default_timeout(args.timeout_ms)
                        page.goto(
                            search_url,
                            wait_until=args.wait_until,
                            timeout=args.timeout_ms,
                        )
                    finally:
                        # Playwright writes the HAR when the context is
                        # closed, so close it even if navigation failed.
                        context.close()
                except PlaywrightError as exc:
                    # Keep going: one blocked or slow engine should not
                    # discard the captures of the others.
                    failures.append(engine)
                    print(f"{engine}: capture failed: {exc}", file=sys.stderr)
                    continue

                print(f"{engine}: {har_path}")
        finally:
            browser.close()

    if failures:
        raise SystemExit(f"HAR capture failed for: {', '.join(failures)}")


# Run the capture only when executed as a script, not when imported.
if __name__ == "__main__":
    main()