ontheway

2026-05-20 08:41:59 +02:00
parent 4b2e1455c9
commit e356879542
12 changed files with 816 additions and 14 deletions
--- a/report/scripts/capture_search_har.py
+++ b/report/scripts/capture_search_har.py
@@ -1 +0,0 @@
-../.noroff-env/har_capture/capture_search_har.py
--- a/report/scripts/capture_search_har.py
+++ b/report/scripts/capture_search_har.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""
+Capture HAR files for search engine result pages using Playwright.
+
+This script starts a fresh browser context per search engine, navigates to the
+configured search URL, and writes one HAR file per engine.
+
+It can use Tor if you pass --proxy socks5://HOST:PORT.
+"""
+
+from __future__ import annotations
+
+import argparse
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import quote_plus
+
+from playwright.sync_api import sync_playwright
+
+
+# URL template for each supported search engine. The "{query}" placeholder is
+# filled in with str.format() using the URL-encoded search query. Insertion
+# order matters: --engines defaults to list(SEARCH_ENGINES), so this is also
+# the default capture order.
+SEARCH_ENGINES: dict[str, str] = {
+    "google": "https://www.google.com/search?q={query}",
+    "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web",
+    "bing": "https://www.bing.com/search?q={query}",
+    "brave": "https://search.brave.com/search?q={query}",
+}
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse the command-line options for a HAR capture run.
+
+    Returns:
+        The parsed namespace with the attributes ``query``, ``engines``,
+        ``output_dir``, ``proxy``, ``browser``, ``timeout_ms``,
+        ``wait_until`` and ``headed``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Capture search result HAR files with Playwright."
+    )
+    parser.add_argument(
+        "--query",
+        required=True,
+        help="Search query to use, for example: 'migraine symptoms'",
+    )
+    parser.add_argument(
+        "--engines",
+        nargs="+",
+        default=list(SEARCH_ENGINES),
+        choices=sorted(SEARCH_ENGINES),
+        help="Search engines to capture. Default: all",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("data"),
+        # %(default)s is expanded by argparse, so the help text always shows
+        # the real default instead of a hand-written path that can go stale.
+        help="Directory where HAR files are written. Default: %(default)s",
+    )
+    parser.add_argument(
+        "--proxy",
+        default="",
+        help="Optional proxy, for example: socks5://127.0.0.1:9050",
+    )
+    parser.add_argument(
+        "--browser",
+        choices=["firefox", "chromium"],
+        default="firefox",
+        help="Browser engine to use. Default: firefox",
+    )
+    parser.add_argument(
+        "--timeout-ms",
+        type=int,
+        default=45000,
+        help="Navigation timeout in milliseconds. Default: 45000",
+    )
+    parser.add_argument(
+        "--wait-until",
+        choices=["load", "domcontentloaded", "networkidle"],
+        default="networkidle",
+        help="Navigation wait condition. Default: networkidle",
+    )
+    parser.add_argument(
+        "--headed",
+        action="store_true",
+        help="Show the browser window instead of running headless.",
+    )
+    return parser.parse_args()
+
+
+def safe_filename_part(value: str) -> str:
+    """Turn free text into a short, lowercase fragment for use in a file name.
+
+    Alphanumeric characters are kept (lowercased); spaces, hyphens and
+    underscores each become an underscore; every other character is dropped.
+    Leading and trailing underscores are stripped and the result is cut to at
+    most 80 characters. If nothing is left, ``"query"`` is returned.
+    """
+    separators = (" ", "-", "_")
+    pieces = (
+        ch if ch.isalnum() else "_"
+        for ch in value.lower()
+        if ch.isalnum() or ch in separators
+    )
+    slug = "".join(pieces).strip("_")
+    if not slug:
+        return "query"
+    return slug[:80]
+
+
+def main() -> None:
+    """Capture one HAR file per requested search engine.
+
+    Parses the command line, creates the output directory if needed, launches
+    the selected browser (optionally through the given proxy) and, for each
+    engine, opens a fresh browser context that records a HAR while the search
+    URL is loaded. The path of each written HAR file is printed.
+
+    Navigation errors (for example a timeout) are not caught here: they
+    propagate to the caller after the context and the browser are closed.
+    """
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    encoded_query = quote_plus(args.query)
+    query_part = safe_filename_part(args.query)
+    # Taken once so every HAR file from this run shares the same prefix.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    with sync_playwright() as playwright:
+        browser_launcher = getattr(playwright, args.browser)
+        launch_options = {"headless": not args.headed}
+
+        if args.proxy:
+            launch_options["proxy"] = {"server": args.proxy}
+
+        browser = browser_launcher.launch(**launch_options)
+
+        try:
+            for engine in args.engines:
+                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
+                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"
+
+                # A new context per engine keeps cookies and storage from one
+                # engine out of the next engine's recording.
+                context = browser.new_context(
+                    record_har_path=str(har_path),
+                    record_har_content="embed",
+                )
+                try:
+                    page = context.new_page()
+                    page.set_default_timeout(args.timeout_ms)
+                    page.goto(
+                        search_url,
+                        wait_until=args.wait_until,
+                        timeout=args.timeout_ms,
+                    )
+                finally:
+                    # Playwright writes the HAR only when the context is
+                    # closed, so close it even if navigation failed; the
+                    # traffic recorded up to that point is still saved.
+                    context.close()
+
+                print(f"{engine}: {har_path}")
+        finally:
+            browser.close()
+
+
+if __name__ == "__main__":
+    main()
				`@@ -1 +0,0 @@`
				`../.noroff-env/har_capture/capture_search_har.py`