This commit is contained in:
2026-05-20 08:41:59 +02:00
parent 4b2e1455c9
commit e356879542
12 changed files with 816 additions and 14 deletions

View File

@@ -1 +0,0 @@
../.noroff-env/har_capture/capture_search_har.py

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Capture HAR files for search engine result pages using Playwright.
This script starts a fresh browser context per search engine, navigates to the
configured search URL, and writes one HAR file per engine.
It can use Tor if you pass --proxy socks5://HOST:PORT.
"""
from __future__ import annotations
import argparse
from datetime import datetime
from pathlib import Path
from urllib.parse import quote_plus
from playwright.sync_api import sync_playwright
# Maps each supported engine name to its search-results URL template.
# The "{query}" placeholder is filled in with str.format() using the
# already URL-encoded query string; the keys double as the valid values
# for the --engines command-line option.
SEARCH_ENGINES = {
    "google": "https://www.google.com/search?q={query}",
    "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web",
    "bing": "https://www.bing.com/search?q={query}",
    "brave": "https://search.brave.com/search?q={query}",
}
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the HAR capture script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point
            relies on; passing an explicit list allows programmatic use.

    Returns:
        The populated namespace with the attributes ``query``, ``engines``,
        ``output_dir``, ``proxy``, ``browser``, ``timeout_ms``,
        ``wait_until`` and ``headed``.
    """
    parser = argparse.ArgumentParser(
        description="Capture search result HAR files with Playwright."
    )
    parser.add_argument(
        "--query",
        required=True,
        help="Search query to use, for example: 'migraine symptoms'",
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=list(SEARCH_ENGINES),
        choices=sorted(SEARCH_ENGINES),
        help="Search engines to capture. Default: all",
    )
    # Defaults below are rendered through argparse's %(default)s so the help
    # text cannot drift from the real value (the output directory help used
    # to advertise "../data/har_capture" while the default was "data").
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data"),
        help="Directory where HAR files are written. Default: %(default)s",
    )
    parser.add_argument(
        "--proxy",
        default="",
        help="Optional proxy, for example: socks5://127.0.0.1:9050",
    )
    parser.add_argument(
        "--browser",
        choices=["firefox", "chromium"],
        default="firefox",
        help="Browser engine to use. Default: %(default)s",
    )
    parser.add_argument(
        "--timeout-ms",
        type=int,
        default=45000,
        help="Navigation timeout in milliseconds. Default: %(default)s",
    )
    parser.add_argument(
        "--wait-until",
        choices=["load", "domcontentloaded", "networkidle"],
        default="networkidle",
        help="Navigation wait condition. Default: %(default)s",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show the browser window instead of running headless.",
    )
    return parser.parse_args(argv)
def safe_filename_part(value: str) -> str:
    """Turn an arbitrary string into a short, filename-friendly token.

    The value is lower-cased; alphanumeric characters are kept, spaces,
    hyphens and underscores become ``_``, and every other character is
    dropped. The result is limited to 80 characters and never starts or
    ends with an underscore.

    Args:
        value: Free-form text, e.g. the search query.

    Returns:
        The sanitized token, or ``"query"`` when nothing usable remains.
    """
    separators = {" ", "-", "_"}
    cleaned = "".join(
        char if char.isalnum() else "_"
        for char in value.lower()
        if char.isalnum() or char in separators
    ).strip("_")
    # Strip again after truncating: the 80-character cut can land right
    # after a separator and would otherwise leave a trailing underscore.
    return cleaned[:80].strip("_") or "query"
def main() -> None:
    """Capture one HAR file per selected search engine.

    Parses the command line, launches the requested browser (optionally
    through a proxy) and, for every engine, opens a fresh browser context
    with HAR recording enabled, navigates to the search URL and closes the
    context again. Files are named
    ``<timestamp>_<engine>_<sanitized query>.har`` inside the output
    directory.

    A navigation failure for one engine is reported on stderr and does not
    prevent the remaining engines from being captured.

    Raises:
        SystemExit: With a non-zero status if at least one engine failed.
    """
    # Function-scope imports keep the additions local to this entry point.
    import sys

    from playwright.sync_api import Error as PlaywrightError

    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    encoded_query = quote_plus(args.query)
    query_part = safe_filename_part(args.query)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Repeating an engine on the command line would record twice into the
    # same HAR path; keep the first occurrence and preserve the order.
    engines = list(dict.fromkeys(args.engines))
    failed = []

    with sync_playwright() as playwright:
        browser_launcher = getattr(playwright, args.browser)
        launch_options = {"headless": not args.headed}
        if args.proxy:
            launch_options["proxy"] = {"server": args.proxy}
        browser = browser_launcher.launch(**launch_options)
        try:
            for engine in engines:
                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
                har_path = (
                    args.output_dir / f"{timestamp}_{engine}_{query_part}.har"
                )
                context = browser.new_context(
                    record_har_path=str(har_path),
                    record_har_content="embed",
                )
                error = None
                try:
                    page = context.new_page()
                    page.set_default_timeout(args.timeout_ms)
                    page.goto(
                        search_url,
                        wait_until=args.wait_until,
                        timeout=args.timeout_ms,
                    )
                except PlaywrightError as exc:
                    # Includes navigation timeouts, which are likely when
                    # waiting for "networkidle" on busy result pages.
                    error = exc
                finally:
                    # Playwright writes the HAR when the context is closed,
                    # so close it even when navigation failed.
                    context.close()

                if error is None:
                    print(f"{engine}: {har_path}")
                else:
                    failed.append(engine)
                    print(
                        f"{engine}: capture failed ({error}); "
                        f"partial HAR: {har_path}",
                        file=sys.stderr,
                    )
        finally:
            browser.close()

    if failed:
        sys.exit("Capture failed for: " + ", ".join(failed))
# Run the capture only when executed as a script, not when imported.
if __name__ == "__main__":
    main()