ontheway
This commit is contained in:
@@ -1 +0,0 @@
|
||||
../.noroff-env/har_capture/capture_search_har.py
|
||||
130
report/scripts/capture_search_har.py
Executable file
130
report/scripts/capture_search_har.py
Executable file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Capture HAR files for search engine result pages using Playwright.
|
||||
|
||||
This script starts a fresh browser context per search engine, navigates to the
|
||||
configured search URL, and writes one HAR file per engine.
|
||||
|
||||
It can use Tor if you pass --proxy socks5://HOST:PORT.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
|
||||
# Search URL template per supported engine. Each template contains a single
# ``{query}`` placeholder that is filled with the URL-encoded search query.
# Insertion order matters: it is the default capture order for ``--engines``.
SEARCH_ENGINES: dict[str, str] = {
    "google": "https://www.google.com/search?q={query}",
    "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web",
    "bing": "https://www.bing.com/search?q={query}",
    "brave": "https://search.brave.com/search?q={query}",
}
|
||||
|
||||
|
||||
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the HAR capture script.

    Args:
        argv: Argument list to parse. When ``None`` (the default), argparse
            reads the arguments from ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``query``, ``engines``, ``output_dir``,
        ``proxy``, ``browser``, ``timeout_ms``, ``wait_until`` and ``headed``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        description="Capture search result HAR files with Playwright."
    )
    parser.add_argument(
        "--query",
        required=True,
        help="Search query to use, for example: 'migraine symptoms'",
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=list(SEARCH_ENGINES),
        choices=sorted(SEARCH_ENGINES),
        help="Search engines to capture. Default: all",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data"),
        # The help text previously advertised "../data/har_capture", which did
        # not match the actual default above.
        help="Directory where HAR files are written. Default: data",
    )
    parser.add_argument(
        "--proxy",
        default="",
        help="Optional proxy, for example: socks5://127.0.0.1:9050",
    )
    parser.add_argument(
        "--browser",
        choices=["firefox", "chromium"],
        default="firefox",
        help="Browser engine to use. Default: firefox",
    )
    parser.add_argument(
        "--timeout-ms",
        type=int,
        default=45000,
        help="Navigation timeout in milliseconds. Default: 45000",
    )
    parser.add_argument(
        "--wait-until",
        choices=["load", "domcontentloaded", "networkidle"],
        default="networkidle",
        help="Navigation wait condition. Default: networkidle",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show the browser window instead of running headless.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def safe_filename_part(value: str) -> str:
    """Return a lowercase, filesystem-friendly fragment derived from *value*.

    Characters for which ``str.isalnum()`` is true are kept (after
    lowercasing); spaces, hyphens and underscores become ``_``; every other
    character is dropped. The result is limited to 80 characters and never
    starts or ends with an underscore.

    Args:
        value: Arbitrary text, e.g. the raw search query.

    Returns:
        The cleaned fragment, or ``"query"`` if nothing usable remains.
    """
    separators = {" ", "-", "_"}
    mapped = "".join(
        char if char.isalnum() else "_"
        for char in value.lower()
        if char.isalnum() or char in separators
    )
    # Strip again after truncating: cutting at 80 characters can otherwise
    # leave a dangling underscore at the end of the fragment.
    cleaned = mapped.strip("_")[:80].rstrip("_")
    return cleaned or "query"
|
||||
|
||||
|
||||
def main() -> None:
    """Capture one HAR file per selected search engine.

    Parses the command-line arguments, creates the output directory if
    needed, launches the chosen browser (optionally through a proxy) and, for
    each engine, opens a fresh browser context that records a HAR file while
    navigating to the engine's search URL. The path of each successfully
    captured HAR file is printed to stdout.

    Any navigation error (for example a timeout) propagates to the caller
    after the current context and the browser have been closed.
    """
    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    encoded_query = quote_plus(args.query)
    query_part = safe_filename_part(args.query)
    # One timestamp for the whole run so all HAR files share a common prefix.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    with sync_playwright() as playwright:
        browser_launcher = getattr(playwright, args.browser)
        launch_options = {"headless": not args.headed}
        if args.proxy:
            launch_options["proxy"] = {"server": args.proxy}

        browser = browser_launcher.launch(**launch_options)
        try:
            for engine in args.engines:
                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"

                context = browser.new_context(
                    record_har_path=str(har_path),
                    record_har_content="embed",
                )
                try:
                    page = context.new_page()
                    page.set_default_timeout(args.timeout_ms)
                    page.goto(
                        search_url,
                        wait_until=args.wait_until,
                        timeout=args.timeout_ms,
                    )
                finally:
                    # Playwright only saves the HAR when the context is
                    # closed, so close it even if navigation failed; the
                    # partial capture is otherwise lost.
                    context.close()

                print(f"{engine}: {har_path}")
        finally:
            browser.close()
|
||||
|
||||
|
||||
# Run the capture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user