ontheway
This commit is contained in:
@@ -1 +0,0 @@
|
||||
../.noroff-env/har_capture/capture_search_har.py
|
||||
130
report/scripts/capture_search_har.py
Executable file
130
report/scripts/capture_search_har.py
Executable file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Capture HAR files for search engine result pages using Playwright.
|
||||
|
||||
This script starts a fresh browser context per search engine, navigates to the
|
||||
configured search URL, and writes one HAR file per engine.
|
||||
|
||||
It can use Tor if you pass --proxy socks5://HOST:PORT.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote_plus
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
|
||||
# Search-result URL template per engine key. The "{query}" placeholder is
# filled in with the URL-encoded search query before navigation.
SEARCH_ENGINES = dict(
    google="https://www.google.com/search?q={query}",
    duckduckgo="https://duckduckgo.com/?q={query}&ia=web",
    bing="https://www.bing.com/search?q={query}",
    brave="https://search.brave.com/search?q={query}",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for one HAR capture run.

    Returns:
        Namespace with ``query``, ``engines``, ``output_dir``, ``proxy``,
        ``browser``, ``timeout_ms``, ``wait_until`` and ``headed``.
    """
    parser = argparse.ArgumentParser(
        description="Capture search result HAR files with Playwright."
    )
    parser.add_argument(
        "--query",
        required=True,
        help="Search query to use, for example: 'migraine symptoms'",
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=list(SEARCH_ENGINES),
        choices=sorted(SEARCH_ENGINES),
        help="Search engines to capture. Default: all",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data"),
        # The help text must state the real default (Path("data")).
        help="Directory where HAR files are written. Default: data",
    )
    parser.add_argument(
        "--proxy",
        default="",
        help="Optional proxy, for example: socks5://127.0.0.1:9050",
    )
    parser.add_argument(
        "--browser",
        choices=["firefox", "chromium"],
        default="firefox",
        help="Browser engine to use. Default: firefox",
    )
    parser.add_argument(
        "--timeout-ms",
        type=int,
        default=45000,
        help="Navigation timeout in milliseconds. Default: 45000",
    )
    parser.add_argument(
        "--wait-until",
        choices=["load", "domcontentloaded", "networkidle"],
        default="networkidle",
        help="Navigation wait condition. Default: networkidle",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show the browser window instead of running headless.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def safe_filename_part(value: str) -> str:
    """Turn free text into a short fragment that is safe inside a filename.

    The text is lower-cased; alphanumeric characters are kept; spaces,
    hyphens and underscores become ``_``; every other character is dropped.
    Leading/trailing underscores are stripped and the result is cut to 80
    characters. If nothing is left, ``"query"`` is returned.
    """
    separators = (" ", "-", "_")
    pieces = [
        "_" if ch in separators else ch
        for ch in value.lower()
        if ch.isalnum() or ch in separators
    ]
    slug = "".join(pieces).strip("_")[:80]
    return slug if slug else "query"
|
||||
|
||||
|
||||
def main() -> None:
    """Capture one HAR file per selected search engine.

    A single browser is launched (optionally through the ``--proxy``
    server) and a fresh browser context is created for every engine, so
    each engine gets its own recording. Files are named
    ``<timestamp>_<engine>_<query>.har`` inside ``--output-dir``.

    Raises:
        Whatever Playwright raises for a failed launch or navigation (for
        example a navigation timeout). The context of the failing engine is
        still closed first so its partial HAR is written to disk.
    """
    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    encoded_query = quote_plus(args.query)
    query_part = safe_filename_part(args.query)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    with sync_playwright() as playwright:
        browser_launcher = getattr(playwright, args.browser)
        launch_options = {"headless": not args.headed}

        if args.proxy:
            launch_options["proxy"] = {"server": args.proxy}

        browser = browser_launcher.launch(**launch_options)

        try:
            for engine in args.engines:
                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"

                context = browser.new_context(
                    record_har_path=str(har_path),
                    record_har_content="embed",
                )
                try:
                    page = context.new_page()
                    page.set_default_timeout(args.timeout_ms)
                    page.goto(search_url, wait_until=args.wait_until, timeout=args.timeout_ms)
                finally:
                    # Playwright writes the HAR only when the context is
                    # closed, so close it even if the navigation failed.
                    context.close()

                print(f"{engine}: {har_path}")
        finally:
            browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
352
report/scripts/har_entries_to_csv.py
Executable file
352
report/scripts/har_entries_to_csv.py
Executable file
@@ -0,0 +1,352 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert HAR files to readable CSV files.
|
||||
|
||||
Output 1: har_entries.csv
|
||||
One row per entry in log.entries. This is the most direct way to inspect
|
||||
the HAR structure: each { ... } inside entries[] becomes one CSV row.
|
||||
|
||||
Output 2: har_summary.csv
|
||||
One row per HAR file with simple totals.
|
||||
|
||||
The script does not remove cookie values or URLs. Treat the output as sensitive.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import json
|
||||
from pathlib import Path
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
|
||||
|
||||
# Column order of the per-entry CSV (one row per item of log.entries).
ENTRY_FIELDS = [
    # Origin of the row.
    "har_filename", "search_engine", "entry_index",
    # Timing and request line.
    "startedDateTime", "time_ms", "method",
    # URL and its parts.
    "url", "domain", "path", "query_text",
    # Response status.
    "status", "statusText",
    # Cookies on the request and on the response.
    "request_cookie_count", "request_cookie_names", "request_cookie_values",
    "response_cookie_count", "response_cookie_names", "response_cookie_values",
    # Query-string parameters.
    "query_param_count", "query_param_names", "query_param_values",
    # Header counts, body presence and sizes.
    "request_header_count", "response_header_count", "post_data_present",
    "request_body_size", "response_body_size", "response_content_size",
    "transferred_bytes_approx",
    # Derived flags.
    "is_third_party_domain", "tracking_hint",
]
|
||||
|
||||
|
||||
# Column order of the summary CSV (one row per HAR file).
SUMMARY_FIELDS = [
    # Which capture the row describes.
    "har_filename", "search_engine", "query_text",
    # Request totals.
    "requests_total", "unique_domains", "third_party_requests",
    "request_cookies_total", "response_cookies_total", "query_params_total",
    "post_requests_total", "tracking_hint_requests",
    # Volume and timing.
    "transferred_kb_approx", "page_load_ms",
    # Responses per HTTP status class.
    "status_2xx", "status_3xx", "status_4xx", "status_5xx",
]
|
||||
|
||||
|
||||
# Lower-case substrings; a request whose domain, path or URL contains any of
# them is flagged with a tracking hint. Matching is plain substring search,
# so short words such as "log" or "ads" also hit longer words.
TRACKING_WORDS = [
    "ads", "adservice", "analytics", "collect", "conversion",
    "doubleclick", "event", "gen_204", "googleadservices", "improving",
    "log", "metrics", "pagead", "telemetry", "track",
]
|
||||
|
||||
|
||||
def detect_search_engine(har_path: Path) -> str:
    """Guess the search engine a HAR file belongs to from its filename.

    Files named ``<date>_<time>_<engine>_<query>.har`` are split on ``_``
    and the first token that names a known engine wins. This keeps a query
    that merely mentions another engine (``..._bing_google_maps.har``) from
    being misclassified. Other filenames fall back to a substring search.

    Args:
        har_path: Path of the HAR file; only its name is inspected.

    Returns:
        ``"DuckDuckGo"``, ``"Google"``, ``"Bing"``, ``"Brave"`` or
        ``"Unknown"``.
    """
    # Order matters only for the substring fallback below.
    known_engines = {
        "duckduckgo": "DuckDuckGo",
        "google": "Google",
        "bing": "Bing",
        "brave": "Brave",
    }

    for token in har_path.stem.lower().split("_"):
        if token in known_engines:
            return known_engines[token]

    name = har_path.name.lower()
    for key, label in known_engines.items():
        if key in name:
            return label

    return "Unknown"
|
||||
|
||||
|
||||
def read_har(path: Path) -> dict:
    """Load a HAR file and return its parsed JSON content.

    The file is decoded as UTF-8; undecodable bytes are replaced instead of
    raising.
    """
    with path.open(encoding="utf-8", errors="replace") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def entries_from_har(har_data: dict) -> list[dict]:
    """Return the ``log.entries`` list of a parsed HAR document.

    A missing or ``null`` ``log`` or ``entries`` value yields an empty list
    instead of raising.
    """
    log = har_data.get("log") or {}
    return log.get("entries") or []
|
||||
|
||||
|
||||
def pages_from_har(har_data: dict) -> list[dict]:
    """Return the ``log.pages`` list of a parsed HAR document.

    A missing or ``null`` ``log`` or ``pages`` value yields an empty list
    instead of raising.
    """
    log = har_data.get("log") or {}
    return log.get("pages") or []
|
||||
|
||||
|
||||
def cookie_names(cookies: list[dict]) -> str:
    """Return all cookie names as one ``|``-separated string.

    A cookie without a ``name`` key contributes an empty string.
    """
    names = [cookie.get("name", "") for cookie in cookies]
    return "|".join(names)
|
||||
|
||||
|
||||
def cookie_values(cookies: list[dict]) -> str:
    """Return all cookie values as one ``|``-separated string.

    A cookie without a ``value`` key contributes an empty string.
    """
    values = [cookie.get("value", "") for cookie in cookies]
    return "|".join(values)
|
||||
|
||||
|
||||
def query_names(query_items: list[dict]) -> str:
    """Return all query-parameter names as one ``|``-separated string.

    An item without a ``name`` key contributes an empty string.
    """
    names = [item.get("name", "") for item in query_items]
    return "|".join(names)
|
||||
|
||||
|
||||
def query_values(query_items: list[dict]) -> str:
    """Return all query-parameter values as one ``|``-separated string.

    An item without a ``value`` key contributes an empty string.
    """
    values = [item.get("value", "") for item in query_items]
    return "|".join(values)
|
||||
|
||||
|
||||
def positive_number(value: object) -> int:
    """Return ``value`` truncated to ``int`` when it is a number above zero.

    Anything else (non-numbers, zero, negative numbers) yields ``0``.
    """
    if not isinstance(value, (int, float)):
        return 0
    return int(value) if value > 0 else 0
|
||||
|
||||
|
||||
def approximate_transferred_bytes(entry: dict) -> int:
    """Estimate the bytes sent and received for one HAR entry.

    The estimate is request headers + request body + response headers +
    response body. Sizes that are missing, non-numeric or not positive
    count as zero.

    ``response.bodySize`` and ``response.content.size`` both describe the
    response body, so they are not added together: ``content.size`` is used
    only as a fallback when ``bodySize`` is absent or negative (unknown).

    Args:
        entry: One item of ``log.entries``.

    Returns:
        The approximate byte count as a non-negative integer.
    """
    request = entry.get("request", {}) or {}
    response = entry.get("response", {}) or {}
    content = response.get("content", {}) or {}

    body_size = response.get("bodySize")
    if isinstance(body_size, (int, float)) and body_size >= 0:
        # A known size, including a legitimate 0 for cached responses.
        response_body = int(body_size)
    else:
        response_body = positive_number(content.get("size"))

    return (
        positive_number(request.get("headersSize"))
        + positive_number(request.get("bodySize"))
        + positive_number(response.get("headersSize"))
        + response_body
    )
|
||||
|
||||
|
||||
def extract_query_text_from_url(url: str) -> str:
    """Return the first value of the ``q`` query parameter of ``url``.

    Blank values are kept, so ``?q=`` yields ``""``; a URL without a ``q``
    parameter also yields ``""``.
    """
    params = parse_qs(urlparse(url).query, keep_blank_values=True)
    return next(iter(params.get("q", [])), "")
|
||||
|
||||
|
||||
def has_tracking_hint(domain: str, path: str, url: str) -> str:
    """Return ``"yes"`` if the request looks tracking-related, else ``"no"``.

    The domain, path and URL are joined and lower-cased, then searched for
    any substring listed in ``TRACKING_WORDS``.
    """
    haystack = f"{domain} {path} {url}".lower()
    for word in TRACKING_WORDS:
        if word in haystack:
            return "yes"
    return "no"
|
||||
|
||||
|
||||
def max_page_load_ms(entries: list[dict], pages: list[dict]) -> float:
    """Return the largest timing found in a HAR, in milliseconds.

    Candidates are every page's ``pageTimings.onLoad`` and every entry's
    ``time``. Missing or non-numeric values are ignored; the result is never
    below ``0.0``.
    """
    candidates = [
        (page.get("pageTimings", {}) or {}).get("onLoad", -1) for page in pages
    ]
    candidates += [entry.get("time", -1) for entry in entries]

    longest = 0.0
    for candidate in candidates:
        if isinstance(candidate, (int, float)) and candidate > longest:
            longest = float(candidate)
    return longest
|
||||
|
||||
|
||||
def main_domain_for_engine(search_engine: str) -> str:
    """Return the substring that marks a domain as first-party for an engine.

    Args:
        search_engine: Display name of the engine, e.g. ``"Google"``.

    Returns:
        A domain fragment such as ``"google."`` or ``"bing.com"``, or ``""``
        for an engine that is not known.
    """
    first_party_markers = {
        "Google": "google.",
        "DuckDuckGo": "duckduckgo.com",
        "Bing": "bing.com",
        "Brave": "brave.com",
    }
    return first_party_markers.get(search_engine, "")
|
||||
|
||||
|
||||
def make_entry_rows(har_path: Path) -> list[dict]:
    """Build one CSV row dict per ``log.entries`` item of a HAR file.

    Args:
        har_path: HAR file to read. Its name is also used to detect the
            search engine.

    Returns:
        Row dicts in entry order; ``entry_index`` starts at 1. A request is
        marked third-party when a first-party marker is known for the
        engine, the request has a domain, and the marker does not occur in
        that domain.
    """
    har_entries = entries_from_har(read_har(har_path))
    engine = detect_search_engine(har_path)
    first_party_marker = main_domain_for_engine(engine)

    def to_row(position: int, har_entry: dict) -> dict:
        # Flatten a single HAR entry into the flat CSV column layout.
        req = har_entry.get("request", {}) or {}
        resp = har_entry.get("response", {}) or {}
        resp_content = resp.get("content", {}) or {}

        url = req.get("url", "")
        url_parts = urlparse(url)
        host = url_parts.netloc.lower()

        sent_cookies = req.get("cookies", []) or []
        received_cookies = resp.get("cookies", []) or []
        params = req.get("queryString", []) or []

        is_third_party = bool(
            first_party_marker and host and first_party_marker not in host
        )

        return {
            "har_filename": har_path.name,
            "search_engine": engine,
            "entry_index": position,
            "startedDateTime": har_entry.get("startedDateTime", ""),
            "time_ms": har_entry.get("time", ""),
            "method": req.get("method", ""),
            "url": url,
            "domain": host,
            "path": url_parts.path,
            "query_text": extract_query_text_from_url(url),
            "status": resp.get("status", ""),
            "statusText": resp.get("statusText", ""),
            "request_cookie_count": len(sent_cookies),
            "request_cookie_names": cookie_names(sent_cookies),
            "request_cookie_values": cookie_values(sent_cookies),
            "response_cookie_count": len(received_cookies),
            "response_cookie_names": cookie_names(received_cookies),
            "response_cookie_values": cookie_values(received_cookies),
            "query_param_count": len(params),
            "query_param_names": query_names(params),
            "query_param_values": query_values(params),
            "request_header_count": len(req.get("headers", []) or []),
            "response_header_count": len(resp.get("headers", []) or []),
            "post_data_present": "yes" if req.get("postData") else "no",
            "request_body_size": req.get("bodySize", ""),
            "response_body_size": resp.get("bodySize", ""),
            "response_content_size": resp_content.get("size", ""),
            "transferred_bytes_approx": approximate_transferred_bytes(har_entry),
            "is_third_party_domain": "yes" if is_third_party else "no",
            "tracking_hint": has_tracking_hint(host, url_parts.path, url),
        }

    return [
        to_row(position, har_entry)
        for position, har_entry in enumerate(har_entries, start=1)
    ]
|
||||
|
||||
|
||||
def make_summary_row(har_path: Path, entry_rows: list[dict]) -> dict:
    """Aggregate the entry rows of one HAR file into a single summary row.

    Args:
        har_path: The HAR file the rows came from; it is read again to
            obtain the page timings.
        entry_rows: Rows produced for that file by ``make_entry_rows``.

    Returns:
        A dict keyed by the ``SUMMARY_FIELDS`` column names. ``query_text``
        is the first non-empty query text among the rows.
    """
    har_data = read_har(har_path)
    load_ms = max_page_load_ms(entries_from_har(har_data), pages_from_har(har_data))

    distinct_domains = set()
    per_status_class = dict.fromkeys((2, 3, 4, 5), 0)
    first_query = ""
    third_party = posts = hinted = 0
    sent_cookies = received_cookies = params = total_bytes = 0

    for row in entry_rows:
        if row["domain"]:
            distinct_domains.add(row["domain"])

        if not first_query and row["query_text"]:
            first_query = row["query_text"]

        code = row["status"]
        # Only integer status codes in the 2xx-5xx classes are counted.
        if isinstance(code, int) and code // 100 in per_status_class:
            per_status_class[code // 100] += 1

        total_bytes += int(row["transferred_bytes_approx"])
        sent_cookies += int(row["request_cookie_count"])
        received_cookies += int(row["response_cookie_count"])
        params += int(row["query_param_count"])

        if row["is_third_party_domain"] == "yes":
            third_party += 1
        if row["method"] == "POST":
            posts += 1
        if row["tracking_hint"] == "yes":
            hinted += 1

    summary = {
        "har_filename": har_path.name,
        "search_engine": detect_search_engine(har_path),
        "query_text": first_query,
        "requests_total": len(entry_rows),
        "unique_domains": len(distinct_domains),
        "third_party_requests": third_party,
        "request_cookies_total": sent_cookies,
        "response_cookies_total": received_cookies,
        "query_params_total": params,
        "post_requests_total": posts,
        "tracking_hint_requests": hinted,
        "transferred_kb_approx": round(total_bytes / 1024, 2),
        "page_load_ms": round(load_ms, 2),
    }
    for status_class, count in per_status_class.items():
        summary[f"status_{status_class}xx"] = count
    return summary
|
||||
|
||||
|
||||
def write_csv(path: Path, fieldnames: list[str], rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV with a header line.

    Args:
        path: Output file; it is overwritten if it exists.
        fieldnames: Column names, in output order.
        rows: One dict per CSV row, keyed by column name.
    """
    with path.open("w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=fieldnames)
        out.writeheader()
        for record in rows:
            out.writerow(record)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the HAR-to-CSV converter.

    Returns:
        Namespace with the ``Path`` attributes ``input_dir``,
        ``entries_output`` and ``summary_output``.
    """
    parser = argparse.ArgumentParser(description="Convert HAR files to readable CSV files.")

    # All three options are paths that differ only in flag, default and help.
    path_options = (
        ("--input-dir", Path("data"), "Folder with .har files. Default: data"),
        (
            "--entries-output",
            Path("har_entries.csv"),
            "CSV with one row per log.entries item. Default: har_entries.csv",
        ),
        (
            "--summary-output",
            Path("har_summary.csv"),
            "CSV with one row per HAR file. Default: har_summary.csv",
        ),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default_path, help=help_text)

    return parser.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
    """Convert every ``*.har`` file in the input folder into two CSV files.

    One CSV receives a row per HAR entry across all files; the other
    receives one summary row per HAR file. Files are processed in sorted
    order.

    Raises:
        SystemExit: If the input folder contains no ``*.har`` file.
    """
    options = parse_args()
    sources = sorted(options.input_dir.glob("*.har"))
    if not sources:
        raise SystemExit(f"No HAR files found in {options.input_dir}")

    collected_entries = []
    collected_summaries = []
    for source in sources:
        rows_for_file = make_entry_rows(source)
        collected_entries += rows_for_file
        collected_summaries.append(make_summary_row(source, rows_for_file))

    write_csv(options.entries_output, ENTRY_FIELDS, collected_entries)
    write_csv(options.summary_output, SUMMARY_FIELDS, collected_summaries)

    print(f"Wrote {len(collected_entries)} entry rows to {options.entries_output}")
    print(f"Wrote {len(collected_summaries)} summary rows to {options.summary_output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
55
report/scripts/many_search.sh
Executable file
55
report/scripts/many_search.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
# Run capture_search_har for a fixed list of queries in four setups:
# Chromium and Firefox, each with and without the SOCKS proxy below.
set -euo pipefail

readonly TOR_PROXY="socks5://127.0.0.1:9050"

QUERIES=(
  "weather oslo"
  "migraine symptoms"
  "vitamin d deficiency"
  "running shoes"
  "coffee grinder"
  "best laptop for students"
  "electric car charging"
  "cheap flights to london"
  "home insurance"
  "python list tutorial"
  "banana bread recipe"
  "news norway"
)

# run_capture QUERY BROWSER OUTPUT_DIR [PROXY]
# Runs one headed capture; --proxy is passed only when PROXY is given.
run_capture() {
  local search_query="$1" browser="$2" output_dir="$3" proxy="${4:-}"
  local -a cmd=(
    capture_search_har
    --query "$search_query"
    --browser "$browser"
    --wait-until load
    --headed
    --output-dir "$output_dir"
    --timeout-ms 60000
  )

  if [[ -n "$proxy" ]]; then
    cmd+=(--proxy "$proxy")
  fi

  "${cmd[@]}"
}

for query in "${QUERIES[@]}"; do
  echo "Running query: $query"

  run_capture "$query" chromium normal_chromium
  run_capture "$query" chromium tor_chromium "$TOR_PROXY"
  run_capture "$query" firefox tor_firefox "$TOR_PROXY"
  run_capture "$query" firefox normal_firefox
done
|
||||
82
report/scripts/power_query_etl.txt
Normal file
82
report/scripts/power_query_etl.txt
Normal file
@@ -0,0 +1,82 @@
|
||||
// Load the per-request CSV of the tor_chromium capture folder, type its
// columns, and tag every row with search engine, proxy and browser.
let
    // Map a HAR filename to an engine label (case-sensitive substring test,
    // checked in the order bing, google, duckduckgo, brave).
    EngineFromFilename = (fileName) =>
        if Text.Contains(fileName, "bing") then "Bing"
        else if Text.Contains(fileName, "google") then "Google"
        else if Text.Contains(fileName, "duckduckgo") then "DuckDuckGo"
        else if Text.Contains(fileName, "brave") then "Brave"
        else "Unknown",

    RawCsv = Csv.Document(
        Web.Contents(
            "https://example.sharepoint.com/.../tor_chromium/har_entries.csv"
        ),
        [Delimiter = ",", Columns = 30, QuoteStyle = QuoteStyle.None]
    ),

    WithHeaders = Table.PromoteHeaders(RawCsv, [PromoteAllScalars = true]),

    // NOTE(review): time_ms is kept as text here; confirm whether a numeric
    // type is wanted before aggregating on it.
    Typed = Table.TransformColumnTypes(
        WithHeaders,
        {
            {"har_filename", type text},
            {"search_engine", type text},
            {"entry_index", Int64.Type},
            {"startedDateTime", type datetime},
            {"time_ms", type text},
            {"method", type text},
            {"url", type text},
            {"domain", type text},
            {"path", type text},
            {"query_text", type text},
            {"status", Int64.Type},
            {"statusText", type text},
            {"request_cookie_count", Int64.Type},
            {"response_cookie_count", Int64.Type},
            {"query_param_count", Int64.Type},
            {"request_header_count", Int64.Type},
            {"response_header_count", Int64.Type},
            {"tracking_hint", type text}
        },
        "en"
    ),

    WithSearchEngine = Table.AddColumn(
        Typed,
        "SearchEngine",
        each EngineFromFilename([har_filename])
    ),

    // This query covers one fixed setup, so Proxy and Browser are constants.
    WithProxyUntyped = Table.AddColumn(WithSearchEngine, "Proxy", each "Tor"),
    WithProxy = Table.TransformColumnTypes(WithProxyUntyped, {{"Proxy", type text}}),

    WithBrowserUntyped = Table.AddColumn(WithProxy, "Browser", each "Chromium"),
    WithBrowser = Table.TransformColumnTypes(WithBrowserUntyped, {{"Browser", type text}})
in
    WithBrowser
|
||||
9
report/scripts/power_query_merge.txt
Normal file
9
report/scripts/power_query_merge.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
// Append the four per-setup summary queries into one table.
let
    SummaryTables = {
        har_summary_normal_chromium,
        har_summary_normal_firefox,
        har_summary_tor_chromium,
        har_summary_tor_firefox
    },
    Combined = Table.Combine(SummaryTables)
in
    Combined
|
||||
Reference in New Issue
Block a user