ontheway

writingmethod
2026-05-20 08:41:59 +02:00 · 2026-05-15 14:30:58 +02:00
13 changed files with 965 additions and 18 deletions
--- a/report/.vscode/ltex.hiddenFalsePositives.en-GB.txt
+++ b/report/.vscode/ltex.hiddenFalsePositives.en-GB.txt
@@ -0,0 +1 @@
+{"rule":"WANT_TO_NN","sentence":"^\\QThe main priority is the variable Trackinghints, as the work tries to identity correspondanses between trackings and cookies.\\E$"}
--- a/report/.vscode/settings.json
+++ b/report/.vscode/settings.json
@@ -28,6 +28,10 @@
    }
  ],

+  "ltex.language": "en-GB",
+  "ltex.enabled": [
+    "latex"
+  ],
  "latex-workshop.latex.clean.fileTypes": [
    "*.aux",
    "*.bbl",
--- a/report/main.tex
+++ b/report/main.tex
@@ -5,7 +5,8 @@
 \usepackage[T1]{fontenc}
 \usepackage[english]{babel}
 \usepackage{lmodern}
-
+\usepackage{enumitem}
+\usepackage{multicol}
 \usepackage[a4paper,margin=2.5cm]{geometry}
 \usepackage{setspace}
 \onehalfspacing
@@ -45,6 +46,7 @@
 \author{Tord-Vincent Heggland}
 \date{\today}

+
 \begin{document}
 \pagenumbering{roman}
 \maketitle
@@ -61,20 +63,18 @@
 \setcounter{page}{1}

 \input{sections/01_introduction.tex}
+\input{sections/01A_theory.tex}
 \input{sections/02_method.tex}
-
 \input{sections/03_results.tex}
-%\begin{figure}[h]
- %   \centering
-  %  \includegraphics[width=\textwidth]{figures/Figure1.png}
-   % \caption{Total video game sales by genre in North America (millions of units).}
-    %\label{fig:Figure1}
-%\end{figure}
 \input{sections/04_discussion.tex}
 \input{sections/05_conclusion.tex}

-\clearpage
-\cite{noauthor_video_nodate}
 \clearpage
 \printbibliography[title={References}]
+
+\clearpage 
+\appendix
+\renewcommand{\thepage}{A-\arabic{page}}
+\setcounter{page}{1}
+\input{sections/99_appendix.tex}
 \end{document}
--- a/report/references.bib
+++ b/report/references.bib
@@ -1,9 +1,72 @@

-@online{noauthor_video_nodate,
-	title = {Video Game Sales Dataset Updated -Extra Feat},
-	url = {https://www.kaggle.com/datasets/ibriiee/video-games-sales-dataset-2022-updated-extra-feat},
-	abstract = {Uncover the Gaming Industry Trends with the Most Comprehensive Sales Data},
-	urldate = {2026-04-21},
+
+@online{TOR,
+	title = {About Tor Browser},
+	url = {https://support.torproject.org/tor-browser/getting-started/about-tor-browser/},
+	abstract = {Tor Browser is a privacy-focused web browser that routes your traffic through the Tor network, hiding your real {IP} address, preventing tracking, and protecting you against surveillance and censorship. Tor Browser uses the Tor network to protect your privacy and anonymity.},
+	titleaddon = {Support},
+	author = {{Tor Project, Inc}},
+	urldate = {2026-05-15},
 	langid = {english},
-	file = {Snapshot:/home/tvh/snap/zotero-snap/common/Zotero/storage/C5LJ5QMG/video-games-sales-dataset-2022-updated-extra-feat.html:text/html},
+	file = {Snapshot:/home/tvh/snap/zotero-snap/common/Zotero/storage/R5P9688K/about-tor-browser.html:text/html},
+}
+
+@online{HAR,
+	title = {Network request list — Firefox Source Docs documentation},
+	url = {https://firefox-source-docs.mozilla.org/devtools-user/network_monitor/request_list/index.html},
+	urldate = {2026-05-15},
+	file = {Network request list — Firefox Source Docs documentation:/home/tvh/snap/zotero-snap/common/Zotero/storage/P7S338MU/index.html:text/html},
+}
+
+@online{Playwright,
+	title = {Installation {\textbar} Playwright Python},
+	url = {https://playwright.dev/python/docs/intro},
+	abstract = {Introduction},
+	urldate = {2026-05-15},
+	langid = {english},
+	file = {Snapshot:/home/tvh/snap/zotero-snap/common/Zotero/storage/M3HT6FNN/intro.html:text/html},
+}
+
+@online{VENV,
+	title = {12. Virtual Environments and Packages},
+	url = {https://docs.python.org/3/tutorial/venv.html},
+	abstract = {Introduction: Python applications will often use packages and modules that don’t come as part of the standard library. Applications will sometimes need a specific version of a library, because the ...},
+	titleaddon = {Python documentation},
+	urldate = {2026-05-15},
+	langid = {english},
+	file = {Snapshot:/home/tvh/snap/zotero-snap/common/Zotero/storage/QEN5QM2A/venv.html:text/html},
+}
+
+@article{PDF,
+	title = {Book: Module 7. Lessons and Tasks},
+	author = {{Noroff}},
+	langid = {english},
+	file = {PDF:/home/tvh/snap/zotero-snap/common/Zotero/storage/RVWQE24L/Heggland - Book Module 7. Lessons and Tasks.pdf:application/pdf},
+}
+
+@article{heggland_book_nodate-1,
+	title = {Book: Module 4. Lessons and Tasks},
+	author = {{Noroff}},
+	langid = {english},
+	file = {PDF:/home/tvh/snap/zotero-snap/common/Zotero/storage/YC4C99HY/Heggland - Book Module 4. Lessons and Tasks.pdf:application/pdf},
+}
+
+@article{heggland_book_nodate-2,
+	title = {Book: Module 2. Lessons and Tasks},
+	author = {{Noroff}},
+	langid = {english},
+	file = {PDF:/home/tvh/snap/zotero-snap/common/Zotero/storage/ZUATB293/Heggland - Book Module 2. Lessons and Tasks.pdf:application/pdf},
+}
+
+@misc{noroff_modules,
+  author       = {{Noroff}},
+  title        = {Modules 2, 4 and 7: Lessons and Tasks},
+  year         = {2026},
+  note         = {Internal course material used in the Data Analytics programme},
+  langid       = {english},
+  file         = {
+    PDF:/path/module2.pdf:application/pdf;
+    PDF:/path/module4.pdf:application/pdf;
+    PDF:/path/module7.pdf:application/pdf
+  }
 }
--- a/report/scripts/capture_search_har.py
+++ b/report/scripts/capture_search_har.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""
+Capture HAR files for search engine result pages using Playwright.
+
+This script starts a fresh browser context per search engine, navigates to the
+configured search URL, and writes one HAR file per engine.
+
+It can use Tor if you pass --proxy socks5://HOST:PORT.
+"""
+
+from __future__ import annotations
+
+import argparse
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import quote_plus
+
+from playwright.sync_api import sync_playwright
+
+
+SEARCH_ENGINES = {
+    "google": "https://www.google.com/search?q={query}",
+    "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web",
+    "bing": "https://www.bing.com/search?q={query}",
+    "brave": "https://search.brave.com/search?q={query}",
+}
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Capture search result HAR files with Playwright."
+    )
+    parser.add_argument(
+        "--query",
+        required=True,
+        help="Search query to use, for example: 'migraine symptoms'",
+    )
+    parser.add_argument(
+        "--engines",
+        nargs="+",
+        default=list(SEARCH_ENGINES),
+        choices=sorted(SEARCH_ENGINES),
+        help="Search engines to capture. Default: all",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("data"),
+        help="Directory where HAR files are written. Default: ../data/har_capture",
+    )
+    parser.add_argument(
+        "--proxy",
+        default="",
+        help="Optional proxy, for example: socks5://127.0.0.1:9050",
+    )
+    parser.add_argument(
+        "--browser",
+        choices=["firefox", "chromium"],
+        default="firefox",
+        help="Browser engine to use. Default: firefox",
+    )
+    parser.add_argument(
+        "--timeout-ms",
+        type=int,
+        default=45000,
+        help="Navigation timeout in milliseconds. Default: 45000",
+    )
+    parser.add_argument(
+        "--wait-until",
+        choices=["load", "domcontentloaded", "networkidle"],
+        default="networkidle",
+        help="Navigation wait condition. Default: networkidle",
+    )
+    parser.add_argument(
+        "--headed",
+        action="store_true",
+        help="Show the browser window instead of running headless.",
+    )
+    return parser.parse_args()
+
+
+def safe_filename_part(value: str) -> str:
+    keep = []
+    for char in value.lower():
+        if char.isalnum():
+            keep.append(char)
+        elif char in {" ", "-", "_"}:
+            keep.append("_")
+    cleaned = "".join(keep).strip("_")
+    return cleaned[:80] or "query"
+
+
+def main() -> None:
+    args = parse_args()
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    encoded_query = quote_plus(args.query)
+    query_part = safe_filename_part(args.query)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+    with sync_playwright() as playwright:
+        browser_launcher = getattr(playwright, args.browser)
+        launch_options = {"headless": not args.headed}
+
+        if args.proxy:
+            launch_options["proxy"] = {"server": args.proxy}
+
+        browser = browser_launcher.launch(**launch_options)
+
+        try:
+            for engine in args.engines:
+                search_url = SEARCH_ENGINES[engine].format(query=encoded_query)
+                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"
+
+                context = browser.new_context(
+                    record_har_path=str(har_path),
+                    record_har_content="embed",
+                )
+                page = context.new_page()
+                page.set_default_timeout(args.timeout_ms)
+                page.goto(search_url, wait_until=args.wait_until, timeout=args.timeout_ms)
+                context.close()
+
+                print(f"{engine}: {har_path}")
+        finally:
+            browser.close()
+
+
+if __name__ == "__main__":
+    main()
--- a/report/scripts/har_entries_to_csv.py
+++ b/report/scripts/har_entries_to_csv.py
@@ -0,0 +1,352 @@
+#!/usr/bin/env python3
+"""
+Convert HAR files to readable CSV files.
+
+Output 1: har_entries.csv
+  One row per entry in log.entries. This is the most direct way to inspect
+  the HAR structure: each { ... } inside entries[] becomes one CSV row.
+
+Output 2: har_summary.csv
+  One row per HAR file with simple totals.
+
+The script does not remove cookie values or URLs. Treat the output as sensitive.
+"""
+
+from __future__ import annotations
+
+import argparse
+import csv
+import json
+from pathlib import Path
+from urllib.parse import parse_qs, urlparse
+
+
+ENTRY_FIELDS = [
+    "har_filename",
+    "search_engine",
+    "entry_index",
+    "startedDateTime",
+    "time_ms",
+    "method",
+    "url",
+    "domain",
+    "path",
+    "query_text",
+    "status",
+    "statusText",
+    "request_cookie_count",
+    "request_cookie_names",
+    "request_cookie_values",
+    "response_cookie_count",
+    "response_cookie_names",
+    "response_cookie_values",
+    "query_param_count",
+    "query_param_names",
+    "query_param_values",
+    "request_header_count",
+    "response_header_count",
+    "post_data_present",
+    "request_body_size",
+    "response_body_size",
+    "response_content_size",
+    "transferred_bytes_approx",
+    "is_third_party_domain",
+    "tracking_hint",
+]
+
+
+SUMMARY_FIELDS = [
+    "har_filename",
+    "search_engine",
+    "query_text",
+    "requests_total",
+    "unique_domains",
+    "third_party_requests",
+    "request_cookies_total",
+    "response_cookies_total",
+    "query_params_total",
+    "post_requests_total",
+    "tracking_hint_requests",
+    "transferred_kb_approx",
+    "page_load_ms",
+    "status_2xx",
+    "status_3xx",
+    "status_4xx",
+    "status_5xx",
+]
+
+
+TRACKING_WORDS = [
+    "ads",
+    "adservice",
+    "analytics",
+    "collect",
+    "conversion",
+    "doubleclick",
+    "event",
+    "gen_204",
+    "googleadservices",
+    "improving",
+    "log",
+    "metrics",
+    "pagead",
+    "telemetry",
+    "track",
+]
+
+
+def detect_search_engine(har_path: Path) -> str:
+    name = har_path.name.lower()
+    if "duckduckgo" in name:
+        return "DuckDuckGo"
+    if "google" in name:
+        return "Google"
+    return "Unknown"
+
+
+def read_har(path: Path) -> dict:
+    with path.open(encoding="utf-8", errors="replace") as file:
+        return json.load(file)
+
+
+def entries_from_har(har_data: dict) -> list[dict]:
+    return har_data.get("log", {}).get("entries", []) or []
+
+
+def pages_from_har(har_data: dict) -> list[dict]:
+    return har_data.get("log", {}).get("pages", []) or []
+
+
+def cookie_names(cookies: list[dict]) -> str:
+    return "|".join(cookie.get("name", "") for cookie in cookies)
+
+
+def cookie_values(cookies: list[dict]) -> str:
+    return "|".join(cookie.get("value", "") for cookie in cookies)
+
+
+def query_names(query_items: list[dict]) -> str:
+    return "|".join(item.get("name", "") for item in query_items)
+
+
+def query_values(query_items: list[dict]) -> str:
+    return "|".join(item.get("value", "") for item in query_items)
+
+
+def positive_number(value: object) -> int:
+    if isinstance(value, (int, float)) and value > 0:
+        return int(value)
+    return 0
+
+
+def approximate_transferred_bytes(entry: dict) -> int:
+    request = entry.get("request", {}) or {}
+    response = entry.get("response", {}) or {}
+    content = response.get("content", {}) or {}
+
+    return (
+        positive_number(request.get("headersSize"))
+        + positive_number(request.get("bodySize"))
+        + positive_number(response.get("headersSize"))
+        + positive_number(response.get("bodySize"))
+        + positive_number(content.get("size"))
+    )
+
+
+def extract_query_text_from_url(url: str) -> str:
+    parsed = urlparse(url)
+    query = parse_qs(parsed.query, keep_blank_values=True)
+    values = query.get("q", [])
+    return values[0] if values else ""
+
+
+def has_tracking_hint(domain: str, path: str, url: str) -> str:
+    text = f"{domain} {path} {url}".lower()
+    return "yes" if any(word in text for word in TRACKING_WORDS) else "no"
+
+
+def max_page_load_ms(entries: list[dict], pages: list[dict]) -> float:
+    max_time = 0.0
+
+    for page in pages:
+        on_load = (page.get("pageTimings", {}) or {}).get("onLoad", -1)
+        if isinstance(on_load, (int, float)) and on_load > max_time:
+            max_time = float(on_load)
+
+    for entry in entries:
+        entry_time = entry.get("time", -1)
+        if isinstance(entry_time, (int, float)) and entry_time > max_time:
+            max_time = float(entry_time)
+
+    return max_time
+
+
+def main_domain_for_engine(search_engine: str) -> str:
+    if search_engine == "Google":
+        return "google."
+    if search_engine == "DuckDuckGo":
+        return "duckduckgo.com"
+    return ""
+
+
+def make_entry_rows(har_path: Path) -> list[dict]:
+    har_data = read_har(har_path)
+    entries = entries_from_har(har_data)
+    search_engine = detect_search_engine(har_path)
+    main_domain = main_domain_for_engine(search_engine)
+    rows = []
+
+    for index, entry in enumerate(entries, start=1):
+        request = entry.get("request", {}) or {}
+        response = entry.get("response", {}) or {}
+        content = response.get("content", {}) or {}
+        url = request.get("url", "")
+        parsed = urlparse(url)
+        request_cookies = request.get("cookies", []) or []
+        response_cookies = response.get("cookies", []) or []
+        query_items = request.get("queryString", []) or []
+        domain = parsed.netloc.lower()
+        path = parsed.path
+        query_text = extract_query_text_from_url(url)
+        third_party = "no"
+
+        if main_domain and domain and main_domain not in domain:
+            third_party = "yes"
+
+        rows.append(
+            {
+                "har_filename": har_path.name,
+                "search_engine": search_engine,
+                "entry_index": index,
+                "startedDateTime": entry.get("startedDateTime", ""),
+                "time_ms": entry.get("time", ""),
+                "method": request.get("method", ""),
+                "url": url,
+                "domain": domain,
+                "path": path,
+                "query_text": query_text,
+                "status": response.get("status", ""),
+                "statusText": response.get("statusText", ""),
+                "request_cookie_count": len(request_cookies),
+                "request_cookie_names": cookie_names(request_cookies),
+                "request_cookie_values": cookie_values(request_cookies),
+                "response_cookie_count": len(response_cookies),
+                "response_cookie_names": cookie_names(response_cookies),
+                "response_cookie_values": cookie_values(response_cookies),
+                "query_param_count": len(query_items),
+                "query_param_names": query_names(query_items),
+                "query_param_values": query_values(query_items),
+                "request_header_count": len(request.get("headers", []) or []),
+                "response_header_count": len(response.get("headers", []) or []),
+                "post_data_present": "yes" if request.get("postData") else "no",
+                "request_body_size": request.get("bodySize", ""),
+                "response_body_size": response.get("bodySize", ""),
+                "response_content_size": content.get("size", ""),
+                "transferred_bytes_approx": approximate_transferred_bytes(entry),
+                "is_third_party_domain": third_party,
+                "tracking_hint": has_tracking_hint(domain, path, url),
+            }
+        )
+
+    return rows
+
+
+def make_summary_row(har_path: Path, entry_rows: list[dict]) -> dict:
+    har_data = read_har(har_path)
+    entries = entries_from_har(har_data)
+    pages = pages_from_har(har_data)
+    domains = {row["domain"] for row in entry_rows if row["domain"]}
+    status_counts = {2: 0, 3: 0, 4: 0, 5: 0}
+    query_text = ""
+
+    for row in entry_rows:
+        if row["query_text"] and not query_text:
+            query_text = row["query_text"]
+
+        status = row["status"]
+        if isinstance(status, int):
+            group = status // 100
+            if group in status_counts:
+                status_counts[group] += 1
+
+    transferred_bytes = sum(int(row["transferred_bytes_approx"]) for row in entry_rows)
+
+    return {
+        "har_filename": har_path.name,
+        "search_engine": detect_search_engine(har_path),
+        "query_text": query_text,
+        "requests_total": len(entry_rows),
+        "unique_domains": len(domains),
+        "third_party_requests": sum(
+            1 for row in entry_rows if row["is_third_party_domain"] == "yes"
+        ),
+        "request_cookies_total": sum(int(row["request_cookie_count"]) for row in entry_rows),
+        "response_cookies_total": sum(
+            int(row["response_cookie_count"]) for row in entry_rows
+        ),
+        "query_params_total": sum(int(row["query_param_count"]) for row in entry_rows),
+        "post_requests_total": sum(1 for row in entry_rows if row["method"] == "POST"),
+        "tracking_hint_requests": sum(1 for row in entry_rows if row["tracking_hint"] == "yes"),
+        "transferred_kb_approx": round(transferred_bytes / 1024, 2),
+        "page_load_ms": round(max_page_load_ms(entries, pages), 2),
+        "status_2xx": status_counts[2],
+        "status_3xx": status_counts[3],
+        "status_4xx": status_counts[4],
+        "status_5xx": status_counts[5],
+    }
+
+
+def write_csv(path: Path, fieldnames: list[str], rows: list[dict]) -> None:
+    with path.open("w", newline="", encoding="utf-8") as file:
+        writer = csv.DictWriter(file, fieldnames=fieldnames)
+        writer.writeheader()
+        writer.writerows(rows)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert HAR files to readable CSV files.")
+    parser.add_argument(
+        "--input-dir",
+        type=Path,
+        default=Path("data"),
+        help="Folder with .har files. Default: data",
+    )
+    parser.add_argument(
+        "--entries-output",
+        type=Path,
+        default=Path("har_entries.csv"),
+        help="CSV with one row per log.entries item. Default: har_entries.csv",
+    )
+    parser.add_argument(
+        "--summary-output",
+        type=Path,
+        default=Path("har_summary.csv"),
+        help="CSV with one row per HAR file. Default: har_summary.csv",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+    har_files = sorted(args.input_dir.glob("*.har"))
+
+    if not har_files:
+        raise SystemExit(f"No HAR files found in {args.input_dir}")
+
+    all_entry_rows = []
+    summary_rows = []
+
+    for har_path in har_files:
+        entry_rows = make_entry_rows(har_path)
+        all_entry_rows.extend(entry_rows)
+        summary_rows.append(make_summary_row(har_path, entry_rows))
+
+    write_csv(args.entries_output, ENTRY_FIELDS, all_entry_rows)
+    write_csv(args.summary_output, SUMMARY_FIELDS, summary_rows)
+
+    print(f"Wrote {len(all_entry_rows)} entry rows to {args.entries_output}")
+    print(f"Wrote {len(summary_rows)} summary rows to {args.summary_output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/report/scripts/many_search.sh
+++ b/report/scripts/many_search.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+QUERIES=(
+	"weather oslo"
+	"migraine symptoms"
+	"vitamin d deficiency"
+	"running shoes"
+	"coffee grinder"
+	"best laptop for students"
+	"electric car charging"
+	"cheap flights to london"
+	"home insurance"
+	"python list tutorial"
+	"banana bread recipe"
+	"news norway"
+)
+
+for query in "${QUERIES[@]}"; do
+	echo "Running query: $query"
+
+	capture_search_har \
+		--query "$query" \
+		--browser chromium \
+		--wait-until load \
+		--headed \
+		--output-dir normal_chromium \
+		--timeout-ms 60000
+
+	capture_search_har \
+		--query "$query" \
+		--browser chromium \
+		--wait-until load \
+		--headed \
+		--output-dir tor_chromium \
+		--timeout-ms 60000 \
+		--proxy socks5://127.0.0.1:9050
+
+	capture_search_har \
+		--query "$query" \
+		--browser firefox \
+		--wait-until load \
+		--headed \
+		--output-dir tor_firefox \
+		--timeout-ms 60000 \
+		--proxy socks5://127.0.0.1:9050
+
+	capture_search_har \
+		--query "$query" \
+		--browser firefox \
+		--wait-until load \
+		--headed \
+		--output-dir normal_firefox \
+		--timeout-ms 60000
+done
--- a/report/scripts/power_query_etl.txt
+++ b/report/scripts/power_query_etl.txt
@@ -0,0 +1,82 @@
+let
+    Kilde = Csv.Document(
+        Web.Contents(
+            "https://example.sharepoint.com/.../tor_chromium/har_entries.csv"
+        ),
+        [
+            Delimiter = ",",
+            Columns = 30,
+            QuoteStyle = QuoteStyle.None
+        ]
+    ),
+
+    #"Promoted Headers" =
+        Table.PromoteHeaders(
+            Kilde,
+            [PromoteAllScalars = true]
+        ),
+
+    #"Changed Column Types" =
+        Table.TransformColumnTypes(
+            #"Promoted Headers",
+            {
+                {"har_filename", type text},
+                {"search_engine", type text},
+                {"entry_index", Int64.Type},
+                {"startedDateTime", type datetime},
+                {"time_ms", type text},
+                {"method", type text},
+                {"url", type text},
+                {"domain", type text},
+                {"path", type text},
+                {"query_text", type text},
+                {"status", Int64.Type},
+                {"statusText", type text},
+                {"request_cookie_count", Int64.Type},
+                {"response_cookie_count", Int64.Type},
+                {"query_param_count", Int64.Type},
+                {"request_header_count", Int64.Type},
+                {"response_header_count", Int64.Type},
+                {"tracking_hint", type text}
+            },
+            "en"
+        ),
+
+    #"Added Search Engine Column" =
+        Table.AddColumn(
+            #"Changed Column Types",
+            "SearchEngine",
+            each
+                if Text.Contains([har_filename], "bing")
+                then "Bing"
+                else if Text.Contains([har_filename], "google")
+                then "Google"
+                else if Text.Contains([har_filename], "duckduckgo")
+                then "DuckDuckGo"
+                else if Text.Contains([har_filename], "brave")
+                then "Brave"
+                else "Unknown"
+        ),
+
+    #"Added Proxy Column" =
+        Table.TransformColumnTypes(
+            Table.AddColumn(
+                #"Added Search Engine Column",
+                "Proxy",
+                each "Tor"
+            ),
+            {{"Proxy", type text}}
+        ),
+
+    #"Added Browser Column" =
+        Table.TransformColumnTypes(
+            Table.AddColumn(
+                #"Added Proxy Column",
+                "Browser",
+                each "Chromium"
+            ),
+            {{"Browser", type text}}
+        )
+
+in
+    #"Added Browser Column"
--- a/report/scripts/power_query_merge.txt
+++ b/report/scripts/power_query_merge.txt
@@ -0,0 +1,9 @@
+let
+    Kilde = Table.Combine({
+        har_summary_normal_chromium,
+        har_summary_normal_firefox,
+        har_summary_tor_chromium,
+        har_summary_tor_firefox
+    })
+in
+    Kilde
--- a/report/sections/01A_theory.tex
+++ b/report/sections/01A_theory.tex
@@ -0,0 +1,50 @@
+\section{Theory\label{sec:theor}}
+
+
+%\subsection{EDA - Exploratory Data Analysis}
+
+%\subsubsection{Data preprocessing and cleaning}
+%\subsubsection{Data reliability and consistency}
+%\subsubsection{Data visualisation principles}
+
+%\subsection{Web traffic}
+
+% HAR files
+
+%\subsubsection{HTTP requests and responses}
+
+%
+
+%\subsubsection{Cookies and tracking parameters}
+
+
+%\subsubsection{Search engines and privacy}
+
+
+%%%
+\subsection{Web Tracking Technologies}
+
+\subsubsection{HTTP Requests and Responses}
+
+\subsubsection{Cookies and Tracking Parameters}
+
+\subsection{Data Collection and Preprocessing}
+
+\subsubsection{HAR Files}
+
+\subsubsection{Data Preprocessing and Cleaning}
+
+\subsection{Exploratory Data Analysis (EDA)}
+
+\subsection{Data Visualisation Principles}
+
+
+
+% “data preprocessing” → du gjør faktisk HAR → CSV-transformasjon
+% “EDA” → du lager konkrete Power BI-visualiseringer
+% “data pipeline” → du automatiserer hele workflowen
+% “data reliability” → du standardiserer browser/proxy conditions
+% “data collection methodology” → du dokumenterer Playwright/Tor-oppsettet
+
+
+
--- a/report/sections/02_method.tex
+++ b/report/sections/02_method.tex
@@ -1,5 +1,8 @@
 \section{Method\label{sec:metho}}

+%This section describes the methodology used throughout the research process. Some technical concepts and terminology referenced in this section are further explained in the Theory section and later discussed in the Discussion section.
+
+This section describes the methodology used in this research. Technical concepts and terminology referenced in this section are explained further in \autoref{sec:theor} and discussed in \autoref{sec:discu}.

 \subsection{Research design\label{sec:metho:research_design}}
 % Stikkord:
@@ -8,6 +11,42 @@
 % same searches across search engines
 % comparison between search engines, browsers, and network modes

+
+
+%This research is design using tools to simulate human interaction with simple web searches. Each search is design to be anomynous, with no browser histories and cookies before each search was done. For each web search the browser history is cleaned and cookies removed. Browser profile used has no login data, so the web queries can not connect the search to any real person. 
+%4 Search Engines are used for this prosess, once in each webbrowser, Firefox and Chromium. Which means each web query is don 8 times. For this work, several queries are create to widen the data collection. The web quires are following:
+
+
+
+This research is designed using tools to simulate human interaction with simple web searches. Each search is designed to be anonymous, with no browser history or cookies stored before the search is performed. Before each search, the browser history is cleared and cookies are removed. The browser profiles used contain no login data, preventing the web queries from being connected to any real person.
+
+Four search engines are used in this process (Brave, Bing, DuckDuckGo and Google), once in each web browser: Firefox and Chromium. This means that each web query is performed eight times. For this work, several queries are created to widen the data collection. The web queries are as follows:
+
+
+
+
+\begin{multicols}{2}
+\begin{itemize}[noitemsep, topsep=0pt]
+    \item weather oslo
+    \item migraine symptoms
+    \item vitamin d deficiency
+    \item running shoes
+    \item coffee grinder
+    \item best laptop for students
+    \item electric car charging
+    \item cheap flights to london
+    \item home insurance
+    \item python list tutorial
+    \item banana bread recipe
+    \item news norway
+\end{itemize}
+\end{multicols}
+
+Data collection is performed using either a Tor proxy to help hide the identity of the person performing the web searches, or a normal network connection where web traffic may be used to identify the user.
+%ata collected is filtered using either Tor proxy to hide the identity of the person premforing websearch, and not using any proxy, where any web traffic can identify you traffic.
+
+
+
 \subsection{Test environment\label{sec:metho:test_environment}}
 % Stikkord:
 % operating system / controlled environment
@@ -18,7 +57,36 @@
 % cookies allowed
 % same wait condition and timeout

-\subsection{Search engines and search queries\label{sec:metho:search_engines}}
+
+%When you tap ctrl + shift + C
+When pressing \texttt{Ctrl + Shift + C} and clicking on \texttt{Network}, a log of network traffic is shown. This window is kept open and the browser history is cleared before a web search is performed manually. This process gives a clean, anonymous log of the web traffic from a single web query, as shown in \autoref{fig:metho:manually_har}. To the right of \texttt{"No throttling"} is a settings icon. Clicking on that button gives the options shown in \autoref{fig:metho:export_har}.
+Each query could be done manually, or the process of collecting data could be automated. For this research, the process of collecting first-hand raw data was automated. The tool used to automate the web queries is Python with Playwright~\parencite{Playwright}. Playwright is installed in a Python virtual environment~\parencite{VENV}. All data collection is done in a Linux shell, and the EDA is done in Microsoft Power~BI.
+
+Once the installation is done, the web browsers of choice may be installed inside the virtual environment. Firefox and Chromium were installed inside the virtual environment. The environment is now ready for retrieving raw, real-world data for this analysis.
+
+
+
+
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[
+        width=\linewidth,
+    ]{figures/png/09_importing_har_manually.png}
+    \caption{Network traffic generated by a simple web search.}
+    \label{fig:metho:manually_har}
+\end{figure}
+
+\begin{figure}[h]
+    \centering
+    \includegraphics[
+        width=0.27\linewidth,
+    ]{figures/png/10_har_options.png}
+    \caption{Options for downloading HAR files.}
+    \label{fig:metho:export_har}
+\end{figure}
+
+%\subsection{Search engines and search queries\label{sec:metho:search_engines}}
 % Stikkord:
 % Google
 % Bing
@@ -27,7 +95,7 @@
 % list of search queries
 % same query used across all engines

-\subsection{Variables and measurements\label{sec:metho:Variables_measurements}}
+%\subsection{Variables and measurements\label{sec:metho:Variables_measurements}}
 % Stikkord:
 % requests_total
 % unique_domains
@@ -42,6 +110,38 @@
 % HTTP status groups

 \subsection{Data collection\label{sec:metho:data_collection}}
+
+
+
+Three scripts, two Python files and one Bash file, are used to automate the data collection process. The files can be found in the folder \texttt{./scripts/}. The Bash file (\path{./scripts/many_search.sh}) uses the Python file (\path{./scripts/capture_search_har.py}) to automate the process of retrieving data. It essentially loops over each query, each web-browser, and each proxy used, and stores the results in different folders.
+
+
+\begin{lstlisting}[language=bash, caption={Playwright data collection command}, label={lst:metho:playwright_command}, basicstyle=\ttfamily\small, breaklines=true]
+capture_search_har \
+    --query "weather oslo" \
+    --browser chromium \
+    --wait-until load \
+    --headed \
+    --output-dir tor_chromium \
+    --proxy socks5://127.0.0.1:9050
+\end{lstlisting}
+
+\autoref{lst:metho:playwright_command} is an example of a command using the Python file to automate the retrieval of data. It opens a web-browser, performs a web-search, and saves the output to a directory of choice, or to the current directory by default. Input \verb|--query| is the only mandatory input for this function. All other options are optional and fall back to a default value. Input \verb|--proxy| is optional; if not used, the desktop's current official IP address is used to perform the web-search. If the \verb|--proxy| option is specified, the provided value will be used as the proxy endpoint, meaning the web-search would only see, in this instance, Tor's endpoint and its official IP address when performing the web-search \parencite{TOR}. \verb|--headed| and \verb|--wait-until load| are important. The first one tells Playwright to open a physical window when performing a web-search, and not just make an \texttt{HTTPS} call; the second one tells Playwright to wait until the web-browser is fully loaded. The rest of the inputs are self-explanatory.
+
+After retrieving all the HAR files through the automated Python workflow, the dataset is ready for processing.
+
+
+
+
+
+
+
+
+
+
+
+
+
 % Stikkord:
 % HAR files
 % one HAR file per search engine/query/browser/network mode
@@ -52,6 +152,35 @@
 % Tor via SOCKS proxy where applicable

 \subsection{Data processing\label{sec:metho:data_processing}}
+
+This section contains several important steps in the data processing pipeline, all leading to the exploratory data analysis (EDA) performed in Microsoft Power~BI. As of now, the data set is segregated into several HAR files, which is unreadable to Microsoft Power~BI. This section is a step-by-step process from raw data collection to finished visualized tables in Power~BI.
+
+
+
+\subsubsection{From HAR to CSV files\label{sec:metho:har_to_csv}}
+
+In order to perform data analysis in Power~BI, the data set had to be converted from HAR files to CSV files. Once the data had been collected in \autoref{sec:metho:data_collection}, several data entries needed to be prepared before they could be extracted, transformed and loaded into tables in \autoref{sec:metho:etl}.
+
+A Python script (\path{./scripts/har_entries_to_csv.py}) reads all the \texttt{.har} files in the folder \texttt{./data/} and writes two output files. The first one is \texttt{./har\_entries.csv}, and the second one is \texttt{./har\_summary.csv}. Four of each of those files were created, one for each combination of proxy type and web-browser. The working directory decides which proxy and web-browser are used for the current data collection. More on that in \autoref{sec:discu}. \texttt{./har\_entries.csv} contains every request from the web-search as one entry, or one row in the CSV file. \texttt{./har\_summary.csv} summarises its respective \texttt{./har\_entries.csv} file, which means it takes the input from several \texttt{.har} files and summarises each \texttt{.har} file in one row. In contrast, \texttt{./har\_entries.csv} does not summarise the \texttt{.har} files; it takes the raw data and presents each entry in a \texttt{.har} file as one row, without any further data processing.
+
+The file \texttt{./har\_summary.csv} was discarded in favour of \texttt{./har\_entries.csv}. The latter contains the raw data, and will be used in the ETL process in Power~BI in \autoref{sec:metho:etl}.
+
+
+
+\subsubsection{ETL process in Power~BI\label{sec:metho:etl}}
+
+ETL stands for Extract, Transform, and Load. Some of the Extract and Transform steps were done in \autoref{sec:metho:har_to_csv}. The dataset is now ready to be loaded and merged into Power~BI.
+
+%The dataset is separated in four \texttt{CSV} files in each folder which represents the case of those entries. For instance the raw data in the case of proxy is Tor, and browser used is Chromium, the location of the dataset is as following: \texttt{./tor_chromium}, as \autoref{lst:metho:playwright_command} indicates.
+
+The dataset is separated into four \texttt{CSV} files, each in its own folder, which represents the case of those entries. For instance, if the proxy used is Tor and the browser used is Chromium, the location of the dataset is as follows: \texttt{./tor\_chromium/}, as \autoref{lst:metho:playwright_command} indicates. The \texttt{CSV} files all share the same file name, which is generated by the Python script \texttt{capture\_search\_har}. All \texttt{CSV} files were loaded into separate folders in the student's private working area on SharePoint. \autoref{lst:appen:pq} in \autoref{sec:appen:pq} shows the complete query done in Power~BI for each instance. When a \texttt{CSV} file is loaded as a source into Power~BI, some automatic formatting is done by Power~BI itself; \autoref{lst:appen:pq} illustrates the code Power~BI generates, together with some additional formatting.
+
+
+As explained, only the folder name describes which proxy and which browser were used for each instance. To account for this, each \texttt{./har\_entries.csv} had to be manually loaded into Power~BI for each instance. Once a table was loaded, two new columns had to be added which specified the proxy and browser for its entries. After this was done, all four tables could be merged into one main table. Before merging, each table was named as follows: \texttt{har\_entries\_<proxy>\_<browser>} for its respective proxy and browser. Once merged, the new table got the name \texttt{har\_entries\_all}, which was further used for creating the tables for this work. Those tables are presented in \autoref{sec:resul}.
+
+
+Finally, for all observations, the whole table \texttt{har\_entries\_all} was filtered so that the variable \texttt{tracking\_hint} equals yes, meaning every entry that did not indicate any hint of tracking was filtered out.
+
 % Stikkord:
 % HAR files converted to CSV
 % har_entries.csv: one row per HAR entry/request
@@ -62,6 +191,9 @@


 \subsection{Limitations of the method\label{sec:metho:limitations}}
+
+Some limitations of this work are related to the process in which the analysis is performed. A retrieved \texttt{.har} file is an unstructured text file and contains large amounts of data noise. The scripts do not always guarantee data consistency. The output files did not specify which proxy or browser was used.
+
 % Stikkord:
 % HAR shows observable browser-side traffic only
 % cannot prove server-side storage
--- a/report/sections/03_results.tex
+++ b/report/sections/03_results.tex
@@ -4,6 +4,8 @@

 %\includegraphics[width=\linewidth]{figures/pdf/01_3rdparty.pdf}

+This section presents all the findings of this work. The main priority is the variable \texttt{tracking\_hints}, as the work tries to identify relationships between tracking and cookies. Every graph in this work is filtered for \texttt{tracking\_hints=yes}, meaning the tables retrieved are larger than those visualized in this work.
+
 \begin{figure}[h]
    \centering
    \includegraphics[
@@ -14,6 +16,11 @@
 \end{figure}


+The first figure, \autoref{fig:resul:02_browser}, illustrates the distribution of tracking hints across browsers and search engines. DuckDuckGo appears to be the search engine that caches and identifies the most hints of tracking. Bing and Brave appear to be conservative in addressing any tracking hints. Only on DuckDuckGo and Google does the choice of web-browser seem to play a crucial role. Chromium addresses more hints of tracking in Google than Firefox does.
+
+
+Furthermore, request cookie counts and response cookie counts are presented below, and they are the main focus of the following results.
+
 \begin{figure}[h]
    \centering
    \includegraphics[
@@ -24,6 +31,11 @@
 \end{figure}


+Figures~\ref{fig:resul:03_proxy_request} and~\ref{fig:resul:04_proxy_response} illustrate the cookie counts for each tracking hint. DuckDuckGo and Brave show no cookies across all \texttt{tracking\_hints}. In contrast, only Bing and Google use cookies on entries identified as tracking hints.
+
+
+
+
 \begin{figure}[h]
    \centering
    \includegraphics[
@@ -33,6 +45,9 @@
    \label{fig:resul:04_proxy_response}
 \end{figure}

+
+
+
 \begin{figure}[h]
    \centering
    \includegraphics[
--- a/report/sections/99_appendix.tex
+++ b/report/sections/99_appendix.tex
@@ -0,0 +1,54 @@
+\section{Appendices\label{sec:appen}}
+
+
+\subsection{Automation scripts\label{sec:appen:auto}}
+
+
+\lstinputlisting[
+    basicstyle=\ttfamily\scriptsize,
+    language={},
+    caption={Bash automation script},
+    breaklines=true,
+    label={lst:appen:bash}
+]{scripts/many_search.sh}
+
+
+
+\lstinputlisting[
+    basicstyle=\ttfamily\scriptsize,
+    language={},
+    caption={Python capture HAR script},
+    breaklines=true,
+    label={lst:appen:capture}
+]{scripts/capture_search_har.py}
+
+
+\lstinputlisting[
+    basicstyle=\ttfamily\scriptsize,
+    language={},
+    caption={Python HAR to CSV script},
+    breaklines=true,
+    label={lst:appen:hartocsv}
+]{scripts/har_entries_to_csv.py}
+
+
+
+
+\subsection{Power Query transformation\label{sec:appen:pq}}
+
+\lstinputlisting[
+    basicstyle=\ttfamily\scriptsize,
+    language={},
+    caption={Power Query ETL script},
+    breaklines=true,
+    label={lst:appen:pq}
+]{scripts/power_query_etl.txt}
+
+\lstinputlisting[
+    basicstyle=\ttfamily\scriptsize,
+    language={},
+    caption={Power Query merge script},
+    breaklines=true,
+    label={lst:appen:merge}
+]{scripts/power_query_merge.txt}
+
Author	SHA1	Message	Date
Tord-Vincent Heggland	e356879542	ontheway	2026-05-20 08:41:59 +02:00
Tord-Vincent Heggland	4b2e1455c9	writingmethod	2026-05-15 14:30:58 +02:00
				`@@ -0,0 +1 @@`
				`{"rule":"WANT_TO_NN","sentence":"^\\QThe main priority is the variable Trackinghints, as the work tries to identity correspondanses between trackings and cookies.\\E$"}`