diff --git a/report/.vscode/ltex.hiddenFalsePositives.en-GB.txt b/report/.vscode/ltex.hiddenFalsePositives.en-GB.txt new file mode 100644 index 0000000..8a16650 --- /dev/null +++ b/report/.vscode/ltex.hiddenFalsePositives.en-GB.txt @@ -0,0 +1 @@ +{"rule":"WANT_TO_NN","sentence":"^\\QThe main priority is the variable Trackinghints, as the work tries to identity correspondanses between trackings and cookies.\\E$"} diff --git a/report/main.tex b/report/main.tex index f91e3bf..a1afb7e 100644 --- a/report/main.tex +++ b/report/main.tex @@ -46,6 +46,7 @@ \author{Tord-Vincent Heggland} \date{\today} + \begin{document} \pagenumbering{roman} \maketitle @@ -70,4 +71,10 @@ \clearpage \printbibliography[title={References}] + +\clearpage +\appendix +\renewcommand{\thepage}{A-\arabic{page}} +\setcounter{page}{1} +\input{sections/99_appendix.tex} \end{document} \ No newline at end of file diff --git a/report/references.bib b/report/references.bib index fc0f14f..3fbbee3 100644 --- a/report/references.bib +++ b/report/references.bib @@ -5,7 +5,7 @@ url = {https://support.torproject.org/tor-browser/getting-started/about-tor-browser/}, abstract = {Tor Browser is a privacy-focused web browser that routes your traffic through the Tor network, hiding your real {IP} address, preventing tracking, and protecting you against surveillance and censorship. 
# Search URL templates keyed by the engine token used on the command line and
# in the generated HAR file names. ``{query}`` receives the URL-encoded query.
SEARCH_ENGINES = {
    "google": "https://www.google.com/search?q={query}",
    "duckduckgo": "https://duckduckgo.com/?q={query}&ia=web",
    "bing": "https://www.bing.com/search?q={query}",
    "brave": "https://search.brave.com/search?q={query}",
}


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the capture script."""
    parser = argparse.ArgumentParser(
        description="Capture search result HAR files with Playwright."
    )
    parser.add_argument(
        "--query",
        required=True,
        help="Search query to use, for example: 'migraine symptoms'",
    )
    parser.add_argument(
        "--engines",
        nargs="+",
        default=list(SEARCH_ENGINES),
        choices=sorted(SEARCH_ENGINES),
        help="Search engines to capture. Default: all",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("data"),
        # The help text previously advertised "../data/har_capture", which did
        # not match the actual default below.
        help="Directory where HAR files are written. Default: data",
    )
    parser.add_argument(
        "--proxy",
        default="",
        help="Optional proxy, for example: socks5://127.0.0.1:9050",
    )
    parser.add_argument(
        "--browser",
        choices=["firefox", "chromium"],
        default="firefox",
        help="Browser engine to use. Default: firefox",
    )
    parser.add_argument(
        "--timeout-ms",
        type=int,
        default=45000,
        help="Navigation timeout in milliseconds. Default: 45000",
    )
    parser.add_argument(
        "--wait-until",
        choices=["load", "domcontentloaded", "networkidle"],
        default="networkidle",
        help="Navigation wait condition. Default: networkidle",
    )
    parser.add_argument(
        "--headed",
        action="store_true",
        help="Show the browser window instead of running headless.",
    )
    return parser


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        The parsed options.
    """
    return build_parser().parse_args(argv)


def safe_filename_part(value: str) -> str:
    """Turn a free-text query into a short, file-name-safe token.

    Alphanumeric characters are kept (lower-cased), spaces, ``-`` and ``_``
    become ``_``, and everything else is dropped. The result is capped at 80
    characters; ``"query"`` is returned when nothing usable remains.
    """
    keep = []
    for char in value.lower():
        if char.isalnum():
            keep.append(char)
        elif char in {" ", "-", "_"}:
            keep.append("_")
    cleaned = "".join(keep).strip("_")
    return cleaned[:80] or "query"


def capture_engine(browser, engine, encoded_query, har_path, wait_until, timeout_ms) -> None:
    """Load one search result page in a fresh context, recording it to a HAR.

    Args:
        browser: A launched Playwright browser.
        engine: Key into ``SEARCH_ENGINES``.
        encoded_query: URL-encoded search query.
        har_path: Destination of the HAR file.
        wait_until: Playwright navigation wait condition.
        timeout_ms: Navigation timeout in milliseconds.

    Raises:
        playwright.sync_api.Error: If the navigation fails or times out. The
            context is still closed, so whatever traffic was recorded up to
            that point is written to ``har_path``.
    """
    context = browser.new_context(
        record_har_path=str(har_path),
        record_har_content="embed",
    )
    try:
        page = context.new_page()
        page.set_default_timeout(timeout_ms)
        page.goto(
            SEARCH_ENGINES[engine].format(query=encoded_query),
            wait_until=wait_until,
            timeout=timeout_ms,
        )
    finally:
        # Playwright only writes the HAR when the context is closed, so this
        # must run even when the navigation raises.
        context.close()


def main() -> None:
    """Capture one HAR file per requested search engine.

    A failing engine (for example a navigation timeout through Tor) no longer
    aborts the remaining engines. The script still exits with a non-zero
    status when any engine failed, so calling shell scripts can detect it.
    """
    # Imported here because only this function needs the exception type.
    from playwright.sync_api import Error as PlaywrightError

    args = parse_args()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    encoded_query = quote_plus(args.query)
    query_part = safe_filename_part(args.query)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    failed = []

    with sync_playwright() as playwright:
        browser_launcher = getattr(playwright, args.browser)
        launch_options = {"headless": not args.headed}

        if args.proxy:
            launch_options["proxy"] = {"server": args.proxy}

        browser = browser_launcher.launch(**launch_options)

        try:
            for engine in args.engines:
                har_path = args.output_dir / f"{timestamp}_{engine}_{query_part}.har"
                try:
                    capture_engine(
                        browser,
                        engine,
                        encoded_query,
                        har_path,
                        args.wait_until,
                        args.timeout_ms,
                    )
                except PlaywrightError as error:
                    failed.append(engine)
                    print(f"{engine}: FAILED, partial HAR at {har_path} ({error})")
                    continue

                print(f"{engine}: {har_path}")
        finally:
            browser.close()

    if failed:
        raise SystemExit(f"Capture failed for: {', '.join(failed)}")


if __name__ == "__main__":
    main()
# Column order of har_entries.csv (one row per HAR log entry).
ENTRY_FIELDS = [
    "har_filename",
    "search_engine",
    "entry_index",
    "startedDateTime",
    "time_ms",
    "method",
    "url",
    "domain",
    "path",
    "query_text",
    "status",
    "statusText",
    "request_cookie_count",
    "request_cookie_names",
    "request_cookie_values",
    "response_cookie_count",
    "response_cookie_names",
    "response_cookie_values",
    "query_param_count",
    "query_param_names",
    "query_param_values",
    "request_header_count",
    "response_header_count",
    "post_data_present",
    "request_body_size",
    "response_body_size",
    "response_content_size",
    "transferred_bytes_approx",
    "is_third_party_domain",
    "tracking_hint",
]


# Column order of har_summary.csv (one row per HAR file).
SUMMARY_FIELDS = [
    "har_filename",
    "search_engine",
    "query_text",
    "requests_total",
    "unique_domains",
    "third_party_requests",
    "request_cookies_total",
    "response_cookies_total",
    "query_params_total",
    "post_requests_total",
    "tracking_hint_requests",
    "transferred_kb_approx",
    "page_load_ms",
    "status_2xx",
    "status_3xx",
    "status_4xx",
    "status_5xx",
]


# Substrings that mark a request as a possible tracking request.
TRACKING_WORDS = [
    "ads",
    "adservice",
    "analytics",
    "collect",
    "conversion",
    "doubleclick",
    "event",
    "gen_204",
    "googleadservices",
    "improving",
    "log",
    "metrics",
    "pagead",
    "telemetry",
    "track",
]


# Engine token used in capture file names -> (display label, first-party
# domain fragment). "duckduckgo" stays ahead of "google" so the substring
# fallback in detect_search_engine keeps its original precedence.
ENGINE_INFO = {
    "duckduckgo": ("DuckDuckGo", "duckduckgo.com"),
    "google": ("Google", "google."),
    "bing": ("Bing", "bing.com"),
    "brave": ("Brave", "brave.com"),
}


def detect_search_engine(har_path: Path) -> str:
    """Return the display name of the search engine a HAR file belongs to.

    Capture files are named ``<date>_<time>_<engine>_<query>.har``; when the
    name has that shape the engine token is read directly, so a query word
    such as "google" cannot misclassify the file. Other names fall back to a
    substring search. Returns ``"Unknown"`` when no engine is recognised.
    """
    stem_parts = har_path.stem.lower().split("_")
    if (
        len(stem_parts) >= 3
        and stem_parts[0].isdigit()
        and stem_parts[1].isdigit()
        and stem_parts[2] in ENGINE_INFO
    ):
        return ENGINE_INFO[stem_parts[2]][0]

    name = har_path.name.lower()
    for token, (label, _domain) in ENGINE_INFO.items():
        if token in name:
            return label
    return "Unknown"


def read_har(path: Path) -> dict:
    """Load a HAR file as JSON, replacing undecodable bytes."""
    with path.open(encoding="utf-8", errors="replace") as file:
        return json.load(file)


def entries_from_har(har_data: dict) -> list[dict]:
    """Return ``log.entries`` or an empty list when missing or null."""
    return (har_data.get("log") or {}).get("entries") or []


def pages_from_har(har_data: dict) -> list[dict]:
    """Return ``log.pages`` or an empty list when missing or null."""
    return (har_data.get("log") or {}).get("pages") or []


def _join_field(items: list[dict], key: str) -> str:
    """Join ``item[key]`` of every item with ``|``; missing/null becomes ''."""
    parts = []
    for item in items:
        value = item.get(key) if isinstance(item, dict) else None
        parts.append("" if value is None else str(value))
    return "|".join(parts)


def cookie_names(cookies: list[dict]) -> str:
    """Return the cookie names joined with ``|``."""
    return _join_field(cookies, "name")


def cookie_values(cookies: list[dict]) -> str:
    """Return the cookie values joined with ``|``."""
    return _join_field(cookies, "value")


def query_names(query_items: list[dict]) -> str:
    """Return the query parameter names joined with ``|``."""
    return _join_field(query_items, "name")


def query_values(query_items: list[dict]) -> str:
    """Return the query parameter values joined with ``|``."""
    return _join_field(query_items, "value")


def positive_number(value: object) -> int:
    """Return ``value`` truncated to int when it is a finite number > 0, else 0.

    HAR uses -1 for "not available"; booleans, NaN and infinity (which
    ``json`` accepts) are treated the same way instead of raising.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0
    if 0 < value < float("inf"):
        return int(value)
    return 0


def approximate_transferred_bytes(entry: dict) -> int:
    """Estimate the bytes exchanged for one HAR entry.

    Sums request headers and body plus response headers and body. The
    response body is counted once: ``bodySize`` when it is known, otherwise
    ``content.size`` as a fallback. Adding both would count the body twice.
    """
    request = entry.get("request", {}) or {}
    response = entry.get("response", {}) or {}
    content = response.get("content", {}) or {}

    response_body = response.get("bodySize")
    body_known = (
        isinstance(response_body, (int, float))
        and not isinstance(response_body, bool)
        and response_body >= 0
    )
    if not body_known:
        response_body = content.get("size")

    return (
        positive_number(request.get("headersSize"))
        + positive_number(request.get("bodySize"))
        + positive_number(response.get("headersSize"))
        + positive_number(response_body)
    )


def extract_query_text_from_url(url: str) -> str:
    """Return the first ``q`` query parameter of ``url`` or ''."""
    parsed = urlparse(url)
    query = parse_qs(parsed.query, keep_blank_values=True)
    values = query.get("q", [])
    return values[0] if values else ""


def has_tracking_hint(domain: str, path: str, url: str) -> str:
    """Return "yes" when any TRACKING_WORDS substring occurs, else "no".

    This is a deliberately loose, case-insensitive substring heuristic over
    the domain, path and full URL.
    """
    text = f"{domain} {path} {url}".lower()
    return "yes" if any(word in text for word in TRACKING_WORDS) else "no"


def max_page_load_ms(entries: list[dict], pages: list[dict]) -> float:
    """Return the largest page ``onLoad`` timing or entry ``time`` found."""
    max_time = 0.0

    for page in pages:
        on_load = (page.get("pageTimings", {}) or {}).get("onLoad", -1)
        if isinstance(on_load, (int, float)) and on_load > max_time:
            max_time = float(on_load)

    for entry in entries:
        entry_time = entry.get("time", -1)
        if isinstance(entry_time, (int, float)) and entry_time > max_time:
            max_time = float(entry_time)

    return max_time


def main_domain_for_engine(search_engine: str) -> str:
    """Return the first-party domain fragment for an engine label, or ''."""
    for label, domain in ENGINE_INFO.values():
        if label == search_engine:
            return domain
    return ""


def make_entry_rows(har_path: Path, har_data: dict | None = None) -> list[dict]:
    """Build one CSV row (keyed by ENTRY_FIELDS) per HAR log entry.

    Args:
        har_path: Path of the HAR file; its name determines the engine.
        har_data: Already parsed HAR content. When omitted the file is read,
            which keeps the original two-argument-free call working.
    """
    if har_data is None:
        har_data = read_har(har_path)
    entries = entries_from_har(har_data)
    search_engine = detect_search_engine(har_path)
    main_domain = main_domain_for_engine(search_engine)
    rows = []

    for index, entry in enumerate(entries, start=1):
        request = entry.get("request", {}) or {}
        response = entry.get("response", {}) or {}
        content = response.get("content", {}) or {}
        url = request.get("url", "")
        parsed = urlparse(url)
        request_cookies = request.get("cookies", []) or []
        response_cookies = response.get("cookies", []) or []
        query_items = request.get("queryString", []) or []
        domain = parsed.netloc.lower()
        path = parsed.path
        query_text = extract_query_text_from_url(url)
        third_party = "no"

        # Without a known first-party domain nothing is flagged third party.
        if main_domain and domain and main_domain not in domain:
            third_party = "yes"

        rows.append(
            {
                "har_filename": har_path.name,
                "search_engine": search_engine,
                "entry_index": index,
                "startedDateTime": entry.get("startedDateTime", ""),
                "time_ms": entry.get("time", ""),
                "method": request.get("method", ""),
                "url": url,
                "domain": domain,
                "path": path,
                "query_text": query_text,
                "status": response.get("status", ""),
                "statusText": response.get("statusText", ""),
                "request_cookie_count": len(request_cookies),
                "request_cookie_names": cookie_names(request_cookies),
                "request_cookie_values": cookie_values(request_cookies),
                "response_cookie_count": len(response_cookies),
                "response_cookie_names": cookie_names(response_cookies),
                "response_cookie_values": cookie_values(response_cookies),
                "query_param_count": len(query_items),
                "query_param_names": query_names(query_items),
                "query_param_values": query_values(query_items),
                "request_header_count": len(request.get("headers", []) or []),
                "response_header_count": len(response.get("headers", []) or []),
                "post_data_present": "yes" if request.get("postData") else "no",
                "request_body_size": request.get("bodySize", ""),
                "response_body_size": response.get("bodySize", ""),
                "response_content_size": content.get("size", ""),
                "transferred_bytes_approx": approximate_transferred_bytes(entry),
                "is_third_party_domain": third_party,
                "tracking_hint": has_tracking_hint(domain, path, url),
            }
        )

    return rows


def make_summary_row(
    har_path: Path, entry_rows: list[dict], har_data: dict | None = None
) -> dict:
    """Build the per-file summary row (keyed by SUMMARY_FIELDS).

    Args:
        har_path: Path of the HAR file.
        entry_rows: Rows produced by ``make_entry_rows`` for the same file.
        har_data: Already parsed HAR content; read from disk when omitted.
    """
    if har_data is None:
        har_data = read_har(har_path)
    entries = entries_from_har(har_data)
    pages = pages_from_har(har_data)
    domains = {row["domain"] for row in entry_rows if row["domain"]}
    status_counts = {2: 0, 3: 0, 4: 0, 5: 0}
    query_text = ""

    for row in entry_rows:
        # The first entry carrying a ``q`` parameter names the search query.
        if row["query_text"] and not query_text:
            query_text = row["query_text"]

        status = row["status"]
        if isinstance(status, int):
            group = status // 100
            if group in status_counts:
                status_counts[group] += 1

    transferred_bytes = sum(int(row["transferred_bytes_approx"]) for row in entry_rows)

    return {
        "har_filename": har_path.name,
        "search_engine": detect_search_engine(har_path),
        "query_text": query_text,
        "requests_total": len(entry_rows),
        "unique_domains": len(domains),
        "third_party_requests": sum(
            1 for row in entry_rows if row["is_third_party_domain"] == "yes"
        ),
        "request_cookies_total": sum(int(row["request_cookie_count"]) for row in entry_rows),
        "response_cookies_total": sum(
            int(row["response_cookie_count"]) for row in entry_rows
        ),
        "query_params_total": sum(int(row["query_param_count"]) for row in entry_rows),
        "post_requests_total": sum(1 for row in entry_rows if row["method"] == "POST"),
        "tracking_hint_requests": sum(1 for row in entry_rows if row["tracking_hint"] == "yes"),
        "transferred_kb_approx": round(transferred_bytes / 1024, 2),
        "page_load_ms": round(max_page_load_ms(entries, pages), 2),
        "status_2xx": status_counts[2],
        "status_3xx": status_counts[3],
        "status_4xx": status_counts[4],
        "status_5xx": status_counts[5],
    }


def write_csv(path: Path, fieldnames: list[str], rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 CSV, creating parent folders."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Convert HAR files to readable CSV files.")
    parser.add_argument(
        "--input-dir",
        type=Path,
        default=Path("data"),
        help="Folder with .har files. Default: data",
    )
    parser.add_argument(
        "--entries-output",
        type=Path,
        default=Path("har_entries.csv"),
        help="CSV with one row per log.entries item. Default: har_entries.csv",
    )
    parser.add_argument(
        "--summary-output",
        type=Path,
        default=Path("har_summary.csv"),
        help="CSV with one row per HAR file. Default: har_summary.csv",
    )
    return parser.parse_args(argv)


def main() -> None:
    """Convert every ``*.har`` file in the input folder to the two CSV files."""
    args = parse_args()
    har_files = sorted(args.input_dir.glob("*.har"))

    if not har_files:
        raise SystemExit(f"No HAR files found in {args.input_dir}")

    all_entry_rows = []
    summary_rows = []

    for har_path in har_files:
        # Parse each file once and share the result between both row builders.
        har_data = read_har(har_path)
        entry_rows = make_entry_rows(har_path, har_data)
        all_entry_rows.extend(entry_rows)
        summary_rows.append(make_summary_row(har_path, entry_rows, har_data))

    write_csv(args.entries_output, ENTRY_FIELDS, all_entry_rows)
    write_csv(args.summary_output, SUMMARY_FIELDS, summary_rows)

    print(f"Wrote {len(all_entry_rows)} entry rows to {args.entries_output}")
    print(f"Wrote {len(summary_rows)} summary rows to {args.summary_output}")


if __name__ == "__main__":
    main()
--output-dir normal_firefox \ + --timeout-ms 60000 +done diff --git a/report/scripts/power_query_etl.txt b/report/scripts/power_query_etl.txt new file mode 100644 index 0000000..adf670a --- /dev/null +++ b/report/scripts/power_query_etl.txt @@ -0,0 +1,82 @@ +let + Kilde = Csv.Document( + Web.Contents( + "https://example.sharepoint.com/.../tor_chromium/har_entries.csv" + ), + [ + Delimiter = ",", + Columns = 30, + QuoteStyle = QuoteStyle.None + ] + ), + + #"Promoted Headers" = + Table.PromoteHeaders( + Kilde, + [PromoteAllScalars = true] + ), + + #"Changed Column Types" = + Table.TransformColumnTypes( + #"Promoted Headers", + { + {"har_filename", type text}, + {"search_engine", type text}, + {"entry_index", Int64.Type}, + {"startedDateTime", type datetime}, + {"time_ms", type text}, + {"method", type text}, + {"url", type text}, + {"domain", type text}, + {"path", type text}, + {"query_text", type text}, + {"status", Int64.Type}, + {"statusText", type text}, + {"request_cookie_count", Int64.Type}, + {"response_cookie_count", Int64.Type}, + {"query_param_count", Int64.Type}, + {"request_header_count", Int64.Type}, + {"response_header_count", Int64.Type}, + {"tracking_hint", type text} + }, + "en" + ), + + #"Added Search Engine Column" = + Table.AddColumn( + #"Changed Column Types", + "SearchEngine", + each + if Text.Contains([har_filename], "bing") + then "Bing" + else if Text.Contains([har_filename], "google") + then "Google" + else if Text.Contains([har_filename], "duckduckgo") + then "DuckDuckGo" + else if Text.Contains([har_filename], "brave") + then "Brave" + else "Unknown" + ), + + #"Added Proxy Column" = + Table.TransformColumnTypes( + Table.AddColumn( + #"Added Search Engine Column", + "Proxy", + each "Tor" + ), + {{"Proxy", type text}} + ), + + #"Added Browser Column" = + Table.TransformColumnTypes( + Table.AddColumn( + #"Added Proxy Column", + "Browser", + each "Chromium" + ), + {{"Browser", type text}} + ) + +in + #"Added Browser Column" \ No newline at 
end of file diff --git a/report/scripts/power_query_merge.txt b/report/scripts/power_query_merge.txt new file mode 100644 index 0000000..d4987ee --- /dev/null +++ b/report/scripts/power_query_merge.txt @@ -0,0 +1,9 @@ +let + Kilde = Table.Combine({ + har_summary_normal_chromium, + har_summary_normal_firefox, + har_summary_tor_chromium, + har_summary_tor_firefox + }) +in + Kilde \ No newline at end of file diff --git a/report/sections/01A_theory.tex b/report/sections/01A_theory.tex index 26ccee5..a694e3b 100644 --- a/report/sections/01A_theory.tex +++ b/report/sections/01A_theory.tex @@ -1,21 +1,50 @@ \section{Theory\label{sec:theor}} -\subsection{EDA - Exploratory Data Analysis} +%\subsection{EDA - Exploratory Data Analysis} -\subsubsection{Data preprocessing and cleaning} -\subsubsection{Data reliability and consistency} -\subsubsection{Data visualisation principles} +%\subsubsection{Data preprocessing and cleaning} +%\subsubsection{Data reliability and consistency} +%\subsubsection{Data visualisation principles} -\subsection{Web traffic} +%\subsection{Web traffic} % HAR files -\subsubsection{HTTP requests and responses} +%\subsubsection{HTTP requests and responses} % -\subsubsection{Cookies and tracking parameters} +%\subsubsection{Cookies and tracking parameters} + + +%\subsubsection{Search engines and privacy} + + +%%% +\subsection{Web Tracking Technologies} + +\subsubsection{HTTP Requests and Responses} + +\subsubsection{Cookies and Tracking Parameters} + +\subsection{Data Collection and Preprocessing} + +\subsubsection{HAR Files} + +\subsubsection{Data Preprocessing and Cleaning} + +\subsection{Exploratory Data Analysis (EDA)} + +\subsection{Data Visualisation Principles} + + + +% “data preprocessing” → du gjør faktisk HAR → CSV-transformasjon +% “EDA” → du lager konkrete Power BI-visualiseringer +% “data pipeline” → du automatiserer hele workflowen +% “data reliability” → du standardiserer browser/proxy conditions +% “data collection methodology” → du 
dokumenterer Playwright/Tor-oppsettet + -\subsubsection{Search engines and privacy} \ No newline at end of file diff --git a/report/sections/02_method.tex b/report/sections/02_method.tex index 747faee..c9c8f40 100644 --- a/report/sections/02_method.tex +++ b/report/sections/02_method.tex @@ -2,7 +2,7 @@ %This section describes the methodology used throughout the research process. Some technical concepts and terminology referenced in this section are further explained in the Theory section and later discussed in the Discussion section. -This section describes the methodology used in this research. Any technical concepts and terminology references in this section are further explained in Section~\ref{sec:theor}, and discussed in Section~\ref{sec:discu}. +This section describes the methodology used in this research. Any technical concepts and terminology references in this section are further explained in \autoref{sec:theor}, and discussed in \autoref{sec:discu}. \subsection{Research design\label{sec:metho:research_design}} % Stikkord: @@ -59,11 +59,16 @@ Data collection is performed using either a Tor proxy to help hide the identity %When you tap ctrl + shift + C -When pressing \texttt{Ctrl + Shift + C} and click on \texttt{Network}, a log of Network traffic shows up. This window is open and the web-history is emptied before performing a web-search manually. This process gives a clean anonymous log web traffic from only one web query as known in Figure~\ref{fig:metho:manually_har}. Right to \texttt{"No throttling"} is a settings icon. Clicking on that bottom gives the options on Figure~\ref{fig:metho:export_har}. -Each query could be done manually, or the processes of collecting data could be automated. For this research the process of collecting first-hand raw data was automated. A tool used to automate web-queries is python using Playwright \parencite{Playwright}. 
Playwright is installed in a virtual environment packages using python \parencite{VENV} \texttt{script/capture\_search\_har.py} +When pressing \texttt{Ctrl + Shift + C} and click on \texttt{Network}, a log of Network traffic shows up. This window is open and the web-history is emptied before performing a web-search manually. This process gives a clean anonymous log web traffic from only one web query as known in \autoref{fig:metho:manually_har}. Right to \texttt{"No throttling"} is a settings icon. Clicking on that bottom gives the options on \autoref{fig:metho:export_har}. +Each query could be done manually, or the processes of collecting data could be automated. For this research the process of collecting first-hand raw data was automated. A tool used to automate web-queries is python using Playwright \parencite{Playwright}. Playwright is installed in a virtual environment packages using python \parencite{VENV}. All collection of data is done in Linux shell, and doing EDA is done in Microsoft PowerBI. + +Once the installation done, web-browsers of choice may be installed inside the virtual environment. Firefox and chromium were installed inside the virtual environment. Now the environment for retrieving raw, real-world-event data for this analysis. -\begin{figure}[h] + + + +\begin{figure}[H] \centering \includegraphics[ width=\linewidth, @@ -105,6 +110,38 @@ Each query could be done manually, or the processes of collecting data could be % HTTP status groups \subsection{Data collection\label{sec:metho:data_collection}} + + + +Three Scripts, two python files and one bash files are used to automate the data collection process. The files can be found under folder \texttt{./scripts/}. The bash file (\path{./scripts/many_search.sh}) uses the python file (\path{./scripts/capture_search_har.py}) to automate the process of retrieving data. It essentially loops for each query used, and each web-browser used, and for each proxy used and stores them into different folders. 
+ + +\begin{lstlisting}[language=bash, caption={Playwright data collection command}, label={lst:metho:playwright_command}, basicstyle=\ttfamily\small, breaklines=true] +capture_search_har \ + --query "weather oslo" \ + --browser chromium \ + --wait-until load \ + --headed \ + --output-dir tor_chromium \ + --proxy socks5://127.0.0.1:9050 +\end{lstlisting} + +\autoref{lst:metho:playwright_command} shows an example command that uses the Python script to automate the retrieval of the data. It opens a web-browser, performs a web-search, and saves the output to a directory of choice, or to the default \texttt{data} directory. Input \verb|--query| is the only mandatory input for this function. All other options are optional and fall back to a default value. Input \verb|--proxy| is optional; if it is not used, the desktop's current public IP address is used to perform the web-search. If the \verb|--proxy| option is specified, the provided value will be used as the proxy endpoint. This means that the search engine would only see, in this instance, the Tor endpoint and its public IP address when the web-search is performed \parencite{TOR}. \verb|--headed| and \verb|--wait-until load| are important. The first one tells Playwright to open a visible browser window when performing a web-search, and not just make an \texttt{HTTPS} call; the second one tells Playwright to wait until the page is fully loaded. The rest of the inputs are self-explanatory. + +After retrieving all the HAR files through the automated Python workflow, the dataset is ready for processing. 
+ + + + + + + + + + + + + % Stikkord: % HAR files % one HAR file per search engine/query/browser/network mode @@ -115,6 +152,35 @@ Each query could be done manually, or the processes of collecting data could be % Tor via SOCKS proxy where applicable \subsection{Data processing\label{sec:metho:data_processing}} + +This section contains several important steps in the data processing pipeline, all leading to the exploratory data analysis (EDA) performed in Microsoft Power~BI. As of now, the data set is segregated into several HAR files, which are unreadable to Microsoft Power~BI. This section is a step-by-step process from raw data collection to finished visualized tables in Power~BI. + + + +\subsubsection{From HAR to CSV files\label{sec:metho:har_to_csv}} + +In order to perform data analysis in Power~BI, the data set had to be converted from HAR files to CSV files. After the data had been collected in \autoref{sec:metho:data_collection}, the data entries needed to be prepared before they could be extracted, transformed and loaded into tables in \autoref{sec:metho:etl}. + +A python script at (\path{./scripts/har_entries_to_csv.py}) reads all the \texttt{.har} files in folder \texttt{./data/} and writes two output files. The first one is \texttt{./har\_entries.csv}, and the second one is \texttt{./har\_summary.csv}. Four of each of those files were created, one for each combination of proxy type and web-browser. The working directory decides which proxy and web-browser are used for the current data collection. More on that in \autoref{sec:discu}. \texttt{./har\_entries.csv} contains every request from the web-search as one entry, or one row in the csv file. \texttt{./har\_summary.csv} summarises its respective \texttt{./har\_entries.csv} file. This means that it takes the input from several \texttt{.har} files and summarises each \texttt{.har} file in one row. 
In contrast, the output in file \texttt{./har\_entries.csv} does not summaries the \texttt{.har} files, it takes the raw data and presents one entry in a \texttt{.har} as one row in file \texttt{./har\_entries.csv}, and does not do any data processing. + +The file \texttt{./har\_summary.csv} was discarded in favour of \texttt{./har\_entries.csv}. It contains the raw data, and will be used on the ETL process in Power~BI in \autoref{sec:metho:etl} + + + +\subsubsection{ETL process in Power~BI\label{sec:metho:etl}} + +ETL stands for Extract, Transform, and Load. Some of the Extract and Transform process was done in \autoref{sec:metho:har_to_csv}. The dataset is not ready to be loaded and merged into Power~BI. + +%The dataset is separated in four \texttt{CSV} files in each folder which represents the case of those entries. For instance the raw data in the case of proxy is Tor, and browser used is Chromium, the location of the dataset is as following: \texttt{./tor_chromium}, as \autoref{lst:metho:playwright_command} indicates. + +The dataset is separated into four \texttt{CSV} files in each folder, which represents the case of those entries. For instance, if the proxy used is Tor and the browser used is Chromium, the location of the dataset is as follows: \texttt{./tor\_chromium/}, as \autoref{lst:metho:playwright_command} indicates. The \texttt{CSV} files consist of equal file name, which is generated from the python script \texttt{capture\_search\_har}. All \texttt{CSV} files was loaded into each segregated folders in the student private working area at SharePoint. \autoref{lst:appen:pq} in \autoref{sec:appen:pq} shows the total query done in Power~BI for each instance. When a \texttt{CSV} file is loaded as source into Power~BI, some autoformatting is done by Power~BI itself, and \autoref{lst:appen:pq} illustrate the code Power~BI generates, and some more formatting. 
+ + +As explained, only the folder name describes which proxy and which browser were used for each instance. To account for this, each \texttt{./har\_entries.csv} had to be loaded manually into Power~BI for each instance. Once one table was loaded, two new columns had to be added which specified the proxy and browser for the current entries. After this was done, all four tables could be merged into one main table. Before merging, each table was named as follows: \texttt{har\_entries\_\textit{proxy}\_\textit{browser}} for its respective proxy and browser (for example \texttt{har\_entries\_tor\_chromium}). Once merged, the new table got the name \texttt{har\_entries\_all}, which was further used for creating tables for this work. Those tables are presented in \autoref{sec:resul}. + + +Finally, for all observations, the whole table \texttt{har\_entries\_all} was filtered so that the variable \texttt{tracking\_hint} is equal to \texttt{yes}. This means that every entry without any hint of tracking was filtered out. + % Stikkord: % HAR files converted to CSV % har_entries.csv: one row per HAR entry/request @@ -125,6 +191,9 @@ Each query could be done manually, or the processes of collecting data could be \subsection{Limitations of the method\label{sec:metho:limitations}} + +Some limitations of this work are related to the process in which the analysis is performed. When retrieving a \texttt{.har} file, the text file is unstructured and contains large amounts of data noise. The scripts do not always guarantee data consistency. The output files did not specify which proxy or browser was used. + % Stikkord: % HAR shows observable browser-side traffic only % cannot prove server-side storage diff --git a/report/sections/03_results.tex b/report/sections/03_results.tex index 6efdeb0..127e3b6 100644 --- a/report/sections/03_results.tex +++ b/report/sections/03_results.tex @@ -4,6 +4,8 @@ %\includegraphics[width=\linewidth]{figures/pdf/01_3rdparty.pdf} +This section introduces all the findings in this work. 
The main priority is the variable \texttt{tracking\_hint}, as the work tries to identify relationships between tracking and cookies. Every graph in this work is filtered for \texttt{tracking\_hint=yes}. This means that the retrieved tables are larger than those visualized in this work. + \begin{figure}[h] \centering \includegraphics[ @@ -14,6 +16,11 @@ \end{figure} +The first figure, \autoref{fig:resul:02_browser}, illustrates the distribution of tracking hints across browsers and search engines. DuckDuckGo appears to be the search engine for which the most tracking hints are identified. Bing and Brave appear to be more conservative and show few tracking hints. Only for DuckDuckGo and Google does the choice of web-browser seem to play a crucial role. Chromium shows more tracking hints for Google than Firefox does. + + +Furthermore, request cookie counts and response cookie counts are presented below and are the main focus of the remaining results. + \begin{figure}[h] \centering \includegraphics[ @@ -24,6 +31,11 @@ \end{figure} +Figures~\ref{fig:resul:03_proxy_request} and~\ref{fig:resul:04_proxy_response} illustrate the cookie counts for the entries flagged as tracking hints. DuckDuckGo and Brave show no cookies across all entries flagged by \texttt{tracking\_hint}. In contrast, only Bing and Google use cookies on entries identified as tracking hints. 
+ + + + \begin{figure}[h] \centering \includegraphics[ @@ -33,6 +45,9 @@ \label{fig:resul:04_proxy_response} \end{figure} + + + \begin{figure}[h] \centering \includegraphics[ diff --git a/report/sections/99_appendix.tex b/report/sections/99_appendix.tex new file mode 100644 index 0000000..d965968 --- /dev/null +++ b/report/sections/99_appendix.tex @@ -0,0 +1,54 @@ +\section{Appendices\label{sec:appen}} + + +\subsection{Automation scripts\label{sec:appen:auto}} + + +\lstinputlisting[ + basicstyle=\ttfamily\scriptsize, + language={}, + caption={Bash automation script}, + breaklines=true, + label={lst:appen:bash} +]{scripts/many_search.sh} + + + +\lstinputlisting[ + basicstyle=\ttfamily\scriptsize, + language={}, + caption={Python capture HAR script}, + breaklines=true, + label={lst:appen:capture} +]{scripts/capture_search_har.py} + + +\lstinputlisting[ + basicstyle=\ttfamily\scriptsize, + language={}, + caption={Python HAR to CSV script}, + breaklines=true, + label={lst:appen:hartocsv} +]{scripts/har_entries_to_csv.py} + + + + +\subsection{Power Query transformation\label{sec:appen:pq}} + +\lstinputlisting[ + basicstyle=\ttfamily\scriptsize, + language={}, + caption={Power Query ETL script}, + breaklines=true, + label={lst:appen:pq} +]{scripts/power_query_etl.txt} + +\lstinputlisting[ + basicstyle=\ttfamily\scriptsize, + language={}, + caption={Power Query merge script}, + breaklines=true, + label={lst:appen:merge} +]{scripts/power_query_merge.txt} +