"""Fraud-header compliance checks.

Two layers:

1. **Local shape assertions** — pure-python checks that every mandatory
   Gov-Client-*/Gov-Vendor-* header is present and shaped per HMRC spec.
   Runs in every CI build.
2. **HMRC validator API smoke test** (`test_headers_pass_hmrc_validator`):
   sends a GET to the HMRC sandbox validator with the generated header set
   attached as real request headers, and asserts a 200 whose `code` is not
   `INVALID_HEADERS`. Skipped unless both the `HMRC_VALIDATOR` and
   `HMRC_SANDBOX_TOKEN` env vars are set, so `pytest` still runs fine
   offline.

HMRC audits fraud headers during production-access review — a failing
validator smoke test MUST block deploy.

Spec references (primary):
  https://developer.service.hmrc.gov.uk/guides/fraud-prevention/connection-method/batch-process-direct/
  https://developer.service.hmrc.gov.uk/api-documentation/docs/api/service/txm-fph-validator-api/1.0
"""

from __future__ import annotations

import hashlib
import os
import re

import httpx
import pytest

from hmrc_sync.fraud_headers import (
    CONNECTION_METHOD_BATCH,
    CONNECTION_METHOD_WEB_APP,
    RUNTIME_CONTEXT,
    VENDOR_LICENSE_ID,
    VENDOR_PRODUCT_NAME,
    RuntimeContext,
    SessionContext,
    as_validator_payload,
    build_headers,
)

# Endpoint queried by test_headers_pass_hmrc_validator.
VALIDATOR_URL = (
    "https://test-api.service.hmrc.gov.uk/test/fraud-prevention-headers/validate")

# Per HMRC BATCH_PROCESS_DIRECT spec (11 mandatory headers).
BATCH_MANDATORY = {
    "Gov-Client-Connection-Method",
    "Gov-Client-Device-ID",
    "Gov-Client-Local-IPs",
    "Gov-Client-Local-IPs-Timestamp",
    "Gov-Client-MAC-Addresses",
    "Gov-Client-Timezone",
    "Gov-Client-User-Agent",
    "Gov-Client-User-IDs",
    "Gov-Vendor-License-IDs",
    "Gov-Vendor-Product-Name",
    "Gov-Vendor-Version",
}

# WEB_APP_VIA_SERVER adds browser-origin context on top of the batch set.
WEB_APP_EXTRAS = {
    "Gov-Client-Screens",
    "Gov-Client-Window-Size",
    "Gov-Client-Public-IP",
    "Gov-Client-Public-Port",
}


def _full_session() -> SessionContext:
    """Return a fixed SessionContext with screen, window, timezone, device
    and public IP/port fields all set, for the browser-style tests."""
    return SessionContext(
        user_agent="Mozilla/5.0 (X11; Linux x86_64) hmrc-sync-test",
        screen_width=1920,
        screen_height=1080,
        screen_colour_depth=24,
        window_width=1600,
        window_height=900,
        timezone_offset=0,
        device_id="6c3a9f60-1111-2222-3333-abcdef012345",
        public_ip="203.0.113.5",
        public_port=443,
    )


# --------------------------------------------------------------------------
# BATCH_PROCESS_DIRECT — the CronJob path. All 11 headers must be present.
# --------------------------------------------------------------------------


def test_batch_process_includes_all_11_mandatory_headers() -> None:
    """Every header named in BATCH_MANDATORY is emitted for the batch method."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    missing = BATCH_MANDATORY - hdrs.keys()
    assert not missing, f"BATCH_PROCESS_DIRECT missing mandatory headers: {missing}"


def test_batch_process_omits_browser_only_headers() -> None:
    """None of the WEB_APP_EXTRAS headers may appear on the batch path."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    # Screens / Window-Size are browser-origin; Public-IP/Port route via a
    # client-facing IP which doesn't apply to a batch job.
    # Driven by WEB_APP_EXTRAS (rather than a second hand-typed list) so the
    # two can't drift apart; collect every offender so one run reports all.
    leaked = sorted(h for h in WEB_APP_EXTRAS if h in hdrs)
    assert not leaked, f"BATCH emitted browser-only headers: {leaked}"


def test_batch_process_connection_method_value() -> None:
    """The connection-method header carries the literal batch identifier."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    assert hdrs["Gov-Client-Connection-Method"] == "BATCH_PROCESS_DIRECT"


# --------------------------------------------------------------------------
# Header-value shape assertions (per HMRC spec).
# --------------------------------------------------------------------------


def test_user_ids_starts_with_os_field() -> None:
    """Per spec: `os=<os-user>&<key>=<value>`. `os=` is mandatory."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-User-IDs"]
    assert value.startswith("os="), f"User-IDs missing os= prefix: {value!r}"
    # Key-value pairs separated by & — at least one beyond `os=`.
    pairs = value.split("&")
    assert len(pairs) >= 2, f"User-IDs should have app identifier too: {value!r}"
    # A bare count is satisfied by "os=&" (empty trailing segment); require
    # each segment to actually be a key=value pair.
    malformed = [pair for pair in pairs if "=" not in pair]
    assert not malformed, (
        f"User-IDs has non key=value segments {malformed!r}: {value!r}")


def test_user_agent_has_all_four_spec_fields() -> None:
    """Spec: `os-family=…&os-version=…&device-manufacturer=…&device-model=…`."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-User-Agent"]
    for key in ("os-family=", "os-version=", "device-manufacturer=",
                "device-model="):
        assert key in value, f"User-Agent missing {key!r}: {value!r}"


def test_mac_addresses_percent_encoded() -> None:
    """Spec: colons in MACs must be percent-encoded (%3A)."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-MAC-Addresses"]
    assert value, "MAC-Addresses must never be empty"
    assert ":" not in value, f"MAC-Addresses contains raw colons: {value!r}"
    assert "%3A" in value, f"MAC-Addresses must use %3A: {value!r}"


def test_local_ips_ipv6_percent_encoded() -> None:
    """IPv6 entries percent-encoded; IPv4 passes through."""
    hdrs = build_headers(
        connection_method=CONNECTION_METHOD_BATCH,
        runtime=_runtime_with_ips(["10.0.0.4", "fe80::1"]),
    )
    value = hdrs["Gov-Client-Local-IPs"]
    assert "10.0.0.4" in value
    assert "fe80::1" not in value  # raw v6 forbidden
    assert "fe80%3A%3A1" in value, f"IPv6 not encoded: {value!r}"


def test_vendor_license_id_is_sha256_hashed() -> None:
    """Spec: `Gov-Vendor-License-IDs: <product-name>=<sha256-hex-of-license>`."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Vendor-License-IDs"]
    expected_hash = hashlib.sha256(VENDOR_LICENSE_ID.encode()).hexdigest()
    assert value == f"{VENDOR_PRODUCT_NAME}={expected_hash}", value
    # Hash must be 64 hex chars — catches accidental plaintext leakage.
    assert re.fullmatch(r"[a-z0-9-]+=[0-9a-f]{64}", value), value


def test_vendor_product_name_percent_encoded() -> None:
    """Pin the emitted product name; it has no reserved characters, so the
    expected header value is the plain name."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    assert hdrs["Gov-Vendor-Product-Name"] == "hmrc-sync"  # no reserved chars in name


def test_vendor_version_format() -> None:
    """Value must look like `<component>=<major>.<minor>.<patch>`."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Vendor-Version"]
    assert re.fullmatch(r"[a-z0-9-]+=\d+\.\d+\.\d+", value), value


def test_local_ips_timestamp_spec_format() -> None:
    """Spec: `yyyy-MM-ddThh:mm:ss.sssZ` — 24-hour, UTC, 3-digit millis."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-Local-IPs-Timestamp"]
    assert re.fullmatch(
        r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z", value), value


def test_timezone_utc_offset_format() -> None:
    """Value must look like `UTC+hh:mm` / `UTC-hh:mm`."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-Timezone"]
    # Attach the value so a failure shows what was actually emitted.
    assert re.fullmatch(r"UTC[+-]\d{2}:\d{2}", value), value


def test_device_id_is_valid_uuid() -> None:
    """UUID shape check: 8-4-4-4-12 hex — applies to fallback too."""
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    value = hdrs["Gov-Client-Device-ID"]
    assert re.fullmatch(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        value,
    ), value


# --------------------------------------------------------------------------
# MFA gating + per-call variance.
# --------------------------------------------------------------------------


def test_mfa_timestamp_only_emitted_for_mfa_method() -> None:
    """The MFA timestamp header is absent for batch and echoed for AUTH_USING_MFA."""
    stamp = "2026-04-19T21:30:00.000Z"

    batch_hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    assert "Gov-Client-MFA-Timestamp" not in batch_hdrs

    ctx = _full_session()
    ctx.mfa_timestamp = stamp
    mfa_hdrs = build_headers(ctx, connection_method="AUTH_USING_MFA")
    assert mfa_hdrs.get("Gov-Client-MFA-Timestamp") == stamp


def test_correlation_id_differs_per_call() -> None:
    """Two consecutive builds must not share an x-correlation-id."""
    first, second = (
        build_headers(connection_method=CONNECTION_METHOD_BATCH)
        for _ in range(2)
    )
    assert first["x-correlation-id"] != second["x-correlation-id"]


# --------------------------------------------------------------------------
# WEB_APP_VIA_SERVER — batch set + browser extras.
# --------------------------------------------------------------------------


def test_web_app_includes_batch_mandatory_plus_browser_extras() -> None:
    """A full session on the web-app method yields the batch set and the extras."""
    web_hdrs = build_headers(
        _full_session(), connection_method=CONNECTION_METHOD_WEB_APP)
    required = BATCH_MANDATORY | WEB_APP_EXTRAS
    missing = required - web_hdrs.keys()
    assert not missing, f"WEB_APP missing headers: {missing}"


# --------------------------------------------------------------------------
# Payload reshape (used by the validator smoke test + CI self-tests).
# --------------------------------------------------------------------------


def test_as_validator_payload_reshape() -> None:
    """A header mapping becomes an ordered list of name/value dicts under "headers"."""
    entries = [
        ("Gov-Client-Connection-Method", "X"),
        ("Gov-Vendor-Product-Name", "y"),
    ]
    reshaped = as_validator_payload(dict(entries))
    assert reshaped["headers"] == [
        {"name": name, "value": value} for name, value in entries
    ]


# --------------------------------------------------------------------------
# HMRC sandbox validator smoke test — set HMRC_VALIDATOR=1 and
# HMRC_SANDBOX_TOKEN to enable.
# --------------------------------------------------------------------------


@pytest.mark.skipif(
    not (os.environ.get("HMRC_VALIDATOR") and os.environ.get("HMRC_SANDBOX_TOKEN")),
    reason=("HMRC sandbox validator smoke test — set HMRC_VALIDATOR=1 AND "
            "HMRC_SANDBOX_TOKEN=<bearer-token>. Dev Hub app must be subscribed "
            "to txm-fph-validator-api/1.0 (application-restricted)."),
)
def test_headers_pass_hmrc_validator() -> None:
    """GET /test/fraud-prevention-headers/validate with BATCH headers.

    Per the OAS spec the validator is a GET endpoint — headers go in the
    actual HTTP request, not a JSON body. Auth is application-restricted
    (client_credentials bearer).

    A successful response has code=VALID_HEADERS; POTENTIALLY_INVALID_HEADERS
    emits warnings but still passes; INVALID_HEADERS is a hard fail. Any
    other (or missing) code is also treated as a failure, so a malformed
    200 response cannot slip through as a pass.
    """
    hdrs = build_headers(connection_method=CONNECTION_METHOD_BATCH)
    request_headers = {
        **hdrs,
        "Accept": "application/vnd.hmrc.1.0+json",
        "Authorization": f"Bearer {os.environ['HMRC_SANDBOX_TOKEN']}",
    }
    resp = httpx.get(VALIDATOR_URL, headers=request_headers, timeout=30.0)
    assert resp.status_code == 200, (
        f"validator refused: {resp.status_code} {resp.text[:500]}")
    body = resp.json()
    code = body.get("code")
    assert code != "INVALID_HEADERS", f"validator rejected: {body}"
    # Allow-list the two passing codes: without this a 200 body with no
    # `code` field (or an unrecognised one) would count as success.
    assert code in ("VALID_HEADERS", "POTENTIALLY_INVALID_HEADERS"), (
        f"validator returned unexpected code {code!r}: {body}")
    # POTENTIALLY_INVALID_HEADERS is allowed — HMRC surfaces them as warnings;
    # log for visibility but don't fail the build on them.
    # NOTE: pytest captures stdout of passing tests; run with `-s` or `-rP`
    # to actually see this output.
    if code == "POTENTIALLY_INVALID_HEADERS":
        print(f"validator warnings: {body.get('warnings')}")


def _runtime_with_ips(ips: list[str]) -> RuntimeContext:
    """Build a RuntimeContext override with caller-specified local_ips.

    Every other field is copied from the module-level RUNTIME_CONTEXT, so
    only the local IP list differs from the default runtime.
    """
    return RuntimeContext(
        mac_addresses=RUNTIME_CONTEXT.mac_addresses,
        local_ips=ips,
        os_family=RUNTIME_CONTEXT.os_family,
        os_version=RUNTIME_CONTEXT.os_version,
        device_manufacturer=RUNTIME_CONTEXT.device_manufacturer,
        device_model=RUNTIME_CONTEXT.device_model,
        os_user=RUNTIME_CONTEXT.os_user,
    )