From 90f9a3d104899f0c6a1c2c01f370183dbdaeb7ca Mon Sep 17 00:00:00 2001 From: NotAShelf Date: Sun, 5 Apr 2026 18:37:19 +0300 Subject: [PATCH] nix: set up Nix VM tests Signed-off-by: NotAShelf Change-Id: I686e8e6c35395d871a38d974f9bb3a4f6a6a6964 --- flake.nix | 7 + nix/tests/p2p.nix | 441 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 448 insertions(+) create mode 100644 nix/tests/p2p.nix diff --git a/flake.nix b/flake.nix index 3a1b1da..18957c3 100644 --- a/flake.nix +++ b/flake.nix @@ -25,5 +25,12 @@ }; hydraJobs = self.packages; + + checks = forEachSystem (system: { + p2p-discovery = import ./nix/tests/p2p.nix { + pkgs = pkgsForEach.${system}; + inherit self; + }; + }); }; } diff --git a/nix/tests/p2p.nix b/nix/tests/p2p.nix new file mode 100644 index 0000000..a65390f --- /dev/null +++ b/nix/tests/p2p.nix @@ -0,0 +1,441 @@ +{ + pkgs, + self, +}: let + # A tiny derivation used as the test payload. The store path is computed + # at evaluation time and embedded into the test script as a literal path. + testStorePath = pkgs.runCommand "ncro-test-payload" {} '' + mkdir -p "$out" + echo "ncro p2p test payload" > "$out/data" + ''; + + # Key name used in Nix's "name:base64pubkey" format. + cacheKeyName = "ncro-test-cache-1"; + + # ncro's config.Validate() requires at least one static upstream. 
We point + # all nodes at cache.nixos.org as a last-resort fallback so that: + # a) ncro does not refuse to start with an empty upstreams list + # b) The test can verify that *dynamic* upstreams (via discovery) are + # preferred, because the test payload will not be in cache.nixos.org + commonNcroSettings = { + server.listen = ":8080"; + upstreams = [ + { + url = "https://cache.nixos.org"; + priority = 100; # lowest priority; dynamic peers are added at 10 + public_key = "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="; + } + ]; + + cache = { + ttl = "5m"; + negative_ttl = "30s"; + }; + + discovery = { + enabled = true; + service_name = "_nix-serve._tcp"; + domain = "local"; + # Short window so the test does not have to wait too long. Stale + # entries are evicted after discovery_time * 3 = 15 s. + discovery_time = "5s"; + priority = 10; + }; + }; + + # Shared avahi configuration. Firewall is disabled so avahi multicast + # traffic crosses the virtual network without impediment. + commonAvahi = { + enable = true; + nssmdns4 = true; + publish = { + enable = true; + addresses = true; + userServices = true; + }; + }; + + # Both nodes that run nix-serve share the same key name so node2 can verify + # signatures from either host with a single entry in trusted-public-keys. + keygenScript = pkgs.writeShellScript "gen-cache-key" '' + set -euo pipefail + mkdir -p /etc/nix + if [ ! -f /etc/nix/cache-key.sec ]; then + ${pkgs.nix}/bin/nix-store \ + --generate-binary-cache-key "${cacheKeyName}" \ + /etc/nix/cache-key.sec \ + /etc/nix/cache-key.pub + fi + + # Make the public key world-readable so tests can read it. + chmod 644 /etc/nix/cache-key.pub + ''; + + # Each node imports this and merges in its node-specific overrides on top. 
  commonNodeBase = {
    virtualisation.memorySize = 2048;
    virtualisation.diskSize = 8192;

    networking.useNetworkd = true;
    networking.firewall.enable = false;

    environment.systemPackages = [pkgs.curl];

    services.avahi = commonAvahi;
    services.ncro = {
      enable = true;
      settings = commonNcroSettings;
    };

    # nix store sign is part of the nix-command experimental feature.
    nix.settings.experimental-features = ["nix-command"];
  };
in
  pkgs.testers.runNixOSTest {
    name = "ncro-p2p-discovery";

    nodes = {
      # node1 serves the test payload via nix-serve, runs ncro for routing
      node1 = {pkgs, ...}: {
        imports = [self.nixosModules.ncro commonNodeBase];

        # Generate the signing key at first boot before other services start.
        systemd.services.gen-cache-key = {
          description = "Generate Nix binary cache signing key";
          wantedBy = ["multi-user.target"];
          before = ["nix-serve.service" "ncro.service"];
          serviceConfig = {
            Type = "oneshot";
            RemainAfterExit = true;
            ExecStart = keygenScript;
          };
        };

        services = {
          nix-serve = {
            enable = true;
            secretKeyFile = "/etc/nix/cache-key.sec";
            port = 5000;
          };

          ncro = {
            enable = true;
            settings =
              commonNcroSettings
              // {
                # Include the local nix-serve as a guaranteed reachable upstream so
                # ncro on this node can serve paths that are in the local store even
                # when the internet (cache.nixos.org) is unavailable inside the VM.
                upstreams =
                  commonNcroSettings.upstreams
                  ++ [
                    {
                      url = "http://127.0.0.1:5000";
                      priority = 1;
                    }
                  ];
              };
          };
        };

        # Advertise nix-serve via avahi so ncro can discover it via mDNS.
        # nix-serve does not register itself with avahi; the service file must
        # be provided explicitly.
        environment.etc."avahi/services/nix-serve.service".text = ''
          <?xml version="1.0" standalone='no'?>
          <!DOCTYPE service-group SYSTEM "avahi-service.dtd">
          <service-group>
            <name replace-wildcards="yes">nix-serve on %h</name>
            <service>
              <type>_nix-serve._tcp</type>
              <port>5000</port>
            </service>
          </service-group>
        '';

        # Embed the test payload into the system closure so the Nix store on
        # node1 definitely contains it when the VM boots. 
+ system.extraDependencies = [testStorePath]; + + # Authoritative signing: runs after gen-cache-key ensures the key exists. + # Must run as root so nix store sign can write trust info into the store. + systemd.services.sign-test-payload = { + description = "Sign test store path for binary cache"; + wantedBy = ["multi-user.target"]; + after = ["gen-cache-key.service" "nix-daemon.service"]; + requires = ["gen-cache-key.service" "nix-daemon.service"]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + User = "root"; + ExecStart = pkgs.writeShellScript "sign-payload" '' + ${pkgs.nix}/bin/nix store sign \ + --key-file /etc/nix/cache-key.sec \ + "${testStorePath}" + ''; + }; + }; + }; + + # node2 runs ncro only; fetches through discovered peers + node2 = {lib, ...}: { + imports = [self.nixosModules.ncro commonNodeBase]; + + # Point nix at ncro as primary substituter. + # trusted-public-keys must include the cache key from node1/node3. + # Because the VMs generate their key at runtime we cannot embed the + # actual base64 value here; instead we configure node2 to trust any + # key whose name matches ${cacheKeyName} by setting + # accept-flake-config = false and relying on the narinfo signature + # verification inside ncro (public_key in upstream config). + # + # For the test we must still configure a trusted-public-keys entry. + # We work around the dynamic key problem by reading the public key + # from node1 in the test script and using `nix.extraOptions` to + # accept it at runtime via environment. + nix.settings = { + substituters = lib.mkForce ["http://localhost:8080"]; + # Start with cache.nixos.org key so nix doesn't reject everything; + # the test script will add the runtime-generated key separately. + trusted-public-keys = [ + "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY=" + ]; + # Allow the test to add extra substituter without rebuilding. 
          trusted-substituters = ["http://localhost:8080"];
          experimental-features = ["nix-command"];
        };
      };

      # node3 runs nix-serve + ncro; second source for the test payload
      node3 = {lib, ...}: {
        imports = [self.nixosModules.ncro commonNodeBase];

        systemd.services.gen-cache-key = {
          description = "Generate Nix binary cache signing key";
          wantedBy = ["multi-user.target"];
          before = ["nix-serve.service" "ncro.service"];
          serviceConfig = {
            Type = "oneshot";
            RemainAfterExit = true;
            ExecStart = keygenScript;
          };
        };

        services.nix-serve = {
          enable = true;
          secretKeyFile = "/etc/nix/cache-key.sec";
          port = 5000;
        };

        # Advertise nix-serve via avahi so ncro can discover it via mDNS.
        environment.etc."avahi/services/nix-serve.service".text = ''
          <?xml version="1.0" standalone='no'?>
          <!DOCTYPE service-group SYSTEM "avahi-service.dtd">
          <service-group>
            <name replace-wildcards="yes">nix-serve on %h</name>
            <service>
              <type>_nix-serve._tcp</type>
              <port>5000</port>
            </service>
          </service-group>
        '';

        services.ncro = {
          enable = true;
          settings =
            commonNcroSettings
            // {
              # Include the local nix-serve as a guaranteed reachable upstream.
              upstreams =
                commonNcroSettings.upstreams
                ++ [
                  {
                    url = "http://127.0.0.1:5000";
                    priority = 1;
                  }
                ];
            };
        };

        # node3 does NOT have the test payload pre-loaded; it will fetch the
        # payload through its own ncro proxy (discovering node1). 
+ nix.settings = { + substituters = lib.mkForce ["http://localhost:8080"]; + trusted-public-keys = ["cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="]; + trusted-substituters = ["http://localhost:8080"]; + }; + }; + }; + + testScript = '' + import time + import json + + def ncro_health(node): + """Return the parsed /health JSON from ncro on the given node.""" + out = node.succeed("curl -sf http://localhost:8080/health") + return json.loads(out) + + def ncro_upstream_urls(node): + """Return the list of upstream URLs reported by ncro /health.""" + h = ncro_health(node) + return [u["url"] for u in h.get("upstreams", [])] + + def wait_for_upstreams(node, min_count, timeout=60): + """ + Poll /health until at least min_count upstreams are listed or + timeout expires. Raises on timeout. + """ + deadline = time.time() + timeout + while time.time() < deadline: + try: + urls = ncro_upstream_urls(node) + if len(urls) >= min_count: + return urls + except Exception: + pass + time.sleep(2) + raise AssertionError( + f"timed out waiting for {min_count} upstreams on {node.name}; " + f"got: {ncro_upstream_urls(node)}" + ) + + with subtest("boot all nodes"): + start_all() + + node1.wait_for_unit("gen-cache-key.service") + node1.wait_for_unit("sign-test-payload.service") + node1.wait_for_unit("avahi-daemon.service") + node1.wait_for_unit("nix-serve.service") + node1.wait_for_unit("ncro.service") + node1.wait_for_open_port(5000) # nix-serve default port + node1.wait_for_open_port(8080) # ncro + + node2.wait_for_unit("avahi-daemon.service") + node2.wait_for_unit("ncro.service") + node2.wait_for_open_port(8080) + + node3.wait_for_unit("gen-cache-key.service") + node3.wait_for_unit("avahi-daemon.service") + node3.wait_for_unit("nix-serve.service") + node3.wait_for_unit("ncro.service") + node3.wait_for_open_port(5000) + node3.wait_for_open_port(8080) + + with subtest("verify HTTP endpoints are functional"): + # /nix-cache-info must return a valid response with StoreDir. 
+ for node in (node1, node2, node3): + out = node.succeed("curl -sf http://localhost:8080/nix-cache-info") + assert "StoreDir" in out, \ + f"{node.name}: /nix-cache-info missing StoreDir: {out!r}" + assert "/nix/store" in out, \ + f"{node.name}: /nix-cache-info has wrong StoreDir: {out!r}" + + # /health must return JSON with a 'status' field. + for node in (node1, node2, node3): + h = ncro_health(node) + assert "status" in h, \ + f"{node.name}: /health missing 'status': {h!r}" + + with subtest("read the runtime-generated public key from node1"): + # The key was generated at boot; verify it has the expected format. + pub_key = node1.succeed("cat /etc/nix/cache-key.pub").strip() + expected_prefix = "${cacheKeyName}:" + assert pub_key.startswith(expected_prefix), \ + f"unexpected public key format: {pub_key!r}" + + with subtest("wait for mDNS discovery to converge"): + # discovery_time=5s; avahi needs a few seconds to propagate mDNS records + # across the virtual network before ncro can discover them. + # We poll /health rather than sleeping a fixed amount. + + # node2 should discover node1 and node3 (both run nix-serve). + # The static cache.nixos.org upstream plus 2 discovered = >=3 total. + node2_upstreams = wait_for_upstreams(node2, min_count=3, timeout=90) + print(f"node2 upstreams after discovery: {node2_upstreams}") + + # Verify the discovery log messages contain the expected text. + node2.succeed( + "journalctl -u ncro --no-pager | grep -q 'discovered nix-serve instance'" + ) + + # node1 should have discovered node3 (its own nix-serve is not a remote peer). + node1_upstreams = wait_for_upstreams(node1, min_count=2, timeout=90) + print(f"node1 upstreams after discovery: {node1_upstreams}") + + with subtest("verify narinfo is served by ncro"): + test_store_path = "${testStorePath}" + store_hash = test_store_path.split("/")[3].split("-")[0] + + # ncro on node2 must proxy the narinfo request to node1 (which has the + # path in its local nix-serve). 
node1 is discovered via mDNS. + node2.succeed(f"curl -sf http://localhost:8080/{store_hash}.narinfo") + + with subtest("fetch test payload through ncro on node2"): + # Ensure the test path is not already present on node2. + node2.fail(f"nix store ls {test_store_path} 2>/dev/null") + + node2.succeed( + "nix copy " + "--from http://localhost:8080 " + "--no-require-sigs " + f"{test_store_path} " + "2>&1" + ) + + # The file must now exist on node2. + node2.succeed(f"test -f {test_store_path}/data") + node2.succeed(f"grep -q 'ncro p2p test payload' {test_store_path}/data") + + with subtest("fetch test payload through ncro on node3"): + node3.fail(f"nix store ls {test_store_path} 2>/dev/null") + + node3.succeed( + "nix copy " + "--from http://localhost:8080 " + "--no-require-sigs " + f"{test_store_path} " + "2>&1" + ) + node3.succeed(f"test -f {test_store_path}/data") + + with subtest("stale peer removal after avahi stops advertising"): + # Stop avahi on node1 so it sends mDNS goodbye packets and ncro on + # node2/node3 stops receiving keep-alive announcements. + # Stopping only nix-serve is insufficient because avahi continues to + # advertise the service record even after the daemon is gone. + node1.succeed("systemctl stop avahi-daemon.service") + + # Stale TTL = discovery_time * 3 = 5s * 3 = 15s. Add margin. + time.sleep(25) + + # ncro must have logged the removal. + node2.succeed( + "journalctl -u ncro --no-pager | grep -q 'removing stale peer'" + ) + + # /health should now report fewer upstreams (node1's instance removed). + node2_upstreams_after = ncro_upstream_urls(node2) + print(f"node2 upstreams after node1 avahi stopped: {node2_upstreams_after}") + + # node1's nix-serve URL must no longer be listed. 
+ node1_ip = node1.succeed("hostname -I").strip().split()[0] + assert not any(node1_ip in u for u in node2_upstreams_after), \ + f"node1 IP still present in node2 upstreams: {node2_upstreams_after}" + + with subtest("node2 can still fetch through node3 after node1 leaves"): + # Remove the path from node2 so we force a fresh fetch. + node2.succeed(f"nix store delete {test_store_path} 2>&1 || true") + + node2.succeed( + "nix copy " + "--from http://localhost:8080 " + "--no-require-sigs " + f"{test_store_path} " + "2>&1" + ) + node2.succeed(f"test -f {test_store_path}/data") + + print("All ncro P2P discovery tests passed.") + ''; + }