"discovery: mDNS/DNS-SD peer discovery; dynamic upstream management"

Allows ncro instances to discover each other dynamically without static
configuration and enables a peer-to-peer style mesh
where nodes share cached builds locally instead of all hitting upstream
caches.

Signed-off-by: NotAShelf <raf@notashelf.dev>
Change-Id: I7d723876c6816cb6aaaf3fe14cb24a426a6a6964
This commit is contained in:
raf 2026-03-27 20:03:37 +03:00
commit 5fc3b2883b
Signed by: NotAShelf
GPG key ID: 29D95B64378DB4BF
6 changed files with 336 additions and 3 deletions

View file

@ -71,6 +71,15 @@ type MeshConfig struct {
GossipInterval Duration `yaml:"gossip_interval"`
}
// Controls mDNS/DNS-SD based dynamic upstream discovery.
type DiscoveryConfig struct {
	// Enabled toggles discovery; the remaining fields are only validated
	// when this is true.
	Enabled bool `yaml:"enabled"`
	// ServiceName is the DNS-SD service type to browse for
	// (default: "_nix-serve._tcp").
	ServiceName string `yaml:"service_name"`
	// Domain is the mDNS browse domain (default: "local").
	Domain string `yaml:"domain"`
	// DiscoveryTime bounds a browse cycle; peers unseen for 3x this
	// duration are expired by the discovery package.
	DiscoveryTime Duration `yaml:"discovery_time"`
	// Priority assigned to upstreams that were added via discovery.
	Priority int `yaml:"priority"`
}
type LoggingConfig struct {
Level string `yaml:"level"`
Format string `yaml:"format"`
@ -81,6 +90,7 @@ type Config struct {
Upstreams []UpstreamConfig `yaml:"upstreams"`
Cache CacheConfig `yaml:"cache"`
Mesh MeshConfig `yaml:"mesh"`
Discovery DiscoveryConfig `yaml:"discovery"`
Logging LoggingConfig `yaml:"logging"`
}
@ -106,6 +116,12 @@ func defaults() Config {
BindAddr: "0.0.0.0:7946",
GossipInterval: Duration{30 * time.Second},
},
Discovery: DiscoveryConfig{
ServiceName: "_nix-serve._tcp",
Domain: "local",
DiscoveryTime: Duration{5 * time.Second},
Priority: 20,
},
Logging: LoggingConfig{
Level: "info",
Format: "json",
@ -161,6 +177,17 @@ func (c *Config) Validate() error {
}
}
}
if c.Discovery.Enabled {
if c.Discovery.ServiceName == "" {
return fmt.Errorf("discovery.service_name is required when discovery is enabled")
}
if c.Discovery.Domain == "" {
return fmt.Errorf("discovery.domain is required when discovery is enabled")
}
if c.Discovery.DiscoveryTime.Duration <= 0 {
return fmt.Errorf("discovery.discovery_time must be positive")
}
}
return nil
}

View file

@ -0,0 +1,216 @@
package discovery
import (
	"context"
	"fmt"
	"log/slog"
	"net"
	"strconv"
	"sync"
	"time"

	"github.com/grandcat/zeroconf"

	"notashelf.dev/ncro/internal/config"
	"notashelf.dev/ncro/internal/prober"
)
// Tracks discovered nix-serve instances and maintains the upstream list.
type Discovery struct {
	cfg config.DiscoveryConfig
	// prober is stored at construction; not referenced by the methods in
	// this file — TODO confirm it is used elsewhere or drop the field.
	prober   *prober.Prober
	resolver *zeroconf.Resolver
	// discovered maps "instance@hostname" keys to peer records; guarded by mu.
	discovered map[string]*discoveredPeer
	mu         sync.RWMutex
	// stopCh is closed by Stop to unblock Start and the background goroutines.
	stopCh chan struct{}
	// waitGroup tracks the handleEntries and maintainPeers goroutines.
	waitGroup sync.WaitGroup
	// Optional hooks fired (in fresh goroutines) when a peer appears or expires.
	onAddUpstream    func(url string, priority int)
	onRemoveUpstream func(url string)
}
// discoveredPeer is the per-peer bookkeeping record: the upstream URL built
// from the resolved address and port, plus the last announcement time used
// for staleness expiry.
type discoveredPeer struct {
	url      string
	lastSeen time.Time
}
// New constructs a Discovery manager backed by a fresh zeroconf resolver.
// Returns an error when the mDNS resolver cannot be initialized.
func New(cfg config.DiscoveryConfig, p *prober.Prober) (*Discovery, error) {
	r, err := zeroconf.NewResolver(nil)
	if err != nil {
		return nil, fmt.Errorf("create zeroconf resolver: %w", err)
	}
	d := &Discovery{
		cfg:        cfg,
		prober:     p,
		resolver:   r,
		discovered: map[string]*discoveredPeer{},
		stopCh:     make(chan struct{}),
	}
	return d, nil
}
// SetCallbacks registers the hooks invoked when a peer is discovered (add)
// or expires from the table (remove). Intended to be called before Start.
func (d *Discovery) SetCallbacks(add func(url string, priority int), remove func(url string)) {
	d.mu.Lock()
	d.onAddUpstream = add
	d.onRemoveUpstream = remove
	d.mu.Unlock()
}
// Starts browsing for services on the local network. Blocks until the context
// is cancelled or Stop is called.
//
// Bug fix: previously a Browse error closed only the entries channel, but
// maintainPeers exits solely on ctx/stopCh, so waitGroup.Wait() would hang
// until the caller's context was cancelled. A derived cancellable context now
// shuts down both background goroutines on every exit path.
func (d *Discovery) Start(ctx context.Context) error {
	// Child context so this method can tear down its own goroutines without
	// requiring the caller to cancel, and so returning (via Stop or ctx)
	// always releases them.
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	entries := make(chan *zeroconf.ServiceEntry)
	d.waitGroup.Add(1)
	go d.handleEntries(ctx, entries)
	d.waitGroup.Add(1)
	go d.maintainPeers(ctx)
	if err := d.resolver.Browse(ctx, d.cfg.ServiceName, d.cfg.Domain, entries); err != nil {
		// Cancel (rather than close(entries)) so both goroutines exit; this
		// also avoids a potential double close if the resolver ever closes
		// the channel itself.
		cancel()
		d.waitGroup.Wait()
		return fmt.Errorf("browse services: %w", err)
	}
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-d.stopCh:
		return nil
	}
}
// Stops the discovery process and waits for the background goroutines spawned
// by Start to finish.
//
// Bug fix: the original closed stopCh unconditionally, so a second Stop call
// panicked with "close of closed channel". The close is now guarded by the
// mutex and a non-blocking receive, making Stop idempotent.
func (d *Discovery) Stop() {
	d.mu.Lock()
	select {
	case <-d.stopCh:
		// Already stopped; nothing to close.
	default:
		close(d.stopCh)
	}
	d.mu.Unlock()
	d.waitGroup.Wait()
}
// handleEntries drains resolved service entries until the channel is closed,
// the context is cancelled, or Stop is called.
func (d *Discovery) handleEntries(ctx context.Context, ch chan *zeroconf.ServiceEntry) {
	defer d.waitGroup.Done()
	for {
		select {
		case entry, open := <-ch:
			if !open {
				return
			}
			d.handleEntry(ctx, entry)
		case <-ctx.Done():
			return
		case <-d.stopCh:
			return
		}
	}
}
// Handles a single service entry: registers it as a new upstream the first
// time it is seen, or refreshes its liveness timestamp on re-announcement.
//
// Bug fix: the URL was built with fmt.Sprintf("http://%s:%d", ...), which
// produces an invalid URL for IPv6 literals (e.g. "http://::1:8080").
// net.JoinHostPort brackets IPv6 addresses correctly ("[::1]:8080").
func (d *Discovery) handleEntry(_ context.Context, entry *zeroconf.ServiceEntry) {
	if len(entry.AddrIPv4) == 0 && len(entry.AddrIPv6) == 0 {
		slog.Debug("discovered service has no addresses", "instance", entry.Instance)
		return
	}
	// Prefer IPv4 when both address families were resolved.
	var addr string
	if len(entry.AddrIPv4) > 0 {
		addr = entry.AddrIPv4[0].String()
	} else {
		addr = entry.AddrIPv6[0].String()
	}
	url := "http://" + net.JoinHostPort(addr, strconv.Itoa(entry.Port))
	key := fmt.Sprintf("%s@%s", entry.Instance, entry.HostName)
	d.mu.Lock()
	defer d.mu.Unlock()
	// Known peer: refresh its liveness timestamp (single map lookup).
	if peer, exists := d.discovered[key]; exists {
		peer.lastSeen = time.Now()
		return
	}
	// New peer discovered.
	slog.Info("discovered nix-serve instance", "instance", entry.Instance, "url", url)
	d.discovered[key] = &discoveredPeer{
		url:      url,
		lastSeen: time.Now(),
	}
	// Run the callback in a goroutine so a slow or panicking callback cannot
	// block the resolver path (note: d.mu is held at this point).
	if d.onAddUpstream != nil {
		go func() {
			defer func() {
				if r := recover(); r != nil {
					slog.Error("panic in add upstream callback", "recover", r)
				}
			}()
			d.onAddUpstream(url, d.cfg.Priority)
		}()
	}
}
// maintainPeers periodically sweeps the peer table for stale entries until
// the context is cancelled or Stop is called.
func (d *Discovery) maintainPeers(ctx context.Context) {
	defer d.waitGroup.Done()
	const sweepInterval = 10 * time.Second
	t := time.NewTicker(sweepInterval)
	defer t.Stop()
	for {
		select {
		case <-t.C:
			d.cleanupPeers()
		case <-ctx.Done():
			return
		case <-d.stopCh:
			return
		}
	}
}
// cleanupPeers removes peers that have not re-announced within the expiration
// window and fires the remove callback for each one dropped.
func (d *Discovery) cleanupPeers() {
	d.mu.Lock()
	defer d.mu.Unlock()
	// Peers are expected to re-announce within the configured discovery time;
	// tolerate three missed cycles before declaring a peer gone.
	ttl := 3 * d.cfg.DiscoveryTime.Duration
	now := time.Now()
	for key, peer := range d.discovered {
		if now.Sub(peer.lastSeen) <= ttl {
			continue
		}
		slog.Info("removing stale peer", "url", peer.url)
		delete(d.discovered, key)
		if d.onRemoveUpstream == nil {
			continue
		}
		// Async so a slow/panicking callback cannot stall the sweep
		// (d.mu is held here).
		go func(u string) {
			defer func() {
				if r := recover(); r != nil {
					slog.Error("panic in remove upstream callback", "recover", r)
				}
			}()
			d.onRemoveUpstream(u)
		}(peer.url)
	}
}
// DiscoveredPeers returns a snapshot of the URLs of all currently known peers.
// Ordering is unspecified (map iteration).
func (d *Discovery) DiscoveredPeers() []string {
	d.mu.RLock()
	defer d.mu.RUnlock()
	urls := make([]string, 0, len(d.discovered))
	for _, p := range d.discovered {
		urls = append(urls, p.url)
	}
	return urls
}

View file

@ -232,3 +232,24 @@ func (p *Prober) getOrCreate(url string) *UpstreamHealth {
}
return h
}
// AddUpstream registers a dynamically discovered upstream (e.g. via mDNS) and
// kicks off an immediate health probe. Thread-safe; a no-op for URLs that are
// already tracked.
func (p *Prober) AddUpstream(url string, priority int) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if _, known := p.table[url]; known {
		return
	}
	p.table[url] = &UpstreamHealth{
		URL:      url,
		Priority: priority,
		Status:   StatusActive,
	}
	// Probe in the background so the caller is not blocked on network I/O.
	go p.ProbeUpstream(url)
}
// RemoveUpstream stops tracking an upstream (e.g. when a peer leaves the
// network). Thread-safe; a no-op if the URL was never registered.
func (p *Prober) RemoveUpstream(url string) {
	p.mu.Lock()
	delete(p.table, url)
	p.mu.Unlock()
}