summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Buetow <paul@buetow.org>2026-03-03 14:48:17 +0200
committerPaul Buetow <paul@buetow.org>2026-03-03 14:48:17 +0200
commit46d4917ea0eaa587e87602200fb6843776cc62a5 (patch)
tree5e3ca8b2cd975d077c3a0f3dca49ca6468ac4da9
parent852f10bb5d87cbaf1089a532c27777ac4153fde7 (diff)
Add dtailhealth no-auth-key flag and adjust turbo EOF handling
-rw-r--r--cmd/dtailhealth/main.go1
-rw-r--r--doc/auth-key-fast-reconnect.md200
-rw-r--r--integrationtests/dtailhealth_test.go6
-rw-r--r--internal/server/handlers/readcommand.go15
4 files changed, 216 insertions, 6 deletions
diff --git a/cmd/dtailhealth/main.go b/cmd/dtailhealth/main.go
index a0ca84e..3d4cccf 100644
--- a/cmd/dtailhealth/main.go
+++ b/cmd/dtailhealth/main.go
@@ -28,6 +28,7 @@ func main() {
flag.StringVar(&args.Logger, "logger", config.DefaultHealthCheckLogger, "Logger name")
flag.StringVar(&args.LogLevel, "logLevel", "none", "Log level")
flag.StringVar(&args.ServersStr, "server", "", "Remote server to connect")
+ flag.BoolVar(&args.NoAuthKey, "no-auth-key", false, "Disable auth-key fast reconnect feature")
flag.StringVar(&pprof, "pprof", "", "Start PProf server this address")
flag.Parse()
diff --git a/doc/auth-key-fast-reconnect.md b/doc/auth-key-fast-reconnect.md
new file mode 100644
index 0000000..cb9884a
--- /dev/null
+++ b/doc/auth-key-fast-reconnect.md
@@ -0,0 +1,200 @@
+# Auth-Key Fast-Reconnect for DTail
+
+## Problem
+
+When using a YubiKey for SSH authentication, each DTail connection requires a
+physical touch of the YubiKey during the SSH handshake. This is slow and becomes
+painful when connecting to many servers concurrently — the YubiKey serialises
+all signing requests, turning parallel connections into sequential ones.
+
+## Solution
+
+Allow the DTail client to register a local SSH public key with the DTail server
+over an already-authenticated SSH session. The server caches this key
**in-memory only** (never written to disk). On subsequent connections the client
offers that local key first — a purely software-based signature check with no
YubiKey interaction — and falls back to the original auth method if the server
does not recognise the key.
+
+## Design Principles
+
+1. **Transparent fallback** — Go's `golang.org/x/crypto/ssh` tries each
+ `AuthMethod` in order; if the fast key is rejected the client silently falls
+ back to the SSH agent / YubiKey. No user interaction required.
+2. **Server keys are ephemeral** — the in-memory store is lost on server
+ restart. No file I/O, no persistence.
+3. **Trust chain preserved** — an auth-key can only be registered over a session
+ that was already authenticated via the normal (YubiKey) path.
+4. **Minimal protocol addition** — a single `AUTHKEY <base64-pubkey>` command
+ sent over the existing SSH session text protocol.
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ DTail Client │
+│ │
+│ Auth methods (tried in order): │
+│ 1. Local private key (~/.ssh/id_rsa) ← FAST │
+│ 2. SSH Agent / YubiKey ← SLOW fallback │
+│ │
+│ After slow-path auth: │
+│ → sends AUTHKEY <~/.ssh/id_rsa.pub> to server │
+└────────────────────────┬────────────────────────────────┘
+ │ SSH
+┌────────────────────────▼────────────────────────────────┐
+│ DTail Server (dserver) │
+│ │
+│ PublicKeyCallback: │
+│ 1. Check in-memory authkeystore ← FAST │
+│ 2. Check authorized_keys file ← existing path │
+│ │
+│ AUTHKEY command handler: │
+│ → authkeystore.Add(user, pubkey) │
+│ → responds AUTHKEY OK / AUTHKEY ERR │
+│ │
+│ authkeystore (in-memory only): │
+│ map[username] → []PublicKey (with TTL, max per user) │
+└─────────────────────────────────────────────────────────┘
+```
+
+## Sequence of Events
+
+### First Connection (slow path — YubiKey)
+
+1. Client checks for a local private key at `~/.ssh/id_rsa` (or the path given by `--auth-key-path`).
+2. Client builds auth methods list: `[localKey, sshAgent]`.
+3. SSH handshake begins; server's `PublicKeyCallback` is called with local key.
+4. Server checks in-memory authkeystore → not found.
+5. Server checks `authorized_keys` file → not found (this key isn't in there).
+6. Server rejects the key.
+7. Go SSH client automatically tries next auth method: SSH agent (YubiKey).
+8. YubiKey signs the challenge; server finds the YubiKey pubkey in
+ `authorized_keys` → auth succeeds.
+9. Session is established; client sends DTail commands as usual.
+10. Client reads `~/.ssh/id_rsa.pub` and sends `AUTHKEY <base64-pubkey>`.
+11. Server's handler parses the command, calls `authkeystore.Add(user, pubkey)`.
+12. Server responds `AUTHKEY OK`.
+
+### Subsequent Connections (fast path — no YubiKey)
+
+1. Client builds auth methods list: `[localKey, sshAgent]`.
+2. SSH handshake begins; server's `PublicKeyCallback` is called with local key.
+3. Server checks in-memory authkeystore → **found** → auth succeeds immediately.
+4. No YubiKey touch needed. Session is established instantly.
+
+### Fallback (server restarted or key expired)
+
+1. Client offers local key → server's authkeystore is empty → rejected.
+2. Client falls back to SSH agent → YubiKey auth succeeds.
+3. Client re-registers local pubkey via `AUTHKEY` command.
+
+## Components
+
+### 1. Server: In-Memory Auth-Key Store
+
+**New file:** `internal/ssh/server/authkeystore.go`
+
+- Thread-safe store using `sync.RWMutex`.
+- Data structure: `map[string][]authKeyEntry` where key is username.
+- Each `authKeyEntry` holds `gossh.PublicKey` + `time.Time` (registered at).
+- Methods: `Add(user, pubkey)`, `Has(user, pubkey) bool`, `Remove(user, pubkey)`.
+- Per-user max key limit (default 5, configurable via `AuthKeyMaxPerUser`).
+- TTL-based expiry (default 24h, configurable via `AuthKeyTTLSeconds`).
+- Lazy expiry: check TTL on `Has()` calls; optionally a background reaper.
+- Package-level singleton or passed via dependency injection.
+
+### 2. Server: Extend PublicKeyCallback
+
+**Modified file:** `internal/ssh/server/publickeycallback.go`
+
+- Before the existing `authorizedKeysFile` lookup, check `authkeystore.Has(user, offeredPubKey)`.
+- If found → return success immediately (fast path).
+- If not found → fall through to existing file-based logic (no behaviour change).
+
+### 3. Server: AUTHKEY Command Handler
+
+**Modified file:** `internal/server/handlers/serverhandler.go` (or relevant handler)
+
+- Parse incoming line for `AUTHKEY <base64-pubkey>` prefix.
+- Base64-decode the payload and parse the resulting wire-format key with `gossh.ParsePublicKey()`.
+- Call `authkeystore.Add(user, pubkey)`.
+- Write `AUTHKEY OK\n` or `AUTHKEY ERR <reason>\n` back to the client.
+- Guard: only accept if `AuthKeyEnabled` is true in server config.
+
+### 4. Client: Auth Method Ordering (Multi-Method Support)
+
+**Modified file:** `internal/ssh/client/authmethods.go`
+
+- Change `initKnownHostsAuthMethods` to **collect multiple auth methods**
+ instead of returning after the first successful one.
+- Order: local private key first (from `--auth-key-path`, default `~/.ssh/id_rsa`),
+ then SSH agent, then other default keys.
+- This ensures Go's SSH client tries the fast key before the YubiKey.
+
+### 5. Client: Auth-Key Registration After Slow-Path Connection
+
+**Modified file:** `internal/clients/connectors/serverconnection.go` (or handler layer)
+
+- After session is established and DTail commands are sent, determine whether
+ the connection used the fast path or slow path.
+- If slow path (YubiKey was used): read the public key file
+ (`--auth-key-path` + `.pub`), send `AUTHKEY <base64-pubkey>` command.
+- Parse `AUTHKEY OK` / `AUTHKEY ERR` response.
+- Simpler alternative that avoids path detection: if the auth-key-path private
 key exists and has a corresponding `.pub` file, always send the registration —
 sending it again is idempotent and cheap.
+
+### 6. Configuration
+
+**Modified files:** `internal/config/server.go`, `internal/config/client.go`, `internal/config/args.go`
+
+Server config (`dtail.json`):
+- `AuthKeyEnabled` (bool, default `true`)
+- `AuthKeyTTLSeconds` (int, default `86400` = 24h)
+- `AuthKeyMaxPerUser` (int, default `5`)
+
+Client config / CLI flags:
+- `--auth-key-path` (string, default `~/.ssh/id_rsa`) — path to the local
 private key that is offered first; its `.pub` counterpart is the key that gets
 registered with the server
+- `--no-auth-key` (bool, default `false`) — disable auth-key feature entirely
+
+### 7. Integration Tests
+
+**Modified/new files in:** `integrationtests/`
+
+- Test that auth-key registration works end-to-end.
+- Test that fast-path auth succeeds after registration.
+- Test fallback when server has no cached key (simulating restart).
+- Test TTL expiry and max-keys-per-user limits.
+- Test `--no-auth-key` disables the feature.
+
+### 8. Documentation
+
+- Update `README.md` with auth-key feature description.
+- Update `AGENTS.md` / `CLAUDE.md` with new config options and architecture notes.
+
+## Security Considerations
+
+- **No server-side disk persistence** — keys exist only in memory, lost on restart.
+- **Trust chain** — auth-keys can only be registered over an already-authenticated
+ session. An attacker cannot register a key without first proving identity.
+- **TTL expiry** — keys auto-expire (default 24h), limiting exposure window.
+- **Per-user limits** — a cap on keys per user (default 5, configurable via `AuthKeyMaxPerUser`) prevents memory exhaustion.
+- **Same security model as `~/.ssh/id_rsa`** — the local key is protected by
+ filesystem permissions (0600). If an attacker has access to `~/.ssh/id_rsa`,
+ they already have SSH access anyway.
+- **No new attack surface** — the `AUTHKEY` command is only processed inside an
+ authenticated session. The `PublicKeyCallback` fast-path is equivalent to
+ having the key in `authorized_keys`.
+
+## Implementation Order
+
+1. Auth-key store (server, standalone, unit-testable)
+2. Extend `PublicKeyCallback` (server, minimal change)
+3. `AUTHKEY` command handler (server handler)
+4. Client auth method ordering (multi-method collection)
+5. Client auth-key registration (send pubkey after slow-path)
+6. Configuration and CLI flags
+7. Integration tests
+8. Documentation
diff --git a/integrationtests/dtailhealth_test.go b/integrationtests/dtailhealth_test.go
index d320849..74773f2 100644
--- a/integrationtests/dtailhealth_test.go
+++ b/integrationtests/dtailhealth_test.go
@@ -143,7 +143,7 @@ func testDTailHealth2WithServer(t *testing.T, logger *TestLogger) {
ctx = WithTestLogger(ctx, logger)
defer cancel()
- // Start dserver
+ // Start dserver
_, _, _, err := startCommand(ctx, t,
"", "../dserver",
"--cfg", "none",
@@ -218,7 +218,7 @@ func testDTailHealthCheck3WithServer(t *testing.T, logger *TestLogger) {
}
_, err = runCommandRetry(ctx, t, 10, outFile,
- "../dtailhealth", "--server", fmt.Sprintf("%s:%d", bindAddress, port))
+ "../dtailhealth", "--server", fmt.Sprintf("%s:%d", bindAddress, port), "--no-auth-key")
if err != nil {
t.Error(err)
return
@@ -228,4 +228,4 @@ func testDTailHealthCheck3WithServer(t *testing.T, logger *TestLogger) {
t.Error(err)
return
}
-} \ No newline at end of file
+}
diff --git a/internal/server/handlers/readcommand.go b/internal/server/handlers/readcommand.go
index 7cd9a63..dc3196e 100644
--- a/internal/server/handlers/readcommand.go
+++ b/internal/server/handlers/readcommand.go
@@ -129,14 +129,23 @@ func (r *readCommand) readFiles(ctx context.Context, ltx lcontext.LContext,
dlog.Server.Info(r.server.LogContext(), "All files processed", "count", len(paths))
- // In turbo mode, signal EOF after all files are processed
- // This is crucial for proper shutdown in server mode
+ // In turbo mode, only the final active command should signal EOF and wait for
+ // acknowledgement. Signaling per command in high-concurrency cat/grep sessions
+ // causes repeated EOF timeouts and races with still-running commands.
if !r.server.TurboBoostDisabled() && !r.server.HasRegularAggregate() &&
(r.mode == omode.CatClient || r.mode == omode.GrepClient || r.mode == omode.TailClient) {
if r.server.IsTurboMode() && r.server.HasTurboEOF() {
+ pending, active := r.server.PendingAndActive()
+ shouldSignalEOF := pending == 0 && active == 1
+ if !shouldSignalEOF {
+ dlog.Server.Trace(r.server.LogContext(), "Skipping turbo EOF signal for non-final command",
+ "pending", pending, "active", active)
+ return
+ }
+
dlog.Server.Debug(r.server.LogContext(), "Turbo mode: flushing data before EOF signal")
- // Ensure all turbo data is flushed before signaling EOF
+ // Ensure all turbo data is flushed before signaling EOF.
r.server.FlushTurboData()
// Signal EOF by closing the channel, but only once.