diff options
| -rw-r--r-- | README.md | 23 | ||||
| -rwxr-xr-x | hyperstack.rb | 350 | ||||
| -rw-r--r-- | hypr.fish (renamed from hyperstack.fish) | 0 | ||||
| -rw-r--r-- | pi/agent/extensions/modal-editor/index.ts | 2 | ||||
| -rw-r--r-- | pi/agent/settings.json | 2 |
5 files changed, 372 insertions, 5 deletions
@@ -1,4 +1,4 @@ -# hyperstack +# hypr <img src="logo.svg" alt="Hyperstack · Pi · FreeBSD · AI · tmux logo" width="600"/> @@ -238,7 +238,7 @@ Custom extensions live in `pi/agent/extensions/` and are loaded automatically vi | `fresh-subagent` | Spawns a sub-agent in a clean context for isolated tasks | | `reload-runtime` | `/reload-runtime` command — hot-reloads extensions without restarting Pi | | `nemotron-tool-repair` | Repairs malformed tool calls from Nemotron models | -| `taskwarrior-plan-mode` | Integrates Taskwarrior task management into Pi sessions | +| `agent-plan-mode` | Integrates task management into Pi sessions | ### Web search @@ -316,6 +316,7 @@ Commands: delete Destroy the tracked VM delete-both Destroy both VM1 and VM2 status Show VM and WireGuard status + watch Live dashboard: vLLM + GPU stats for all active VMs (refreshes every 5 s) test Run end-to-end inference tests (vLLM) model switch <preset> Hot-switch the running vLLM model @@ -527,6 +528,24 @@ docker run -d \ ## Monitoring vLLM +The `watch` command provides a built-in terminal dashboard that polls all active VMs every 5 seconds: + +```bash +ruby hyperstack.rb watch +``` + +It shows per-VM panels with: +- **GPU** (per device): utilisation bar, temperature, power draw, VRAM % +- **Requests**: running / waiting / swapped queue depth +- **KV cache**: GPU fill % +- **Throughput**: prefill and decode speed (tok/s) — rolling averages computed by vLLM +- **Prefix cache**: hit-rate % + +Stats are sourced from the engine-stats line that vLLM logs every few seconds and from +`nvidia-smi`, both collected over SSH through the WireGuard tunnel. Press `Ctrl-C` to exit. 
+ +For lower-level ad-hoc inspection: + ```bash # Live engine stats (throughput, KV cache, prefix cache hit rate) ssh ubuntu@<vm-ip> 'docker logs -f vllm_nemotron_super 2>&1 | grep "Engine 000"' diff --git a/hyperstack.rb b/hyperstack.rb index bbf76c7..d48260e 100755 --- a/hyperstack.rb +++ b/hyperstack.rb @@ -2357,6 +2357,335 @@ module HyperstackVM end end + # Continuously polls all active VMs for vLLM engine stats and GPU stats (both + # collected over SSH) and redraws a compact terminal dashboard every 5 seconds. + class VllmWatcher + REFRESH_INTERVAL = 5 + + # ANSI escape helpers + BOLD = "\033[1m" + DIM = "\033[2m" + GREEN = "\033[32m" + YELLOW = "\033[33m" + CYAN = "\033[36m" + RED = "\033[31m" + RESET = "\033[0m" + CLEAR = "\033[2J\033[H" + + # Snapshot of one VM's stats at a point in time. + VmSnapshot = Struct.new( + :label, :wg_host, :vllm_model, :container_name, + :metrics, :gpus, + :vllm_error, :gpu_error, + :fetched_at, + keyword_init: true + ) + + # Parsed per-GPU row from nvidia-smi. + GpuInfo = Struct.new( + :index, :name, :temp_c, :util_pct, :power_w, + :mem_used_mib, :mem_total_mib, + keyword_init: true + ) + + def initialize(config_loaders:) + @config_loaders = config_loaders + end + + # Runs the watch loop until the user presses Ctrl-C. + def run + $stdout.print "\033[?25l" # hide cursor + loop do + snapshots = fetch_all_parallel + draw(snapshots) + sleep REFRESH_INTERVAL + end + rescue Interrupt + nil + ensure + $stdout.print "\033[?25h\n" # restore cursor + end + + private + + # Fetches stats for every VM concurrently and returns an array of VmSnapshot. + def fetch_all_parallel + threads = @config_loaders.map { |loader| Thread.new { fetch_vm(loader) } } + threads.map(&:value) + end + + # Fetches GPU stats and vLLM engine stats for a single VM via one SSH session. + # nvidia-smi covers hardware metrics; docker logs provide the throughput and + # cache hit rate numbers that vLLM logs every few seconds as "Engine 0" lines. 
+ def fetch_vm(loader) + config = loader.config + label = File.basename(loader.path, '.toml') + wg_host = config.wireguard_gateway_hostname + state = load_state(config.state_file) + + unless state + return VmSnapshot.new(label: label, wg_host: wg_host, + vllm_model: nil, container_name: nil, + metrics: nil, gpus: nil, + vllm_error: 'no state file', gpu_error: nil, + fetched_at: Time.now) + end + + vllm_model = state['vllm_model'] || config.vllm_model + container_name = state['vllm_container_name'] || config.vllm_container_name + + gpus, metrics, ssh_error = fetch_vm_stats(config, wg_host, container_name) + + VmSnapshot.new(label: label, wg_host: wg_host, + vllm_model: vllm_model, container_name: container_name, + metrics: metrics, gpus: gpus, + vllm_error: ssh_error, gpu_error: ssh_error, + fetched_at: Time.now) + rescue StandardError => e + VmSnapshot.new(label: label || '?', wg_host: wg_host || '?', + vllm_model: nil, container_name: nil, + metrics: nil, gpus: nil, + vllm_error: e.message, gpu_error: nil, + fetched_at: Time.now) + end + + def load_state(path) + JSON.parse(File.read(path)) + rescue Errno::ENOENT, JSON::ParserError + nil + end + + # Single SSH call that runs nvidia-smi and tails the vLLM container logs. + # The two sections are separated by a sentinel line so we can split them. + # Returns [gpus, metrics, error_or_nil]. + def fetch_vm_stats(config, wg_host, container_name) + gpu_query = 'index,name,temperature.gpu,utilization.gpu,power.draw,memory.used,memory.total' + # --tail 200 instead of --since N so we always get the last stats line + # even when the VM has been idle for longer than the refresh interval. 
+ script = <<~BASH + nvidia-smi --query-gpu=#{gpu_query} --format=csv,noheader,nounits + echo ===VLLM=== + docker logs --tail 200 #{container_name} 2>&1 | grep 'Engine 0' | tail -1 + BASH + + ssh = build_ssh_command(config, wg_host) + stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) } + unless status.success? + return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] + end + + gpu_section, vllm_section = stdout.split("===VLLM===\n", 2) + gpus = parse_nvidia_smi(gpu_section.to_s) + metrics = parse_engine_log_line(vllm_section.to_s.strip) + [gpus, metrics, nil] + end + + # Parse a vLLM "Engine 0" log line into a plain Hash. + # Actual log format (loggers.py): + # (APIServer pid=1) INFO ... [loggers.py:259] Engine 000: + # Avg prompt throughput: 6154.6 tokens/s, + # Avg generation throughput: 27.4 tokens/s, + # Running: 1 reqs, Waiting: 0 reqs, + # GPU KV cache usage: 0.7%, Prefix cache hit rate: 0.0% + # Returns an empty hash when no matching line was found (container still loading). + def parse_engine_log_line(line) + return {} if line.empty? + + { + 'avg_prompt_throughput' => extract_float(line, /Avg prompt throughput:\s*([\d.]+)/), + 'avg_generation_throughput' => extract_float(line, /Avg generation throughput:\s*([\d.]+)/), + 'running' => extract_float(line, /Running:\s*(\d+)\s*reqs/), + 'pending' => extract_float(line, /Waiting:\s*(\d+)\s*reqs/), + 'swapped' => extract_float(line, /Swapped:\s*(\d+)\s*reqs/), + 'gpu_cache_usage_pct' => extract_float(line, /GPU KV cache usage:\s*([\d.]+)%/), + 'gpu_prefix_cache_hit_rate_pct' => extract_float(line, /Prefix cache hit rate:\s*([\d.]+)%/), + }.compact + end + + def extract_float(text, pattern) + m = text.match(pattern) + m ? m[1].to_f : nil + end + + # Build an SSH command array for the watcher. + # Uses accept-new rather than yes because the known-hosts file was populated + # with the VM's public IP during provisioning; the WireGuard hostname + # (hyperstack1.wg1 etc.) 
won't be in it yet. accept-new auto-trusts the first + # connection and caches the key — safe here because we're connecting over the + # already-authenticated WireGuard tunnel. + def build_ssh_command(config, host) + cmd = [ + 'ssh', + '-o', 'BatchMode=yes', + '-o', 'StrictHostKeyChecking=accept-new', + '-o', "UserKnownHostsFile=#{config.ssh_known_hosts_path}", + '-o', "ConnectTimeout=#{config.ssh_connect_timeout}", + '-p', config.ssh_port.to_s + ] + key = config.ssh_private_key_path + cmd.concat(['-i', key]) if File.exist?(key) + cmd << "#{config.ssh_username}@#{host}" + cmd << 'bash -se' + cmd + end + + def parse_nvidia_smi(text) + text.lines.filter_map do |line| + parts = line.strip.split(',').map(&:strip) + next if parts.length < 7 + + GpuInfo.new( + index: parts[0].to_i, + name: parts[1], + temp_c: parts[2].to_f, + util_pct: parts[3].to_f, + power_w: parts[4].to_f, + mem_used_mib: parts[5].to_f, + mem_total_mib: parts[6].to_f + ) + end + end + + # ── Rendering ──────────────────────────────────────────────────────────── + + # Clears the screen and redraws the full dashboard for all VMs. + def draw(snapshots) + time_str = Time.now.strftime('%H:%M:%S') + header = "#{BOLD}#{CYAN}vLLM watch#{RESET} " \ + "#{DIM}#{time_str} Ctrl-C to stop " \ + "refreshing every #{REFRESH_INTERVAL}s#{RESET}" + + panels = snapshots.map { |snap| render_vm(snap) } + + if panels.size >= 2 + # Lay out VM panels side-by-side, padding each to its own visible width + # so the separator column stays aligned regardless of content length. + panel_widths = panels.map { |p| p.map { |l| strip_ansi(l).length }.max.to_i } + max_rows = panels.map(&:size).max + panels.each { |p| p.fill('', p.size...max_rows) } + sep = " #{DIM}│#{RESET} " + + panel_lines = (0...max_rows).map do |i| + panels.each_with_index.map do |panel, j| + cell = panel[i] || '' + # Pad every column except the last so the separator stays in column. 
+ next cell if j == panels.size - 1 + + visible_len = strip_ansi(cell).length + cell + ' ' * [panel_widths[j] - visible_len, 0].max + end.join(sep) + end + + rule_w = [strip_ansi(panel_lines.first || '').length, 72].max + rule = DIM + ('─' * rule_w) + RESET + lines = [header, rule, *panel_lines, ''] + else + # Single VM: simple vertical layout. + rule = DIM + ('─' * 72) + RESET + lines = [header, rule] + panels.each { |p| lines << ''; lines.concat(p) } + lines << '' + end + + $stdout.write(CLEAR + lines.join("\n")) + $stdout.flush + end + + # Width of the label column used in every metric row, keeping bars aligned. + LABEL_W = 10 + + # Renders a single VM panel as an array of strings (one per display line). + def render_vm(snap) + lines = [] + + model_label = snap.vllm_model ? DIM + snap.vllm_model.split('/').last + RESET : '' + lines << "#{BOLD}#{snap.label}#{RESET} #{DIM}#{snap.wg_host}#{RESET} #{model_label}" + + # Both GPU and vLLM stats come from the same SSH call; show one error if it failed. + if snap.gpu_error + lines << " #{RED}#{snap.gpu_error}#{RESET}" + else + snap.gpus&.each do |gpu| + mem_pct = gpu.mem_total_mib > 0 ? (gpu.mem_used_mib / gpu.mem_total_mib * 100.0) : 0.0 + lines << format(' GPU%-2d %-26s %3.0f°C %5.0fW', + gpu.index, gpu.name, gpu.temp_c, gpu.power_w) + lines << bar_row('util', gpu.util_pct) + lines << bar_row('VRAM', mem_pct) + end + if snap.metrics&.any? + lines.concat(render_vllm_metrics(snap.metrics)) + elsif snap.metrics&.empty? + lines << " #{DIM}(no Engine log line yet — container may still be loading)#{RESET}" + end + end + + lines + end + + # Formats the vLLM engine log stats into display lines. + # All values come directly from the "Engine 0" log line that vLLM emits + # every few seconds, so tok/s figures are the rolling averages vLLM computes + # internally — no client-side rate derivation needed. 
+ def render_vllm_metrics(m) + lines = [] + + # Throughput: rolling averages already computed by vLLM + prefill_tps = m['avg_prompt_throughput'] + decode_tps = m['avg_generation_throughput'] + tput_parts = [] + tput_parts << "prefill #{format('%.1f', prefill_tps)} tok/s" if prefill_tps + tput_parts << "decode #{format('%.1f', decode_tps)} tok/s" if decode_tps + lines << row('throughput', tput_parts.empty? ? 'n/a' : tput_parts.join(' ')) + + # Request queue depth + running = m['running'] + swapped = m['swapped'] + pending = m['pending'] + q_parts = [] + q_parts << "#{running.to_i} running" if running + q_parts << "#{pending.to_i} waiting" if pending + q_parts << "#{swapped.to_i} swapped" if swapped && swapped > 0 + lines << row('requests', q_parts.empty? ? 'n/a' : q_parts.join(' ')) + + # KV-cache fill and prefix-cache hit rate, each with an aligned bar + gpu_cache = m['gpu_cache_usage_pct'] + hit_rate_gpu = m['gpu_prefix_cache_hit_rate_pct'] + lines << bar_row('KV cache', gpu_cache) if gpu_cache + lines << bar_row('cache hits', hit_rate_gpu) if hit_rate_gpu + + lines + end + + # Formats one metric row: fixed-width label then value, giving all rows the same indent. + def row(label, value) + " #{label.ljust(LABEL_W)} #{value}" + end + + # Formats one bar row: fixed-width label, proportional bar, percentage number. + # All bar rows share the same column for '[', aligning bars across GPU and vLLM sections. + def bar_row(label, pct) + row(label, "#{pct_bar(pct, 10)} #{format('%5.1f', pct)}%") + end + + # Renders a proportional bar for any percentage (0–100). + # Colour: green below 50%, yellow 50–79%, red 80%+. + def pct_bar(pct, width) + filled = [(pct / 100.0 * width).round, width].min + color = pct >= 80 ? RED : pct >= 50 ? YELLOW : GREEN + "[#{color}#{'█' * filled}#{RESET}#{' ' * (width - filled)}]" + end + + # Strips ANSI escape sequences to measure the visible length of a string. 
+ def strip_ansi(str) + str.gsub(/\033\[[0-9;]*m/, '') + end + + # Formats an integer with thousands separators, e.g. 1234567 → "1,234,567". + def fmt_num(n) + n.to_i.to_s.reverse.scan(/\d{1,3}/).join(',').reverse + end + end + class CLI def initialize(argv) @argv = argv.dup @@ -2377,6 +2706,8 @@ module HyperstackVM puts ' delete-both [--dry-run]' puts ' Delete the VMs tracked by hyperstack-vm1.toml and hyperstack-vm2.toml.' puts ' status' + puts ' watch' + puts ' Poll all active VMs for vLLM and GPU stats every 5 s.' puts ' test' puts ' model list' puts ' model switch PRESET [--dry-run]' @@ -2421,6 +2752,11 @@ module HyperstackVM return end + if command == 'watch' + run_watch + return + end + # All other commands operate on a single VM defined by the --config path. config_loader = ConfigLoader.load(@config_path) manager = build_manager(config_loader.config) @@ -2460,7 +2796,7 @@ module HyperstackVM raise Error, "Unknown model subcommand #{sub.inspect}. Use list or switch." end else - raise Error, "Unknown command #{command.inspect}. Use create, create-both, delete, delete-both, status, test, or model." + raise Error, "Unknown command #{command.inspect}. Use create, create-both, delete, delete-both, status, watch, test, or model." end end @@ -2516,6 +2852,18 @@ module HyperstackVM ) end + # Starts the VllmWatcher dashboard for all active VMs. + # Reuses status_config_loaders so it auto-discovers the same set of VMs + # that `status` would show (honours --config if given explicitly). + def run_watch + loaders = status_config_loaders + if loaders.empty? + raise Error, 'No active VMs found. Run `create` or `create-both` first.' + end + + VllmWatcher.new(config_loaders: loaders).run + end + def run_status loaders = status_config_loaders if loaders.one? 
diff --git a/hyperstack.fish b/hypr.fish index 09706b5..09706b5 100644 --- a/hyperstack.fish +++ b/hypr.fish diff --git a/pi/agent/extensions/modal-editor/index.ts b/pi/agent/extensions/modal-editor/index.ts index abb660a..ecf36c3 100644 --- a/pi/agent/extensions/modal-editor/index.ts +++ b/pi/agent/extensions/modal-editor/index.ts @@ -41,7 +41,7 @@ function charKind(char: string | null): CharKind { } class ModalEditor extends CustomEditor { - private mode: Mode = "normal"; + private mode: Mode = "insert"; private pending: PendingAction = null; private internals(): EditorStateAccess { diff --git a/pi/agent/settings.json b/pi/agent/settings.json index 972476b..fbb3874 100644 --- a/pi/agent/settings.json +++ b/pi/agent/settings.json @@ -1,5 +1,5 @@ { - "lastChangelogVersion": "0.61.1", + "lastChangelogVersion": "0.62.0", "defaultProvider": "openai", "defaultModel": "gpt-4.1" }
\ No newline at end of file |
