hyperstack: fix TOML paths, add live provisioning progress, and auto end-to-end test on create

- cli: introduce REPO_ROOT constant so create-both/delete-both/watch find TOML configs at the repo root instead of lib/hyperstack/
- manager: with_polling prints a heartbeat every 30s so silent waits (SSH, VM ready, etc.) are visibly alive
- provisioning: bootstrap_guest streams SSH output in real time so apt-lock waits and setup steps are visible as they happen
- provisioning: vLLM wait loop reads docker logs to show the current startup stage (shard loading %, torch.compile, CUDA graphs, API up) instead of a plain "not ready yet" counter
- manager: create automatically runs the end-to-end inference test after provisioning completes, removing the manual 'test' step

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
author: Paul Buetow <paul@buetow.org> 2026-03-26 08:35:41 +0200
committer: Paul Buetow <paul@buetow.org> 2026-03-26 08:35:41 +0200
commit: 0771e0eccca45cfc1ea439852b779062c418673c (patch)
tree: 29e146d9dabea0ea7cb4aee738e040fc91a9aac2 /lib
parent: 6f33d31f3a3726a554a6454a92ab9d969dd3f4e4 (diff)
3 files changed, 37 insertions, 20 deletions
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb
index f575b59..8568474 100644
--- a/lib/hyperstack/cli.rb
+++ b/lib/hyperstack/cli.rb
@@ -4,9 +4,13 @@ require 'optparse'
 
 module HyperstackVM
   class CLI
+    # Repo root is two levels above this file (lib/hyperstack/ → lib/ → repo root).
+    # All TOML config files live at the repo root, not alongside this library file.
+    REPO_ROOT = File.expand_path(File.join(__dir__, '..', '..'))
+
     def initialize(argv)
       @argv = argv.dup
-      @config_path = File.join(__dir__, 'hyperstack-vm.toml')
+      @config_path = File.join(REPO_ROOT, 'hyperstack-vm.toml')
       @config_explicit = false
     end
 
@@ -212,9 +216,9 @@ module HyperstackVM
 
       candidates = [
         @config_path,
-        File.join(__dir__, 'hyperstack-vm1-gptoss.toml'),
-        File.join(__dir__, 'hyperstack-vm2.toml'),
-        File.join(__dir__, 'hyperstack-vm-photo.toml')
+        File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml'),
+        File.join(REPO_ROOT, 'hyperstack-vm2.toml'),
+        File.join(REPO_ROOT, 'hyperstack-vm-photo.toml')
       ].uniq.select { |path| File.exist?(path) }
 
       loaders = candidates.map { |path| ConfigLoader.load(path) }
@@ -224,8 +228,8 @@ module HyperstackVM
 
     def pair_config_loaders
       [
-        ConfigLoader.load(File.join(__dir__, 'hyperstack-vm1-gptoss.toml')),
-        ConfigLoader.load(File.join(__dir__, 'hyperstack-vm2.toml'))
+        ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml')),
+        ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm2.toml'))
       ]
     end
 
diff --git a/lib/hyperstack/manager.rb b/lib/hyperstack/manager.rb
index 0d17b2f..2134c92 100644
--- a/lib/hyperstack/manager.rb
+++ b/lib/hyperstack/manager.rb
@@ -343,15 +343,8 @@ module HyperstackVM
 
       info "VM ready: #{state['public_ip']} (id=#{state['vm_id']})"
       print_local_wireguard_summary(state['public_ip'])
-      wg_ip = @config.wireguard_gateway_hostname
-      if effective_vllm?
-        info "Run 'ruby hyperstack.rb test' to verify vLLM."
-        info "  vLLM:    http://#{wg_ip}:#{@config.ollama_port}/v1/models"
-      end
-      return unless effective_comfyui?
-
-      info "Run 'ruby hyperstack.rb test' to verify ComfyUI."
-      info "  ComfyUI: http://#{wg_ip}:#{@config.comfyui_port}/system_stats"
+      # Run end-to-end tests automatically so the human doesn't need a manual step.
+      test
       info "  Enhance: ruby photo-enhance.rb --config #{File.basename(@config.path)} --indir ~/Pictures --outdir ~/Pictures/enhanced"
     end
 
@@ -693,12 +686,16 @@ module HyperstackVM
 
     def with_polling(description, timeout: 900, interval: 5)
       deadline = Time.now + timeout
+      attempt = 0
       loop do
         result = yield
         return result if result
 
         raise Error, "Timed out waiting for #{description}." if Time.now >= deadline
 
+        attempt += 1
+        # Print a heartbeat every 30 seconds so the user can see the script hasn't stalled.
+        info("  still waiting for #{description}... (#{attempt * interval}s)") if (attempt % 6).zero?
         sleep interval
       end
     end
diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb
index eb3518e..fd1e212 100644
--- a/lib/hyperstack/provisioning.rb
+++ b/lib/hyperstack/provisioning.rb
@@ -204,11 +204,26 @@ module HyperstackVM
       script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true"
       script << 'docker pull vllm/vllm-openai:latest' if pull_image
       script << docker_run
-      script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."'
+      # Stage patterns cover the full vLLM startup sequence:
+      #   HuggingFace download → safetensors shard loading → torch.compile → CUDA graphs → API up.
+      # The sed strip removes the "(EngineCore pid=N) INFO date time [file.py:line] " log prefix
+      # so only the human-readable message is shown.
+      stage_pat = 'Starting to load model|Fetching|Downloading shards|checkpoint shards:.*% Completed' \
+                  '|Loading weights took|Model loading took|torch\\.compile took' \
+                  '|Graph capturing|Application startup complete'
+      strip_pfx = 's/^\\([A-Za-z]+ [^)]+\\) INFO [^ ]+ [^ ]+ \\[[^]]+\\] //'
+      script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."'
+      script << "stage_pat='#{stage_pat}'"
+      script << "strip_pfx='#{strip_pfx}'"
       script << 'for i in $(seq 1 240); do'
       script << "  if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi"
       script << "  state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)"
-      script << '  echo "  vLLM not ready yet ($i/240, container=$state)..."'
+      script << "  progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)"
+      script << '  if [ -n "$progress" ]; then'
+      script << '    echo "  vLLM ($i/240, $state): $progress"'
+      script << '  else'
+      script << '    echo "  vLLM not ready yet ($i/240, container=$state)..."'
+      script << '  fi'
       script << '  sleep 5'
       script << 'done'
       script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }"
@@ -375,13 +390,14 @@ module HyperstackVM
       info 'Bootstrapping Ubuntu guest over SSH...'
       retries = 3
       retries.times do |attempt|
-        stdout, stderr, status = @ssh_command_runner.call(host, @scripts.guest_bootstrap_script)
+        # Stream output so apt-lock waits and individual bootstrap steps are visible in real time.
+        output, status = @ssh_stream_runner.call(host, @scripts.guest_bootstrap_script)
         return if status.success?
 
-        msg = stderr.strip.empty? ? stdout : stderr
+        msg = output.lines.last&.strip || output.strip
         raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1
 
-        warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..."
+        warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg}), retrying in 15s..."
         sleep 15
       end
     end
author	Paul Buetow <paul@buetow.org>	2026-03-26 08:35:41 +0200
committer	Paul Buetow <paul@buetow.org>	2026-03-26 08:35:41 +0200
commit	0771e0eccca45cfc1ea439852b779062c418673c (patch)
tree	29e146d9dabea0ea7cb4aee738e040fc91a9aac2 /lib
parent	6f33d31f3a3726a554a6454a92ab9d969dd3f4e4 (diff)