diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-26 08:35:41 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-26 08:35:41 +0200 |
| commit | 0771e0eccca45cfc1ea439852b779062c418673c (patch) | |
| tree | 29e146d9dabea0ea7cb4aee738e040fc91a9aac2 /lib | |
| parent | 6f33d31f3a3726a554a6454a92ab9d969dd3f4e4 (diff) | |
hyperstack: fix TOML paths, add live provisioning progress, and auto end-to-end test on create

- cli: introduce REPO_ROOT constant so create-both/delete-both/watch
  find TOML configs at the repo root instead of lib/hyperstack/
- manager: with_polling prints a heartbeat every 30s so silent waits
  (SSH, VM ready, etc.) are visibly alive
- provisioning: bootstrap_guest streams SSH output in real time so
  apt-lock waits and setup steps are visible as they happen
- provisioning: vLLM wait loop reads docker logs to show the current
  startup stage (shard loading %, torch.compile, CUDA graphs, API up)
  instead of a plain "not ready yet" counter
- manager: create automatically runs the end-to-end inference test
  after provisioning completes, removing the manual 'test' step

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/hyperstack/cli.rb | 16 | ||||
| -rw-r--r-- | lib/hyperstack/manager.rb | 15 | ||||
| -rw-r--r-- | lib/hyperstack/provisioning.rb | 26 |
3 files changed, 37 insertions, 20 deletions
diff --git a/lib/hyperstack/cli.rb b/lib/hyperstack/cli.rb index f575b59..8568474 100644 --- a/lib/hyperstack/cli.rb +++ b/lib/hyperstack/cli.rb @@ -4,9 +4,13 @@ require 'optparse' module HyperstackVM class CLI + # Repo root is two levels above this file (lib/hyperstack/ → lib/ → repo root). + # All TOML config files live at the repo root, not alongside this library file. + REPO_ROOT = File.expand_path(File.join(__dir__, '..', '..')) + def initialize(argv) @argv = argv.dup - @config_path = File.join(__dir__, 'hyperstack-vm.toml') + @config_path = File.join(REPO_ROOT, 'hyperstack-vm.toml') @config_explicit = false end @@ -212,9 +216,9 @@ module HyperstackVM candidates = [ @config_path, - File.join(__dir__, 'hyperstack-vm1-gptoss.toml'), - File.join(__dir__, 'hyperstack-vm2.toml'), - File.join(__dir__, 'hyperstack-vm-photo.toml') + File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml'), + File.join(REPO_ROOT, 'hyperstack-vm2.toml'), + File.join(REPO_ROOT, 'hyperstack-vm-photo.toml') ].uniq.select { |path| File.exist?(path) } loaders = candidates.map { |path| ConfigLoader.load(path) } @@ -224,8 +228,8 @@ module HyperstackVM def pair_config_loaders [ - ConfigLoader.load(File.join(__dir__, 'hyperstack-vm1-gptoss.toml')), - ConfigLoader.load(File.join(__dir__, 'hyperstack-vm2.toml')) + ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm1-gptoss.toml')), + ConfigLoader.load(File.join(REPO_ROOT, 'hyperstack-vm2.toml')) ] end diff --git a/lib/hyperstack/manager.rb b/lib/hyperstack/manager.rb index 0d17b2f..2134c92 100644 --- a/lib/hyperstack/manager.rb +++ b/lib/hyperstack/manager.rb @@ -343,15 +343,8 @@ module HyperstackVM info "VM ready: #{state['public_ip']} (id=#{state['vm_id']})" print_local_wireguard_summary(state['public_ip']) - wg_ip = @config.wireguard_gateway_hostname - if effective_vllm? - info "Run 'ruby hyperstack.rb test' to verify vLLM." - info " vLLM: http://#{wg_ip}:#{@config.ollama_port}/v1/models" - end - return unless effective_comfyui? 
- - info "Run 'ruby hyperstack.rb test' to verify ComfyUI." - info " ComfyUI: http://#{wg_ip}:#{@config.comfyui_port}/system_stats" + # Run end-to-end tests automatically so the human doesn't need a manual step. + test info " Enhance: ruby photo-enhance.rb --config #{File.basename(@config.path)} --indir ~/Pictures --outdir ~/Pictures/enhanced" end @@ -693,12 +686,16 @@ module HyperstackVM def with_polling(description, timeout: 900, interval: 5) deadline = Time.now + timeout + attempt = 0 loop do result = yield return result if result raise Error, "Timed out waiting for #{description}." if Time.now >= deadline + attempt += 1 + # Print a heartbeat every 30 seconds so the user can see the script hasn't stalled. + info(" still waiting for #{description}... (#{attempt * interval}s)") if (attempt % 6).zero? sleep interval end end diff --git a/lib/hyperstack/provisioning.rb b/lib/hyperstack/provisioning.rb index eb3518e..fd1e212 100644 --- a/lib/hyperstack/provisioning.rb +++ b/lib/hyperstack/provisioning.rb @@ -204,11 +204,26 @@ module HyperstackVM script << "docker rm #{Shellwords.escape(container)} 2>/dev/null || true" script << 'docker pull vllm/vllm-openai:latest' if pull_image script << docker_run - script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."' + # Stage patterns cover the full vLLM startup sequence: + # HuggingFace download → safetensors shard loading → torch.compile → CUDA graphs → API up. + # The sed strip removes the "(EngineCore pid=N) INFO date time [file.py:line] " log prefix + # so only the human-readable message is shown. 
+ stage_pat = 'Starting to load model|Fetching|Downloading shards|checkpoint shards:.*% Completed' \ + '|Loading weights took|Model loading took|torch\\.compile took' \ + '|Graph capturing|Application startup complete' + strip_pfx = 's/^\\([A-Za-z]+ [^)]+\\) INFO [^ ]+ [^ ]+ \\[[^]]+\\] //' + script << 'echo "Waiting for vLLM to become ready (live progress from container logs)..."' + script << "stage_pat='#{stage_pat}'" + script << "strip_pfx='#{strip_pfx}'" script << 'for i in $(seq 1 240); do' script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" - script << ' echo " vLLM not ready yet ($i/240, container=$state)..."' + script << " progress=$(docker logs --tail 100 #{Shellwords.escape(container)} 2>&1 | grep -E \"$stage_pat\" | tail -1 | sed -E \"$strip_pfx\" | cut -c1-100)" + script << ' if [ -n "$progress" ]; then' + script << ' echo " vLLM ($i/240, $state): $progress"' + script << ' else' + script << ' echo " vLLM not ready yet ($i/240, container=$state)..."' + script << ' fi' script << ' sleep 5' script << 'done' script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }" @@ -375,13 +390,14 @@ module HyperstackVM info 'Bootstrapping Ubuntu guest over SSH...' retries = 3 retries.times do |attempt| - stdout, stderr, status = @ssh_command_runner.call(host, @scripts.guest_bootstrap_script) + # Stream output so apt-lock waits and individual bootstrap steps are visible in real time. + output, status = @ssh_stream_runner.call(host, @scripts.guest_bootstrap_script) return if status.success? - msg = stderr.strip.empty? ? 
stdout : stderr + msg = output.lines.last&.strip || output.strip raise Error, "Guest bootstrap failed after #{retries} attempts: #{msg}" if attempt == retries - 1 - warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg.lines.last&.strip}), retrying in 15s..." + warn "Bootstrap attempt #{attempt + 1}/#{retries} failed (#{msg}), retrying in 15s..." sleep 15 end end |
