diff options
| author | Paul Buetow <paul@buetow.org> | 2026-03-24 23:35:00 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2026-03-24 23:35:00 +0200 |
| commit | 9f45954847e7aa904ebda55e3c23277d7c7a3079 (patch) | |
| tree | 2a30a1782c21b0616a70c8536aa07a4019df7213 | |
| parent | 862b132ddee2cc343a3edc98797554937ea5f595 (diff) | |
hyperstack: gpt-oss-120b + qwen3-coder-next dual-VM pair on A100x1
- Add hyperstack-vm1-gptoss.toml: A100x1 config for gpt-oss-120b (VM1)
and qwen3-coder-next (VM2) pair, replacing the H100x2 default
- Fix pi/agent/models.json: hyperstack provider URL was pointing at
hyperstack.wg1 (unresolvable); corrected to hyperstack1.wg1 (192.168.3.1)
- Update hyperstack.rb, hypr.fish: reference vm1-gptoss.toml for create-both
and pair commands; update fish abbrs for the new pair setup
- Update ask-mode/utils.ts: allow read-only 'ask' commands in ask-mode
- Update agent-plan-mode/utils.ts: tighten isAskCommand check
- Add state files for provisioned vm1/vm2 instances
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
| -rw-r--r-- | .hyperstack-vm1-state.json | 90 | ||||
| -rw-r--r-- | .hyperstack-vm2-state.json | 90 | ||||
| -rw-r--r-- | hyperstack-vm1-gptoss.toml | 173 | ||||
| -rwxr-xr-x | hyperstack.rb | 206 | ||||
| -rw-r--r-- | hypr.fish | 16 | ||||
| -rw-r--r-- | pi/agent/extensions/agent-plan-mode/utils.ts | 6 | ||||
| -rw-r--r-- | pi/agent/extensions/ask-mode/utils.ts | 25 | ||||
| -rw-r--r-- | pi/agent/models.json | 2 | ||||
| -rw-r--r-- | pi/agent/settings.json | 3 |
9 files changed, 511 insertions, 100 deletions
diff --git a/.hyperstack-vm1-state.json b/.hyperstack-vm1-state.json new file mode 100644 index 0000000..acb9185 --- /dev/null +++ b/.hyperstack-vm1-state.json @@ -0,0 +1,90 @@ +{ + "vm_id": 698908, + "vm_name": "hyperstack1-20260324205516", + "environment_name": "snonux-ollama", + "region": "CANADA-1", + "flavor_name": "n3-A100x1", + "image_name": "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker", + "key_name": "earth", + "public_ip": "69.19.136.193", + "created_at": "2026-03-24T20:55:17Z", + "services": { + "vllm_enabled": true, + "ollama_enabled": false, + "comfyui_enabled": false + }, + "security_rules": [ + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 22, + "port_range_max": 22, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "udp", + "port_range_min": 56710, + "port_range_max": 56710, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 11434, + "port_range_max": 11434, + "remote_ip_prefix": "192.168.3.0/24" + }, + { + "direction": "egress", + "ethertype": "IPv6", + "protocol": "any", + "port_range_min": 1, + "port_range_max": 65535, + "remote_ip_prefix": "0.0.0.0/0" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "udp", + "port_range_min": 56710, + "port_range_max": 56710, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 22, + "port_range_max": 22, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "egress", + "ethertype": "IPv4", + "protocol": "any", + "port_range_min": 1, + "port_range_max": 65535, + "remote_ip_prefix": "0.0.0.0/0" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 11434, + "port_range_max": 11434, + "remote_ip_prefix": "192.168.3.0/24" + } + ], + "bootstrapped_at": "2026-03-24T20:57:32Z", + "vllm_setup_at": "2026-03-24T21:13:42Z", + "vllm_model": "openai/gpt-oss-120b", + "vllm_container_name": "vllm_gpt_oss_120b", + "vllm_preset": null, + "status": "ACTIVE", + "vm_state": "active", + "provisioned_at": "2026-03-24T21:13:43Z" +}
\ No newline at end of file diff --git a/.hyperstack-vm2-state.json b/.hyperstack-vm2-state.json new file mode 100644 index 0000000..8480b29 --- /dev/null +++ b/.hyperstack-vm2-state.json @@ -0,0 +1,90 @@ +{ + "vm_id": 698909, + "vm_name": "hyperstack2-20260324205518", + "environment_name": "snonux-ollama", + "region": "CANADA-1", + "flavor_name": "n3-A100x1", + "image_name": "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker", + "key_name": "earth", + "public_ip": "69.19.136.171", + "created_at": "2026-03-24T20:55:19Z", + "services": { + "vllm_enabled": true, + "ollama_enabled": false, + "comfyui_enabled": false + }, + "security_rules": [ + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 22, + "port_range_max": 22, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "udp", + "port_range_min": 56710, + "port_range_max": 56710, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 11434, + "port_range_max": 11434, + "remote_ip_prefix": "192.168.3.0/24" + }, + { + "direction": "egress", + "ethertype": "IPv4", + "protocol": "any", + "port_range_min": 1, + "port_range_max": 65535, + "remote_ip_prefix": "0.0.0.0/0" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 22, + "port_range_max": 22, + "remote_ip_prefix": "79.100.218.77/32" + }, + { + "direction": "egress", + "ethertype": "IPv6", + "protocol": "any", + "port_range_min": 1, + "port_range_max": 65535, + "remote_ip_prefix": "0.0.0.0/0" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "tcp", + "port_range_min": 11434, + "port_range_max": 11434, + "remote_ip_prefix": "192.168.3.0/24" + }, + { + "direction": "ingress", + "ethertype": "IPv4", + "protocol": "udp", + "port_range_min": 56710, + "port_range_max": 56710, + "remote_ip_prefix": "79.100.218.77/32" + } + ], + "bootstrapped_at": "2026-03-24T20:57:46Z", + "vllm_setup_at": "2026-03-24T21:30:54Z", + "vllm_model": "bullpoint/Qwen3-Coder-Next-AWQ-4bit", + "vllm_container_name": "vllm_qwen3", + "vllm_preset": null, + "status": "ACTIVE", + "vm_state": "active", + "provisioned_at": "2026-03-24T21:30:54Z" +}
\ No newline at end of file diff --git a/hyperstack-vm1-gptoss.toml b/hyperstack-vm1-gptoss.toml new file mode 100644 index 0000000..80f908c --- /dev/null +++ b/hyperstack-vm1-gptoss.toml @@ -0,0 +1,173 @@ +[auth] +api_key_file = "~/.hyperstack" + +[hyperstack] +base_url = "https://infrahub-api.nexgencloud.com/v1" + +[state] +# Separate state file for VM1 so vm1 and vm2 can be managed independently. +file = ".hyperstack-vm1-state.json" + +[vm] +name_prefix = "hyperstack1" +hostname = "hyperstack1" +environment_name = "snonux-ollama" + +# A100-80GB single GPU for gpt-oss-120b +flavor_name = "n3-A100x1" +image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker" +assign_floating_ip = true +create_bootable_volume = false +enable_port_randomization = false +labels = ["gpt-oss-120b", "wireguard"] + +[ssh] +username = "ubuntu" +private_key_path = "~/.ssh/id_rsa" +hyperstack_key_name = "earth" +port = 22 +connect_timeout_sec = 10 + +[network] +wireguard_udp_port = 56710 +wireguard_subnet = "192.168.3.0/24" +# VM1 gets the first server-side WireGuard IP (gateway address + 0). +# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3. +wireguard_server_ip = "192.168.3.1" +# Secure default: "auto" resolves your current public egress IP to /32 at runtime. +# Override with explicit CIDRs if you deploy from multiple networks or want broader access. +allowed_ssh_cidrs = ["auto"] +allowed_wireguard_cidrs = ["auto"] +# Port 11434 is shared by both Ollama and vLLM for firewall compatibility. +ollama_port = 11434 + +[bootstrap] +enable_guest_bootstrap = true +install_wireguard = true +configure_ufw = true +configure_ollama_host = false + +[ollama] +# Disabled in favour of vLLM; set install = true to switch back to Ollama. +install = false +models_dir = "/ephemeral/ollama/models" +listen_host = "0.0.0.0:11434" +gpu_overhead_mb = 2000 +num_parallel = 1 +context_length = 32768 +pull_models = ["qwen3-coder-next", "qwen3-coder:30b", "gpt-oss:20b", "gpt-oss:120b", "nemotron-3-super"] + +# vLLM serves one model via Docker on the OpenAI-compatible API. +[vllm] +install = true +model = "openai/gpt-oss-120b" +# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers). +hug_cache_dir = "/ephemeral/hug" +container_name = "vllm_gpt_oss_120b" +# Hard architecture limit: max_position_embeddings=131072 in model config.json. +max_model_len = 131072 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +# tool_call_parser="" disables --enable-auto-tool-choice; the llama3_json parser crashes +# on gpt-oss responses (vLLM 0.17.1 adds token_ids to responses, breaking the parser API). +tool_call_parser = "" + +# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1-gptoss.toml model switch <name>'. +# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults. + +[vllm.presets.qwen3-coder-next] +model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit" +container_name = "vllm_qwen3" +max_model_len = 262144 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_coder" + +# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total). +[vllm.presets.nemotron-super] +model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit" +container_name = "vllm_nemotron_super" +max_model_len = 1048576 +gpu_memory_utilization = 0.85 +tensor_parallel_size = 2 +tool_call_parser = "qwen3_xml" +trust_remote_code = true +enable_prefix_caching = false +extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"] +extra_vllm_args = ["--reasoning-parser", "nemotron_v3"] + +# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100. +[vllm.presets.gpt-oss-20b] +model = "openai/gpt-oss-20b" +container_name = "vllm_gpt_oss_20b" +max_model_len = 65536 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" + +# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100. +# Hard architecture limit: max_position_embeddings=131072 in model config.json. +[vllm.presets.gpt-oss-120b] +model = "openai/gpt-oss-120b" +container_name = "vllm_gpt_oss_120b" +max_model_len = 131072 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" + +# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100. +[vllm.presets.qwen25-coder-32b] +model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" +container_name = "vllm_qwen25_coder32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "hermes" + +# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB. +[vllm.presets.qwen3-coder-30b] +model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ" +container_name = "vllm_qwen3_coder30b" +max_model_len = 65536 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "qwen3_coder" + +# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100. +[vllm.presets.deepseek-r1-32b] +model = "casperhansen/deepseek-r1-distill-qwen-32b-awq" +container_name = "vllm_deepseek_r1_32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" +extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] + +# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100. +[vllm.presets.qwen3-32b] +model = "Qwen/Qwen3-32B-AWQ" +container_name = "vllm_qwen3_32b" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "" +extra_vllm_args = ["--reasoning-parser", "deepseek_r1"] + +# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100). +[vllm.presets.devstral] +model = "cyankiwi/Devstral-Small-2507-AWQ-4bit" +container_name = "vllm_devstral" +max_model_len = 32768 +gpu_memory_utilization = 0.92 +tensor_parallel_size = 1 +tool_call_parser = "mistral" +extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"] + +[wireguard] +auto_setup = true +setup_script = "./wg1-setup.sh" + +[local_client] +check_wg1_service = true +interface_name = "wg1" +config_path = "/etc/wireguard/wg1.conf" diff --git a/hyperstack.rb b/hyperstack.rb index 61f53ec..5065fd1 100755 --- a/hyperstack.rb +++ b/hyperstack.rb @@ -169,17 +169,18 @@ module HyperstackVM end server_ip = fetch('network', 'wireguard_server_ip') - if server_ip - # Validate that the explicit server WireGuard IP is within the configured subnet. - begin - subnet = IPAddr.new(fetch('network', 'wireguard_subnet')) - unless subnet.include?(IPAddr.new(server_ip)) - raise Error, - "wireguard_server_ip #{server_ip.inspect} is not in wireguard_subnet #{fetch('network', 'wireguard_subnet')}" - end - rescue IPAddr::InvalidAddressError => e - raise Error, "Invalid wireguard_server_ip #{server_ip.inspect}: #{e.message}" + return unless server_ip + + # Validate that the explicit server WireGuard IP is within the configured subnet. + begin + subnet = IPAddr.new(fetch('network', 'wireguard_subnet')) + unless subnet.include?(IPAddr.new(server_ip)) + raise Error, + "wireguard_server_ip #{server_ip.inspect} is not in wireguard_subnet #{fetch('network', + 'wireguard_subnet')}" end + rescue IPAddr::InvalidAddressError => e + raise Error, "Invalid wireguard_server_ip #{server_ip.inspect}: #{e.message}" end end @@ -471,7 +472,7 @@ module HyperstackVM # mode which pre-allocates states for all sequences, consuming extra VRAM on startup. def vllm_prefix_caching_enabled? val = dig('vllm', 'enable_prefix_caching') - val.nil? ? true : truthy?(val) + val.nil? || truthy?(val) end def vllm_presets @@ -626,7 +627,8 @@ module HyperstackVM def fetch_public_cidr(url) uri = URI(url) - response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', open_timeout: 5, read_timeout: 5) do |http| + response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https', open_timeout: 5, + read_timeout: 5) do |http| http.request(Net::HTTP::Get.new(uri)) end return nil unless response.is_a?(Net::HTTPSuccess) @@ -1288,13 +1290,13 @@ module HyperstackVM script << 'docker pull vllm/vllm-openai:latest' if pull_image script << docker_run script << 'echo "Waiting for vLLM to become ready (up to 10 min for first model download)..."' - script << 'for i in $(seq 1 120); do' + script << 'for i in $(seq 1 240); do' script << " if curl -sf http://localhost:#{port}/v1/models >/dev/null 2>&1; then echo vllm-ready; break; fi" script << " state=$(docker inspect --format='{{.State.Status}}' #{Shellwords.escape(container)} 2>/dev/null || echo unknown)" - script << ' echo " vLLM not ready yet ($i/120, container=$state)..."' + script << ' echo " vLLM not ready yet ($i/240, container=$state)..."' script << ' sleep 5' script << 'done' - script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 10 minutes'; exit 1; }" + script << "curl -sf http://localhost:#{port}/v1/models >/dev/null || { echo 'FATAL: vLLM did not become ready within 20 minutes'; exit 1; }" script << 'echo vllm-install-ok' script.join("\n") end @@ -1367,7 +1369,7 @@ module HyperstackVM script << 'echo "Waiting for ComfyUI to become ready (up to 5 min)..."' script << 'for i in $(seq 1 60); do' script << " if curl -sf http://localhost:#{port}/system_stats >/dev/null 2>&1; then echo comfyui-ready; break; fi" - script << " echo \" ComfyUI not ready yet ($i/60)...\"; sleep 5" + script << ' echo " ComfyUI not ready yet ($i/60)..."; sleep 5' script << 'done' script << "curl -sf http://localhost:#{port}/system_stats >/dev/null || { echo 'FATAL: ComfyUI did not become ready within 5 minutes'; exit 1; }" @@ -1411,7 +1413,7 @@ module HyperstackVM script << 'echo "Waiting for ComfyUI restart..."' script << 'for i in $(seq 1 60); do' script << " if curl -sf http://localhost:#{port}/system_stats >/dev/null 2>&1; then echo comfyui-ready; break; fi" - script << " echo \" ComfyUI not ready yet ($i/60)...\"; sleep 5" + script << ' echo " ComfyUI not ready yet ($i/60)..."; sleep 5' script << 'done' script << 'echo comfyui-install-ok' @@ -1443,7 +1445,6 @@ module HyperstackVM ordered << normalized end end - end class RemoteProvisioner @@ -1520,7 +1521,11 @@ module HyperstackVM return unless status.success? remote_models = stdout.lines.drop(1).map { |line| line.split.first }.compact - missing = @scripts.desired_ollama_models.reject { |model| remote_models.any? { |remote| remote.start_with?(model) } } + missing = @scripts.desired_ollama_models.reject do |model| + remote_models.any? do |remote| + remote.start_with?(model) + end + end return if missing.empty? raise Error, "Models missing after setup: #{missing.join(', ')}. Remote has: #{remote_models.join(', ')}" @@ -1555,7 +1560,8 @@ module HyperstackVM @wg_setup_post = wg_setup_post end - def create(replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, install_comfyui: nil, vllm_preset: nil) + def create(replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, install_comfyui: nil, + vllm_preset: nil) # CLI flags override config; nil means "use config default". @effective_vllm = install_vllm.nil? ? @config.vllm_install_enabled? : install_vllm @effective_ollama = install_ollama.nil? ? @config.ollama_install_enabled? : install_ollama @@ -1624,6 +1630,7 @@ module HyperstackVM state = @state_store.load target_vm_id = vm_id || state&.dig('vm_id') raise Error, "No VM ID provided and no state file found at #{@state_store.path}." if target_vm_id.nil? + cleanup_local = !skip_local_cleanup && state && target_vm_id == state['vm_id'] if dry_run @@ -1666,7 +1673,8 @@ module HyperstackVM info "Tracked VM: #{state['vm_id']} #{vm['name']}" info "Status: #{vm['status']} / #{vm['vm_state']}" info "Public IP: #{connect_host_for(vm) || 'none'}" - info "Service mode: #{service_mode_summary(vllm_enabled: vllm_enabled, ollama_enabled: ollama_enabled, comfyui_enabled: comfyui_enabled)}" + info "Service mode: #{service_mode_summary(vllm_enabled: vllm_enabled, ollama_enabled: ollama_enabled, + comfyui_enabled: comfyui_enabled)}" info "Active model: #{state['vllm_model'] || @config.vllm_model}" if vllm_enabled if comfyui_enabled wg_ip = @config.wireguard_gateway_hostname @@ -1737,9 +1745,7 @@ module HyperstackVM @provisioner.decommission_litellm(host) # Stop the old container only when it has a different name from the new one. - if old_container != new_container - @provisioner.stop_vllm_container(host, old_container) - end + @provisioner.stop_vllm_container(host, old_container) if old_container != new_container info "Starting vLLM with preset '#{preset_name}' (#{preset['model']})..." # Skip docker pull: image is already present; pulling on every switch risks a @@ -1770,9 +1776,7 @@ module HyperstackVM ollama_enabled = state_ollama_enabled?(state) info "Running end-to-end inference tests via WireGuard (#{wg_ip})..." - if vllm_enabled - test_vllm(wg_ip) - end + test_vllm(wg_ip) if vllm_enabled info " Ollama test: connect via SSH and run 'ollama list' to verify models." if ollama_enabled @@ -1876,11 +1880,11 @@ module HyperstackVM info "Run 'ruby hyperstack.rb test' to verify vLLM." info " vLLM: http://#{wg_ip}:#{@config.ollama_port}/v1/models" end - if effective_comfyui? - info "Run 'ruby hyperstack.rb test' to verify ComfyUI." - info " ComfyUI: http://#{wg_ip}:#{@config.comfyui_port}/system_stats" - info " Enhance: ruby photo-enhance.rb --config #{File.basename(@config.path)} --indir ~/Pictures --outdir ~/Pictures/enhanced" - end + return unless effective_comfyui? + + info "Run 'ruby hyperstack.rb test' to verify ComfyUI." + info " ComfyUI: http://#{wg_ip}:#{@config.comfyui_port}/system_stats" + info " Enhance: ruby photo-enhance.rb --config #{File.basename(@config.path)} --indir ~/Pictures --outdir ~/Pictures/enhanced" end def build_create_payload(vm_name, resolved) @@ -2048,7 +2052,8 @@ module HyperstackVM 'HYPERSTACK_SSH_PRIVATE_KEY_PATH' => (File.exist?(@config.ssh_private_key_path) ? @config.ssh_private_key_path : '') } - Open3.popen2e(env, 'bash', @config.wireguard_setup_script, host, server_ip, wg_hostname) do |stdin, output, wait_thr| + Open3.popen2e(env, 'bash', @config.wireguard_setup_script, host, server_ip, + wg_hostname) do |stdin, output, wait_thr| stdin.sync = true stdin.puts stdin.close @@ -2325,8 +2330,12 @@ module HyperstackVM return end - output.puts("DRY RUN: local WireGuard peers would be removed for #{peer_summary}.") unless cleanup[:peers].empty? - output.puts("DRY RUN: local host entries would be removed for #{host_summary}.") unless cleanup[:hostnames].empty? + unless cleanup[:peers].empty? + output.puts("DRY RUN: local WireGuard peers would be removed for #{peer_summary}.") + end + unless cleanup[:hostnames].empty? + output.puts("DRY RUN: local host entries would be removed for #{host_summary}.") + end return end @@ -2392,9 +2401,7 @@ module HyperstackVM models = @scripts.desired_ollama_models info "Ollama models to pre-pull: #{models.join(', ')}" unless models.empty? end - if vllm_setup_needed?(state) - info "vLLM would be installed: #{@config.vllm_model}" - end + info "vLLM would be installed: #{@config.vllm_model}" if vllm_setup_needed?(state) if wireguard_setup_needed?(state) info "WireGuard auto-setup script would run: #{@config.wireguard_setup_script} #{state['public_ip'] || '<pending-public-ip>'}" end @@ -2559,7 +2566,11 @@ module HyperstackVM info 'Local WireGuard has peers for all managed VM IPs.' else present = expected_endpoints - missing - info "Local WireGuard has peers for: #{present.map { |endpoint| endpoint.split(':', 2).first }.join(', ')}" unless present.empty? + unless present.empty? + info "Local WireGuard has peers for: #{present.map do |endpoint| + endpoint.split(':', 2).first + end.join(', ')}" + end warn "Local WireGuard missing peers for: #{missing.map { |endpoint| endpoint.split(':', 2).first }.join(', ')}." end end @@ -2711,11 +2722,9 @@ module HyperstackVM ssh = build_ssh_command(config, wg_host) stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) } - unless status.success? - return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] - end + return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success? - gpu_section, rest = stdout.split("===COMFYUI===\n", 2) + gpu_section, rest = stdout.split("===COMFYUI===\n", 2) queue_section, hist_section = rest.to_s.split("===HISTORY===\n", 2) gpus = parse_nvidia_smi(gpu_section.to_s) metrics = parse_comfyui_queue(queue_section.to_s.strip, hist_section.to_s.strip) @@ -2724,7 +2733,11 @@ module HyperstackVM # Parse ComfyUI /queue JSON into a plain Hash. def parse_comfyui_queue(queue_json, history_count_str) - q = JSON.parse(queue_json) rescue {} + q = begin + JSON.parse(queue_json) + rescue StandardError + {} + end { 'queue_running' => Array(q['queue_running']).size, 'queue_pending' => Array(q['queue_pending']).size, @@ -2747,9 +2760,7 @@ module HyperstackVM ssh = build_ssh_command(config, wg_host) stdout, stderr, status = Timeout.timeout(15) { Open3.capture3(*ssh, stdin_data: script) } - unless status.success? - return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] - end + return [nil, nil, "exit #{status.exitstatus}: #{stderr.strip}"] unless status.success? gpu_section, vllm_section = stdout.split("===VLLM===\n", 2) gpus = parse_nvidia_smi(gpu_section.to_s) @@ -2769,13 +2780,13 @@ module HyperstackVM return {} if line.empty? { - 'avg_prompt_throughput' => extract_float(line, /Avg prompt throughput:\s*([\d.]+)/), - 'avg_generation_throughput' => extract_float(line, /Avg generation throughput:\s*([\d.]+)/), - 'running' => extract_float(line, /Running:\s*(\d+)\s*reqs/), - 'pending' => extract_float(line, /Waiting:\s*(\d+)\s*reqs/), - 'swapped' => extract_float(line, /Swapped:\s*(\d+)\s*reqs/), - 'gpu_cache_usage_pct' => extract_float(line, /GPU KV cache usage:\s*([\d.]+)%/), - 'gpu_prefix_cache_hit_rate_pct' => extract_float(line, /Prefix cache hit rate:\s*([\d.]+)%/), + 'avg_prompt_throughput' => extract_float(line, /Avg prompt throughput:\s*([\d.]+)/), + 'avg_generation_throughput' => extract_float(line, /Avg generation throughput:\s*([\d.]+)/), + 'running' => extract_float(line, /Running:\s*(\d+)\s*reqs/), + 'pending' => extract_float(line, /Waiting:\s*(\d+)\s*reqs/), + 'swapped' => extract_float(line, /Swapped:\s*(\d+)\s*reqs/), + 'gpu_cache_usage_pct' => extract_float(line, /GPU KV cache usage:\s*([\d.]+)%/), + 'gpu_prefix_cache_hit_rate_pct' => extract_float(line, /Prefix cache hit rate:\s*([\d.]+)%/) }.compact end @@ -2812,11 +2823,11 @@ module HyperstackVM next if parts.length < 7 GpuInfo.new( - index: parts[0].to_i, - name: parts[1], - temp_c: parts[2].to_f, - util_pct: parts[3].to_f, - power_w: parts[4].to_f, + index: parts[0].to_i, + name: parts[1], + temp_c: parts[2].to_f, + util_pct: parts[3].to_f, + power_w: parts[4].to_f, mem_used_mib: parts[5].to_f, mem_total_mib: parts[6].to_f ) @@ -2860,7 +2871,10 @@ module HyperstackVM # Single VM: simple vertical layout. rule = DIM + ('─' * 72) + RESET lines = [header, rule] - panels.each { |p| lines << ''; lines.concat(p) } + panels.each do |p| + lines << '' + lines.concat(p) + end lines << '' end @@ -2895,7 +2909,7 @@ module HyperstackVM lines.concat(render_comfyui_metrics(snap.metrics)) elsif snap.metrics&.any? lines.concat(render_vllm_metrics(snap.metrics)) - elsif snap.metrics&.empty? + elsif snap.metrics && snap.metrics.empty? lines << " #{DIM}(no Engine log line yet — container may still be loading)#{RESET}" end end @@ -2968,7 +2982,11 @@ module HyperstackVM # Colour: green below 50%, yellow 50–79%, red 80%+. def pct_bar(pct, width) filled = [(pct / 100.0 * width).round, width].min - color = pct >= 80 ? RED : pct >= 50 ? YELLOW : GREEN + color = if pct >= 80 + RED + else + pct >= 50 ? YELLOW : GREEN + end "[#{color}#{'█' * filled}#{RESET}#{' ' * (width - filled)}]" end @@ -2996,12 +3014,12 @@ module HyperstackVM puts 'Commands:' puts ' create [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama] [--model PRESET]' puts ' create-both [--replace] [--dry-run] [--vllm|--no-vllm] [--ollama|--no-ollama]' - puts ' Provision hyperstack-vm1.toml and hyperstack-vm2.toml concurrently.' + puts ' Provision hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml concurrently.' puts ' WireGuard setup is serialized: VM1 writes the base wg1.conf first,' puts ' then VM2 adds its peer. Requires both TOML files next to the script.' puts ' delete [--vm-id ID] [--dry-run]' puts ' delete-both [--dry-run]' - puts ' Delete the VMs tracked by hyperstack-vm1.toml and hyperstack-vm2.toml.' + puts ' Delete the VMs tracked by hyperstack-vm1-gptoss.toml and hyperstack-vm2.toml.' puts ' status' puts ' watch' puts ' Poll all active VMs for vLLM and GPU stats every 60 s.' @@ -3093,7 +3111,8 @@ module HyperstackVM raise Error, "Unknown model subcommand #{sub.inspect}. Use list or switch." end else - raise Error, "Unknown command #{command.inspect}. Use create, create-both, delete, delete-both, status, watch, test, or model." + raise Error, + "Unknown command #{command.inspect}. Use create, create-both, delete, delete-both, status, watch, test, or model." end end @@ -3104,7 +3123,8 @@ module HyperstackVM # (create-both), the --model flag is not registered because each VM uses its own # TOML default. Returns a hash suitable for splatting into Manager#create. def parse_create_options(argv, include_model_preset: true) - opts = { replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, install_comfyui: nil, vllm_preset: nil } + opts = { replace: false, dry_run: false, install_vllm: nil, install_ollama: nil, install_comfyui: nil, + vllm_preset: nil } OptionParser.new do |o| o.on('--replace', 'Delete the tracked VM before creating a new one') { opts[:replace] = true } o.on('--dry-run', 'Print the create plan without creating a VM') { opts[:dry_run] = true } @@ -3114,7 +3134,11 @@ module HyperstackVM o.on('--no-ollama', 'Disable Ollama setup (overrides config)') { opts[:install_ollama] = false } o.on('--comfyui', 'Enable ComfyUI setup (overrides config)') { opts[:install_comfyui] = true } o.on('--no-comfyui', 'Disable ComfyUI setup (overrides config)') { opts[:install_comfyui] = false } - o.on('--model PRESET', 'Use a named vLLM preset at create time') { |v| opts[:vllm_preset] = v } if include_model_preset + if include_model_preset + o.on('--model PRESET', 'Use a named vLLM preset at create time') do |v| + opts[:vllm_preset] = v + end + end end.parse!(argv) opts end @@ -3134,20 +3158,20 @@ module HyperstackVM client = HyperstackClient.new(base_url: config.api_base_url, api_key: config.api_key) local_wireguard = build_local_wireguard(config) Manager.new( - config: config, - client: client, - state_store: state_store, + config: config, + client: client, + state_store: state_store, local_wireguard: local_wireguard, - out: out, - wg_setup_pre: wg_setup_pre, - wg_setup_post: wg_setup_post + out: out, + wg_setup_pre: wg_setup_pre, + wg_setup_post: wg_setup_post ) end def build_local_wireguard(config) LocalWireGuard.new( interface_name: config.local_interface_name, - config_path: config.local_wg_config_path + config_path: config.local_wg_config_path ) end @@ -3156,9 +3180,7 @@ module HyperstackVM # that `status` would show (honours --config if given explicitly). def run_watch loaders = status_config_loaders - if loaders.empty? - raise Error, 'No active VMs found. Run `create` or `create-both` first.' - end + raise Error, 'No active VMs found. Run `create` or `create-both` first.' if loaders.empty? VllmWatcher.new(config_loaders: loaders).run end @@ -3188,7 +3210,7 @@ module HyperstackVM candidates = [ @config_path, - File.join(__dir__, 'hyperstack-vm1.toml'), + File.join(__dir__, 'hyperstack-vm1-gptoss.toml'), File.join(__dir__, 'hyperstack-vm2.toml'), File.join(__dir__, 'hyperstack-vm-photo.toml') ].uniq.select { |path| File.exist?(path) } @@ -3200,7 +3222,7 @@ module HyperstackVM def pair_config_loaders [ - ConfigLoader.load(File.join(__dir__, 'hyperstack-vm1.toml')), + ConfigLoader.load(File.join(__dir__, 'hyperstack-vm1-gptoss.toml')), ConfigLoader.load(File.join(__dir__, 'hyperstack-vm2.toml')) ] end @@ -3222,21 +3244,24 @@ module HyperstackVM # VM1 signals the latch after its WG step (whether WG ran or was already done). vm1_wg_post = proc do - wg_mutex.synchronize { vm1_wg_state[:done] = true; wg_cv.broadcast } + wg_mutex.synchronize do + vm1_wg_state[:done] = true + wg_cv.broadcast + end end # VM2 blocks here until VM1's WG step resolves, then raises if VM1 failed. vm2_wg_pre = proc do wg_mutex.synchronize { wg_cv.wait(wg_mutex) until vm1_wg_state[:done] || vm1_wg_state[:error] } - raise Error, "VM1 WireGuard setup failed; cannot add VM2 peer." if vm1_wg_state[:error] + raise Error, 'VM1 WireGuard setup failed; cannot add VM2 peer.' if vm1_wg_state[:error] end manager1 = build_manager(vm1_config, - out: PrefixedOutput.new('[vm1] ', $stdout, out_mutex), - wg_setup_post: vm1_wg_post) + out: PrefixedOutput.new('[vm1] ', $stdout, out_mutex), + wg_setup_post: vm1_wg_post) manager2 = build_manager(vm2_config, - out: PrefixedOutput.new('[vm2] ', $stdout, out_mutex), - wg_setup_pre: vm2_wg_pre) + out: PrefixedOutput.new('[vm2] ', $stdout, out_mutex), + wg_setup_pre: vm2_wg_pre) errors = {} create_opts = { replace: replace, dry_run: dry_run, @@ -3247,7 +3272,10 @@ module HyperstackVM rescue Error => e errors[:vm1] = e.message # Unblock VM2 even if VM1 failed so the process doesn't hang. - wg_mutex.synchronize { vm1_wg_state[:error] = e.message; wg_cv.broadcast } + wg_mutex.synchronize do + vm1_wg_state[:error] = e.message + wg_cv.broadcast + end end vm2_thread = Thread.new do @@ -3258,7 +3286,7 @@ module HyperstackVM [vm1_thread, vm2_thread].each(&:join) - errors.each { |vm, msg| $stderr.puts("ERROR [#{vm}]: #{msg}") } + errors.each { |vm, msg| warn("ERROR [#{vm}]: #{msg}") } exit 1 unless errors.empty? end @@ -3286,14 +3314,14 @@ module HyperstackVM begin local_manager = build_manager(loaders.first.config, out: local_wg_out) cleanup = local_manager.send(:cleanup_local_access, dry_run: dry_run, hostnames: hostnames, - allowed_ips: allowed_ips) + allowed_ips: allowed_ips) local_manager.send(:report_local_cleanup, local_wg_out, cleanup, dry_run: dry_run) rescue Error => e errors[:local_wireguard] = e.message end end - errors.each { |vm, msg| $stderr.puts("ERROR [#{vm}]: #{msg}") } + errors.each { |vm, msg| warn("ERROR [#{vm}]: #{msg}") } exit 1 unless errors.empty? end end @@ -1,11 +1,11 @@ # Single-VM setup (hyperstack-vm.toml → hyperstack.wg1) -abbr pi-hyperstack pi --model hyperstack/openai/gpt-oss-120b -abbr hyperstack-create ruby ~/git/hyperstack/hyperstack.rb create -abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete -abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test +abbr pi-hyperstack-gpt-oss-120b pi --model hyperstack/openai/gpt-oss-120b +abbr hyperstack-create ruby ~/git/hyperstack/hyperstack.rb create +abbr hyperstack-delete ruby ~/git/hyperstack/hyperstack.rb delete +abbr hyperstack-test ruby ~/git/hyperstack/hyperstack.rb test # Dual-VM setup (hyperstack-vm1/vm2.toml → hyperstack1/2.wg1) -abbr pi-hyperstack-nemotron pi --model hyperstack1/cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit -abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit -abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both -abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both +abbr pi-hyperstack-nemotron pi --model hyperstack1/openai/gpt-oss-120b +abbr pi-hyperstack-coder pi --model hyperstack2/bullpoint/Qwen3-Coder-Next-AWQ-4bit +abbr hyperstack-create-both ruby ~/git/hyperstack/hyperstack.rb create-both +abbr hyperstack-delete-both ruby ~/git/hyperstack/hyperstack.rb delete-both diff --git a/pi/agent/extensions/agent-plan-mode/utils.ts b/pi/agent/extensions/agent-plan-mode/utils.ts index 3f945e3..c7f66de 100644 --- a/pi/agent/extensions/agent-plan-mode/utils.ts +++ b/pi/agent/extensions/agent-plan-mode/utils.ts @@ -151,9 +151,13 @@ export function isSafeAskCommand(command: string): boolean { return !MUTATING_TASK_PATTERNS.some((pattern) => pattern.test(trimmed)); } +function isAskCommand(command: string): boolean { + return command.trim().startsWith("ask ") || command.trim() === "ask"; +} + export function isSafePlanCommand(command: string): boolean { if (containsRawTaskCommand(command)) return false; - if (isSafeAskCommand(command)) return true; + if (isAskCommand(command)) return true; const isDestructive = DESTRUCTIVE_PATTERNS.some((pattern) => pattern.test(command)); const isSafe = SAFE_PATTERNS.some((pattern) => pattern.test(command)); diff --git a/pi/agent/extensions/ask-mode/utils.ts b/pi/agent/extensions/ask-mode/utils.ts index db8c889..835549e 100644 --- a/pi/agent/extensions/ask-mode/utils.ts +++ b/pi/agent/extensions/ask-mode/utils.ts @@ -87,7 +87,32 @@ const SAFE_PATTERNS = [ /^\s*exa\b/, ]; +const MUTATING_ASK_PATTERNS = [ + /\badd\b/i, + /\bannotate\b/i, + /\bappend\b/i, + /\bdelete\b/i, + /\bdenotate\b/i, + /\bdone\b/i, + /\blog\b/i, + /\bmodify\b/i, + /\bprepend\b/i, + /\bstart\b/i, + /\bstop\b/i, + /\bundo\b/i, + /\bpriority\b/i, + /\btag\b/i, +]; + +function isReadOnlyAskCommand(command: string): boolean { + const trimmed = command.trim(); + if (!trimmed.startsWith("ask ") && trimmed !== "ask") return false; + if (/[;&]/.test(trimmed) || /(^|[^|])\|([^|]|$)/.test(trimmed)) return false; + return !MUTATING_ASK_PATTERNS.some((pattern) => pattern.test(trimmed)); +} + export function isSafeAskModeCommand(command: string): boolean { + if (isReadOnlyAskCommand(command)) return true; const isDestructive = DESTRUCTIVE_PATTERNS.some((pattern) => pattern.test(command)); const isSafe = SAFE_PATTERNS.some((pattern) => pattern.test(command)); return !isDestructive && isSafe; diff --git a/pi/agent/models.json b/pi/agent/models.json index 76e37ab..3dfd72d 100644 --- a/pi/agent/models.json +++ b/pi/agent/models.json @@ -1,7 +1,7 @@ { "providers": { "hyperstack": { - "baseUrl": "http://hyperstack.wg1:11434/v1", + "baseUrl": "http://hyperstack1.wg1:11434/v1", "apiKey": "EMPTY", "api": "openai-completions", "compat": { diff --git a/pi/agent/settings.json b/pi/agent/settings.json index fbb3874..d8ea9a3 100644 --- a/pi/agent/settings.json +++ b/pi/agent/settings.json @@ -1,5 +1,6 @@ { "lastChangelogVersion": "0.62.0", "defaultProvider": "openai", - "defaultModel": "gpt-4.1" + "defaultModel": "gpt-4.1", + "defaultThinkingLevel": "high" }
\ No newline at end of file |
