[auth]
api_key_file = "~/.hyperstack"

[hyperstack]
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Separate state file for VM2 so vm1 and vm2 can be managed independently.
file = ".hyperstack-vm2-state.json"

[vm]
name_prefix = "hyperstack2"
hostname = "hyperstack2"
environment_name = "snonux-ollama"
# A100-80GB is the cost-first default for qwen3-coder-next inference.
# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
flavor_name = "n3-A100x1"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
labels = ["qwen3-coder-next", "wireguard"]

[ssh]
username = "ubuntu"
private_key_path = "~/.ssh/id_rsa"
hyperstack_key_name = "earth"
port = 22
connect_timeout_sec = 10

[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# VM2 gets the third server-side WireGuard IP (skipping .2 which is the earth client).
# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
wireguard_server_ip = "192.168.3.3"
# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
allowed_ssh_cidrs = ["auto"]
allowed_wireguard_cidrs = ["auto"]
# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
ollama_port = 11434

[bootstrap]
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
models_dir = "/ephemeral/ollama/models"
listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = ["qwen3-coder-next"]

# vLLM serves one model via Docker on the OpenAI-compatible API.
# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
[vllm]
install = true
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <preset>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation.
# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model
# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]

# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
model = "openai/gpt-oss-20b"
container_name = "vllm_gpt_oss_20b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""

# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
# Hard architecture limit: max_position_embeddings=131072 in model config.json.
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""

# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
container_name = "vllm_qwen25_coder32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "hermes"

# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
[vllm.presets.qwen3-coder-30b]
model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
container_name = "vllm_qwen3_coder30b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
[vllm.presets.deepseek-r1-32b]
model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
container_name = "vllm_deepseek_r1_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
[vllm.presets.qwen3-32b]
model = "Qwen/Qwen3-32B-AWQ"
container_name = "vllm_qwen3_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
[vllm.presets.devstral]
model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
container_name = "vllm_devstral"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "mistral"
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]

[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"

[local_client]
check_wg1_service = true
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"