# hyperstack-vm2.toml — configuration for VM2, kept separate from VM1 (see [state]).

[auth]
# File the API key is read from (going by the key name). TOML does not expand "~",
# so the consuming tool presumably resolves it to the home directory — TODO confirm.
api_key_file = "~/.hyperstack"

[hyperstack]
# API root URL, pinned to the v1 path and written without a trailing slash.
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Separate state file for VM2 so vm1 and vm2 can be managed independently.
# Relative path: presumably resolved against the directory the tool runs in — TODO confirm.
file = ".hyperstack-vm2-state.json"

[vm]
# Name prefix and hostname both carry the "2" suffix that marks this machine as VM2.
name_prefix = "hyperstack2"
hostname = "hyperstack2"
environment_name = "snonux-ollama"

# A100-80GB is the cost-first default for qwen3-coder-next inference.
# Switch this to n3-H100x1 if you want safer throughput and compatibility headroom.
flavor_name = "n3-A100x1"
# Per its name, this image ships Ubuntu Server 24.04 with CUDA 12.8 and Docker preinstalled.
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
# Labels name the default model and the WireGuard access path configured in this file.
labels = ["qwen3-coder-next", "wireguard"]

[ssh]
username = "ubuntu"
# TOML does not expand "~"; the consuming tool presumably does — TODO confirm.
private_key_path = "~/.ssh/id_rsa"
# Presumably the name of the key pair registered with Hyperstack that matches
# private_key_path — TODO confirm.
hyperstack_key_name = "earth"
port = 22
# Timeout in seconds, per the key's _sec suffix.
connect_timeout_sec = 10

[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# VM2 is the second WireGuard server and takes .3, because .2 is already used by
# the earth client.
# Address plan: VM1 = 192.168.3.1, earth (client) = 192.168.3.2, VM2 = 192.168.3.3.
wireguard_server_ip = "192.168.3.3"
# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
allowed_ssh_cidrs = ["auto"]
allowed_wireguard_cidrs = ["auto"]
# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
# The same port number appears in [ollama] listen_host; keep the two in sync.
ollama_port = 11434

[bootstrap]
# Switches for the guest-side bootstrap steps; each flag name states the step it gates.
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
# Presumably off because [ollama] install = false in this file — TODO confirm.
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
models_dir = "/ephemeral/ollama/models"
# Despite the key name, the value carries the port too; 11434 matches [network] ollama_port.
listen_host = "0.0.0.0:11434"
# Megabytes, per the key's _mb suffix.
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = ["qwen3-coder-next"]

# vLLM serves one model via Docker on the OpenAI-compatible API.
# VM2 defaults to qwen3-coder-next; use 'model switch' to load any other preset.
# The model settings below are repeated verbatim in [vllm.presets.qwen3-coder-next].
[vllm]
install = true
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_qwen3"
# 262144 = 256 × 1024.
max_model_len = 262144
# Same 0.92 value is used by every preset in this file.
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm2.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.

# Default model for VM2. Every value here equals the corresponding [vllm] key;
# keep the two tables in sync when changing either.
[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
# ~60 GB weights on A100 80GB; ~13 GB remaining for KV cache at 0.92 utilisation
# (0.92 × 80 GB ≈ 73.6 GB budget, minus the weights).
# Uses NoPE so any context length is valid; capped at 131072 to keep KV cache within VRAM budget.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
# 131072 = 128 × 1024.
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_xml"
trust_remote_code = true
# --enforce-eager disables CUDA graph capture, freeing ~3-4 GB of VRAM the model
# otherwise needs alongside the ~60 GB weights. Trades some throughput for stability.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3", "--enforce-eager"]

# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
# NOTE(review): unlike the gpt-oss-120b preset, this one sets no extra_vllm_args, so no
# --reasoning-parser is passed — confirm that the difference is intentional.
[vllm.presets.gpt-oss-20b]
model = "openai/gpt-oss-20b"
container_name = "vllm_gpt_oss_20b"
# 65536 = 64 × 1024.
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty string: presumably means "pass no tool-call parser" — verify in hyperstack.rb.
tool_call_parser = ""

# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
# Hard architecture limit: max_position_embeddings=131072 in model config.json.
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
# Set to the architecture limit named above; do not raise it.
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty string: presumably means "pass no tool-call parser" — verify in hyperstack.rb.
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]

# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
container_name = "vllm_qwen25_coder32b"
# 32768 = 32 × 1024.
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# The only preset in this file that uses the "hermes" tool-call parser.
tool_call_parser = "hermes"

# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
[vllm.presets.qwen3-coder-30b]
model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
container_name = "vllm_qwen3_coder30b"
# 65536 = 64 × 1024.
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Same tool-call parser as the default qwen3-coder-next preset.
tool_call_parser = "qwen3_coder"

# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
[vllm.presets.deepseek-r1-32b]
model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
container_name = "vllm_deepseek_r1_32b"
# 32768 = 32 × 1024.
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
# Empty string: presumably means "pass no tool-call parser" — verify in hyperstack.rb.
tool_call_parser = ""
# Reasoning parser named after the DeepSeek-R1 family this model is distilled from.
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Qwen3-32B AWQ — dense 32B hybrid-thinking model, ~18 GB on A100.
# max_model_len matches the model's native 32K window; longer contexts need YaRN rope scaling.
[vllm.presets.qwen3-32b]
model = "Qwen/Qwen3-32B-AWQ"
container_name = "vllm_qwen3_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
# Use vLLM's dedicated Qwen3 reasoning parser. The deepseek_r1 parser treats output
# without a closing </think> tag as reasoning, so replies generated with thinking
# disabled (enable_thinking=false) would come back with empty content.
extra_vllm_args = ["--reasoning-parser", "qwen3"]

# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
[vllm.presets.devstral]
model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
container_name = "vllm_devstral"
# 32768 = 32 × 1024.
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "mistral"
# Selects the Mistral tokenizer mode and Mistral config format, matching the
# "mistral" tool-call parser above.
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]

[wireguard]
auto_setup = true
# Relative path; the "wg1" in the script name matches [local_client] interface_name.
# Presumably run when auto_setup is true — TODO confirm.
setup_script = "./wg1-setup.sh"

[local_client]
check_wg1_service = true
# Interface name and config file name must agree: wg1 <-> wg1.conf.
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"