# hyperstack-vm1.toml

[auth]
api_key_file = "~/.hyperstack"

[hyperstack]
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Separate state file for VM1 so VM1 and VM2 can be managed independently.
file = ".hyperstack-vm1-state.json"

[vm]
name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"

# H100-80GB x2: dual GPUs enable tensor-parallel inference for Nemotron-3-Super at 1M context.
# Two 80 GB GPUs = 160 GB total VRAM; ~68 GB weights leave ~84 GB for KV cache (enough for 1M tokens).
# Also eliminates the --enforce-eager workaround required on a single H100 (insufficient KV cache headroom).
flavor_name = "n3-H100x2"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
labels = ["nemotron-3-super", "wireguard"]
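
# Rough post-deploy sanity check of the dual-GPU budget (illustrative; <floating-ip>
# is a placeholder for the assigned floating IP):
#   ssh ubuntu@<floating-ip> 'nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv'
# With gpu_memory_utilization = 0.85 (see [vllm]), each 80 GB H100 should show
# roughly 0.85 * 80 = 68 GB in use once the model is loaded.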

[ssh]
username = "ubuntu"
private_key_path = "~/.ssh/id_rsa"
hyperstack_key_name = "earth"
port = 22
connect_timeout_sec = 10

[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# VM1 gets the first server-side WireGuard IP in the subnet (offset 0 from the gateway address).
# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
wireguard_server_ip = "192.168.3.1"
# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
allowed_ssh_cidrs = ["auto"]
allowed_wireguard_cidrs = ["auto"]
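# Example explicit override (illustrative addresses from the documentation ranges):
#   allowed_ssh_cidrs = ["203.0.113.7/32", "198.51.100.0/24"]
#   allowed_wireguard_cidrs = ["203.0.113.7/32"]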
# Port 11434 is shared by both Ollama and vLLM for firewall compatibility.
ollama_port = 11434
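# Quick connectivity check once WireGuard is up (a sketch; assumes the serving
# container publishes the OpenAI-compatible API on this port and the wg1 tunnel is up):
#   curl http://192.168.3.1:11434/v1/models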

[bootstrap]
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
models_dir = "/ephemeral/ollama/models"
listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = ["nemotron-3-super"]

# vLLM runs in Docker and serves a single model over the OpenAI-compatible API.
# VM1 defaults to nemotron-3-super with extended context via tensor parallelism across both H100s.
[vllm]
install = true
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_nemotron_super"
# 1M context requested; VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 overrides the config.json limit of 262144.
# NemotronH is a hybrid Mamba+attention MoE: Mamba layers are positionless (unlimited context),
# attention layers use short local windows — so exceeding max_position_embeddings is safe here.
max_model_len = 1048576
# 0.85 leaves ~12 GiB free per GPU for Mamba state cache + CUDA graphs + sampler warmup.
# 0.92+ OOMs during sampler warmup: prefix caching triggers Mamba "all" mode (pre-allocated states)
# which consumes the remaining headroom before the dummy sampler pass can allocate.
gpu_memory_utilization = 0.85
tensor_parallel_size = 2
# NVIDIA Nemotron-3-Super emits the same XML tool-call format as Qwen3, so the qwen3_xml parser applies.
tool_call_parser = "qwen3_xml"
trust_remote_code = true
# Disable prefix caching: on NemotronH it forces Mamba into "all" cache mode (pre-allocated states
# for all max_num_seqs), which exhausts VRAM before the sampler warmup. Without prefix caching,
# Mamba uses per-request state allocation, which is cheaper at startup.
enable_prefix_caching = false
# VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 overrides the config.json max_position_embeddings=262144 limit.
# PYTORCH_ALLOC_CONF=expandable_segments:True reduces fragmentation in large allocations.
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
# No --enforce-eager: dual-GPU VRAM headroom supports CUDA graph capture alongside the KV cache.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]
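
# For reference, the settings above correspond roughly to a docker invocation like the
# one below. This is a sketch only: hyperstack.rb generates the real command, and the
# image tag and the 11434:8000 port mapping are assumptions.
#   docker run -d --name vllm_nemotron_super --gpus all \
#     -v /ephemeral/hug:/root/.cache/huggingface \
#     -p 11434:8000 \
#     -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
#     -e PYTORCH_ALLOC_CONF=expandable_segments:True \
#     vllm/vllm-openai:latest \
#     --model cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit \
#     --max-model-len 1048576 --gpu-memory-utilization 0.85 --tensor-parallel-size 2 \
#     --tool-call-parser qwen3_xml --trust-remote-code --no-enable-prefix-caching \
#     --reasoning-parser nemotron_v3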

# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <name>'.
# Each preset overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
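# Example: 'model switch qwen3-coder-next' applies only the fields that preset sets
# (model, container_name, max_model_len, gpu_memory_utilization, tensor_parallel_size,
# tool_call_parser); everything else, such as hug_cache_dir, keeps its [vllm] value.
#   ruby hyperstack.rb --config hyperstack-vm1.toml model switch qwen3-coder-next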

[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
# ~68 GB weights split across 2x H100 PCIe 80GB via tensor parallelism (~34 GB per GPU).
# max_position_embeddings=262144 is the model's architectural limit; CUDA graphs work without --enforce-eager.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
max_model_len = 1048576
gpu_memory_utilization = 0.85
tensor_parallel_size = 2
tool_call_parser = "qwen3_xml"
trust_remote_code = true
enable_prefix_caching = false
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]

# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
model = "openai/gpt-oss-20b"
container_name = "vllm_gpt_oss_20b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""

# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
# Hard architecture limit: max_position_embeddings=131072 in model config.json.
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]

# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
container_name = "vllm_qwen25_coder32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "hermes"

# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
[vllm.presets.qwen3-coder-30b]
model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
container_name = "vllm_qwen3_coder30b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
[vllm.presets.deepseek-r1-32b]
model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
container_name = "vllm_deepseek_r1_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
[vllm.presets.qwen3-32b]
model = "Qwen/Qwen3-32B-AWQ"
container_name = "vllm_qwen3_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
[vllm.presets.devstral]
model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
container_name = "vllm_devstral"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "mistral"
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]

[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"

[local_client]
check_wg1_service = true
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"
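
# The [network] addressing above implies a client-side wg1.conf roughly like this
# (a minimal sketch; keys and the endpoint address are placeholders, and the real
# file is produced by wg1-setup.sh):
#   [Interface]
#   Address = 192.168.3.2/24
#   PrivateKey = <earth-private-key>
#
#   [Peer]
#   PublicKey = <vm1-public-key>
#   Endpoint = <vm1-floating-ip>:56710
#   AllowedIPs = 192.168.3.1/32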