[auth]
api_key_file = "~/.hyperstack"

[hyperstack]
base_url = "https://infrahub-api.nexgencloud.com/v1"

[state]
# Separate state file for VM1 so VM1 and VM2 can be managed independently.
file = ".hyperstack-vm1-state.json"

[vm]
name_prefix = "hyperstack1"
hostname = "hyperstack1"
environment_name = "snonux-ollama"
# H100-80GB x2: dual GPU enables tensor-parallel inference for Nemotron-3-Super at 1M context.
# Two 80 GB GPUs = 160 GB total VRAM; ~68 GB weights leave ~84 GB for KV cache (enough for 1M tokens).
# Also eliminates the --enforce-eager workaround required on a single H100 (insufficient KV cache headroom).
flavor_name = "n3-H100x2"
image_name = "Ubuntu Server 24.04 LTS R570 CUDA 12.8 with Docker"
assign_floating_ip = true
create_bootable_volume = false
enable_port_randomization = false
labels = ["nemotron-3-super", "wireguard"]

[ssh]
username = "ubuntu"
private_key_path = "~/.ssh/id_rsa"
hyperstack_key_name = "earth"
port = 22
connect_timeout_sec = 10

[network]
wireguard_udp_port = 56710
wireguard_subnet = "192.168.3.0/24"
# VM1 gets the first server-side WireGuard IP (gateway address + 0).
# earth (client) is 192.168.3.2; VM1 is 192.168.3.1; VM2 is 192.168.3.3.
wireguard_server_ip = "192.168.3.1"
# Secure default: "auto" resolves your current public egress IP to /32 at runtime.
# Override with explicit CIDRs if you deploy from multiple networks or want broader access.
allowed_ssh_cidrs = ["auto"]
allowed_wireguard_cidrs = ["auto"]
# Both Ollama and vLLM listen on port 11434, so the same firewall rule covers either backend.
ollama_port = 11434

[bootstrap]
enable_guest_bootstrap = true
install_wireguard = true
configure_ufw = true
configure_ollama_host = false

[ollama]
# Disabled in favour of vLLM; set install = true to switch back to Ollama.
install = false
models_dir = "/ephemeral/ollama/models"
listen_host = "0.0.0.0:11434"
gpu_overhead_mb = 2000
num_parallel = 1
context_length = 32768
pull_models = ["nemotron-3-super"]

# vLLM serves one model via Docker, exposing the OpenAI-compatible API.
# VM1 defaults to nemotron-3-super with extended context via tensor parallelism across both H100s.
[vllm]
install = true
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
# HuggingFace model cache on ephemeral NVMe (fast; survives reboots on most providers).
hug_cache_dir = "/ephemeral/hug"
container_name = "vllm_nemotron_super"
# 1M context requested; VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 overrides the config.json limit of 262144.
# NemotronH is a hybrid Mamba+attention MoE: Mamba layers are positionless (unlimited context),
# attention layers use short local windows — so exceeding max_position_embeddings is safe here.
max_model_len = 1048576
# 0.85 leaves ~12 GiB free per GPU for Mamba state cache + CUDA graphs + sampler warmup.
# 0.92+ OOMs during sampler warmup: prefix caching triggers Mamba "all" mode (pre-allocated states)
# which consumes the remaining headroom before the dummy sampler pass can allocate.
gpu_memory_utilization = 0.85
tensor_parallel_size = 2
# NVIDIA Nemotron-3-Super uses the same XML tool call format as Qwen3 XML.
tool_call_parser = "qwen3_xml"
trust_remote_code = true
# Disable prefix caching: on NemotronH it forces Mamba into "all" cache mode (pre-allocated states
# for all max_num_seqs), which exhausts VRAM before the sampler warmup. Without prefix caching,
# Mamba uses per-request state allocation, which is cheaper at startup.
enable_prefix_caching = false
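
# Rough per-VM VRAM budget implied by the values above (a sketch, not measured figures; exact
# numbers vary by vLLM version and CUDA graph overhead):
#   2 x 80 GB H100                          ~= 160 GB total
#   gpu_memory_utilization = 0.85           ~= 136 GB handed to vLLM (~12 GiB/GPU kept free for graphs + warmup)
#   ~68 GB AWQ weights at TP=2 (~34 GB/GPU) ->  ~68 GB remaining for KV cache + Mamba state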
# VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 overrides the config.json max_position_embeddings=262144 limit.
# PYTORCH_ALLOC_CONF=expandable_segments:True reduces fragmentation in large allocations.
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
# No --enforce-eager: dual-GPU VRAM headroom supports CUDA graph capture alongside the KV cache.
extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]

# Named model presets for 'ruby hyperstack.rb --config hyperstack-vm1.toml model switch <preset>'.
# Each preset key overrides the matching [vllm] field; unset fields fall back to [vllm] defaults.
# Example invocations follow the preset list below.
[vllm.presets.qwen3-coder-next]
model = "bullpoint/Qwen3-Coder-Next-AWQ-4bit"
container_name = "vllm_qwen3"
max_model_len = 262144
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# NVIDIA Nemotron-3-Super-120B-A12B AWQ 4-bit — hybrid Mamba+MoE (12B active / 120B total).
# ~68 GB weights split across 2x H100 PCIe 80GB via tensor parallelism (~34 GB per GPU).
# max_position_embeddings=262144 is the model's architectural limit; CUDA graphs work without --enforce-eager.
# Requires trust_remote_code=true for the nemotron_h architecture.
[vllm.presets.nemotron-super]
model = "cyankiwi/NVIDIA-Nemotron-3-Super-120B-A12B-AWQ-4bit"
container_name = "vllm_nemotron_super"
max_model_len = 1048576
gpu_memory_utilization = 0.85
tensor_parallel_size = 2
tool_call_parser = "qwen3_xml"
trust_remote_code = true
enable_prefix_caching = false
extra_docker_env = ["VLLM_ALLOW_LONG_MAX_MODEL_LEN=1", "PYTORCH_ALLOC_CONF=expandable_segments:True"]
extra_vllm_args = ["--reasoning-parser", "nemotron_v3"]

# OpenAI GPT-OSS 20B — ultra-fast MoE (3.6B active / 20B total, MXFP4), ~14 GB on A100.
[vllm.presets.gpt-oss-20b]
model = "openai/gpt-oss-20b"
container_name = "vllm_gpt_oss_20b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""

# OpenAI GPT-OSS 120B — powerful MoE (5.1B active / 117B total, MXFP4), ~65 GB on A100.
# Hard architecture limit: max_position_embeddings=131072 in model config.json.
[vllm.presets.gpt-oss-120b]
model = "openai/gpt-oss-120b"
container_name = "vllm_gpt_oss_120b"
max_model_len = 131072
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "openai_gptoss"]

# Qwen2.5-Coder-32B-Instruct AWQ — best-in-class open coding model at 32B, ~18 GB on A100.
[vllm.presets.qwen25-coder-32b]
model = "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"
container_name = "vllm_qwen25_coder32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "hermes"

# Qwen3-Coder-30B-A3B AWQ — Qwen3 generation coding MoE (3B active / 30B total), ~18 GB.
[vllm.presets.qwen3-coder-30b]
model = "QuantTrio/Qwen3-Coder-30B-A3B-Instruct-AWQ"
container_name = "vllm_qwen3_coder30b"
max_model_len = 65536
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "qwen3_coder"

# DeepSeek-R1-Distill-Qwen-32B AWQ — R1 reasoning distillation of Qwen 32B, ~18 GB on A100.
[vllm.presets.deepseek-r1-32b]
model = "casperhansen/deepseek-r1-distill-qwen-32b-awq"
container_name = "vllm_deepseek_r1_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Qwen3-32B AWQ — dense 32B reasoning model with extended context, ~18 GB on A100.
[vllm.presets.qwen3-32b]
model = "Qwen/Qwen3-32B-AWQ"
container_name = "vllm_qwen3_32b"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = ""
extra_vllm_args = ["--reasoning-parser", "deepseek_r1"]

# Devstral-Small-2507 AWQ — Mistral's coding agent model (~15 GB on A100).
[vllm.presets.devstral]
model = "cyankiwi/Devstral-Small-2507-AWQ-4bit"
container_name = "vllm_devstral"
max_model_len = 32768
gpu_memory_utilization = 0.92
tensor_parallel_size = 1
tool_call_parser = "mistral"
extra_vllm_args = ["--tokenizer_mode", "mistral", "--config_format", "mistral"]
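
# Example preset switches. The 'model switch' form comes from the comment at the top of the
# [vllm.presets.*] list; passing the bare preset name as the argument is an assumption here,
# so verify the exact syntax against hyperstack.rb itself:
#   ruby hyperstack.rb --config hyperstack-vm1.toml model switch gpt-oss-120b
#   ruby hyperstack.rb --config hyperstack-vm1.toml model switch nemotron-super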

[wireguard]
auto_setup = true
setup_script = "./wg1-setup.sh"

[local_client]
check_wg1_service = true
interface_name = "wg1"
config_path = "/etc/wireguard/wg1.conf"
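
# For reference, the client-side wg1.conf on earth is expected to look roughly like the sketch
# below, derived from the [network] values above. Keys and the endpoint address are placeholders,
# and the actual peer exchange is presumably handled by wg1-setup.sh; treat this as illustrative only.
#
#   [Interface]
#   Address    = 192.168.3.2/24
#   PrivateKey = <earth-private-key>
#
#   [Peer]
#   PublicKey  = <vm1-public-key>
#   Endpoint   = <vm1-floating-ip>:56710
#   AllowedIPs = 192.168.3.0/24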