Created
May 1, 2026 17:42
-
-
Save craftogrammer/3eaacbcf32dbb07a20fed4a3b42992be to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
Launches llama-server.exe for a local GGUF model with a fixed, tuned flag set.

.PARAMETER Model
Optional explicit path to a .gguf model file. When empty, the script scans
~\models recursively and shows an interactive picker (or auto-selects the
only model found).

.PARAMETER Context
Context budget, passed to llama-server as -c. The default is described by the
author as the bench-validated maximum on a 16 GB card at ub=512; see the
config notes further down in this script.
NOTE(review): an earlier comment advised raising -NCpuMoe on OOM at deep
context, but this script declares no such parameter and passes no such flag.

.PARAMETER ReasoningBudget
Value passed to llama-server as --reasoning-budget.

.PARAMETER ReasoningOff
Reasoning is ON by default. Pass -ReasoningOff to disable it for one launch.
#>
param(
    [string]$Model = "",
    [int]$Context = 196608,
    [int]$ReasoningBudget = 8192,
    [switch]$ReasoningOff
)
# Treat every error as terminating for the rest of the script.
$ErrorActionPreference = "Stop"

# The server binary is expected under build\bin next to this script.
$llama = Join-Path -Path $PSScriptRoot -ChildPath "build\bin\llama-server.exe"
$serverMissing = -not (Test-Path -Path $llama)
if ($serverMissing) {
    Write-Host "[qwen-moe-turbo] llama-server.exe not built. Run: compile.ps1" -ForegroundColor Red
    exit 1
}
# ----------------------------------------------------------------------
# Model picker — scans ~\models recursively for .gguf files.
# Each subfolder may contain one main model + optional mmproj.gguf
# (vision projector). When a model is picked, the launcher auto-links
# any mmproj sibling found in the same folder via --mmproj.
#
# Only runs when -Model was not given. Sets $Model to the chosen file's
# full path, or exits with code 1 when nothing is found / Esc is pressed.
# ----------------------------------------------------------------------

# Fits one menu line into exactly $Width columns: pads short text (to erase
# leftovers from a previous, longer render) and truncates long text so a row
# never wraps onto a second console line.
function Format-PickerLine {
    param([string]$Text, [int]$Width)
    if ($Text.Length -gt $Width) { return $Text.Substring(0, $Width) }
    return $Text.PadRight($Width)
}

$mmproj = ""
if (-not $Model) {
    $modelsDir = Join-Path $env:USERPROFILE "models"
    if (-not (Test-Path -LiteralPath $modelsDir)) {
        Write-Host "[qwen-moe-turbo] models folder not found at $modelsDir - creating it" -ForegroundColor Yellow
        New-Item -ItemType Directory -Path $modelsDir -Force | Out-Null
    }

    # Recursive scan; treat any .gguf whose name contains "mmproj" as a projector,
    # not a selectable model. @() guarantees real arrays even for 0 or 1 matches.
    $allGgufs = @(Get-ChildItem -LiteralPath $modelsDir -Filter "*.gguf" -File -Recurse -ErrorAction SilentlyContinue)
    $ggufs = @($allGgufs | Where-Object { $_.Name -notmatch '(?i)mmproj' } | Sort-Object Length)

    if ($ggufs.Count -eq 0) {
        Write-Host "[qwen-moe-turbo] No model .gguf files found in $modelsDir (or its subfolders)" -ForegroundColor Red
        Write-Host "Drop .gguf files into that folder (or subfolders) and rerun." -ForegroundColor Yellow
        exit 1
    }

    if ($ggufs.Count -eq 1) {
        # Single match -> skip picker
        $Model = $ggufs[0].FullName
        Write-Host "[qwen-moe-turbo] Auto-selected only available model: $($ggufs[0].Name)" -ForegroundColor Cyan
    } else {
        # Index the directories that hold a projector once, instead of
        # rescanning every file for every row. Hashtable keys created with
        # @{} compare case-insensitively, like the -eq the lookup replaces.
        $visionDirs = @{}
        foreach ($file in $allGgufs) {
            if ($file.Name -match '(?i)mmproj') { $visionDirs[$file.DirectoryName] = $true }
        }

        # Pre-compute display rows; show folder name + filename + size
        $rows = @(foreach ($gguf in $ggufs) {
            $sizeGB = [math]::Round($gguf.Length / 1GB, 2)
            $folder = Split-Path $gguf.DirectoryName -Leaf
            $tag = if ($visionDirs.ContainsKey($gguf.DirectoryName)) { " [+vision]" } else { "" }
            [PSCustomObject]@{
                Line1 = ("{0,6:N2} GB {1}{2}" -f $sizeGB, $gguf.Name, $tag)
                Line2 = " in $folder/"
            }
        })

        Write-Host ""
        Write-Host "Available GGUFs in ${modelsDir}:" -ForegroundColor Cyan
        Write-Host ""

        $selected = 0
        $count = $rows.Count
        # Two lines per row, one blank line, one hint line.
        $menuHeight = (2 * $count) + 2
        $hint = " up/down navigate | Enter select | Esc cancel | digit jumps"
        $picked = $false

        [Console]::CursorVisible = $false
        $startTop = [Console]::CursorTop
        try {
            while (-not $picked) {
                [Console]::SetCursorPosition(0, $startTop)
                $width = [Math]::Max(1, [Console]::WindowWidth - 1)
                for ($i = 0; $i -lt $count; $i++) {
                    $isCurrent = ($i -eq $selected)
                    $marker = if ($isCurrent) { ">" } else { " " }
                    $col1 = if ($isCurrent) { "Yellow" } else { "White" }
                    $col2 = if ($isCurrent) { "DarkYellow" } else { "DarkGray" }
                    $line1 = "{0} {1,2} {2}" -f $marker, $i, $rows[$i].Line1
                    $line2 = " {0}" -f $rows[$i].Line2
                    Write-Host (Format-PickerLine $line1 $width) -ForegroundColor $col1
                    Write-Host (Format-PickerLine $line2 $width) -ForegroundColor $col2
                }
                Write-Host ""
                Write-Host (Format-PickerLine $hint $width) -ForegroundColor DarkGray

                # If drawing scrolled the buffer, the menu's first row moved up.
                # Re-derive it from where the cursor ended so the next redraw
                # overwrites the menu instead of stacking a copy below it.
                $startTop = [Math]::Max(0, [Console]::CursorTop - $menuHeight)

                $key = [Console]::ReadKey($true)
                switch ($key.Key) {
                    'UpArrow'   { if ($selected -gt 0) { $selected-- } }
                    'DownArrow' { if ($selected -lt $count - 1) { $selected++ } }
                    'Home'      { $selected = 0 }
                    'End'       { $selected = $count - 1 }
                    'Enter' {
                        $Model = $ggufs[$selected].FullName
                        Write-Host ""
                        # 'break' here would only leave the switch, so the
                        # loop is ended through its condition instead.
                        $picked = $true
                    }
                    'Escape' {
                        [Console]::CursorVisible = $true
                        Write-Host ""
                        Write-Host "[qwen-moe-turbo] cancelled" -ForegroundColor Yellow
                        exit 1
                    }
                    default {
                        # ASCII digits only: \d also matches other Unicode
                        # digits, which the [int] conversion cannot parse.
                        if ($key.KeyChar -match '^[0-9]$') {
                            $n = [int][string]$key.KeyChar
                            if ($n -lt $count) { $selected = $n }
                        }
                    }
                }
            }
        } finally {
            [Console]::CursorVisible = $true
        }
    }
}
# Validate the chosen model. -LiteralPath keeps characters such as [ ] in a
# file name from being interpreted as wildcards, and -PathType Leaf rejects a
# directory passed by mistake.
if (-not (Test-Path -LiteralPath $Model -PathType Leaf)) {
    Write-Host "[qwen-moe-turbo] model not found: $Model" -ForegroundColor Red
    exit 1
}

# Normalise to an absolute path. With a bare file name (e.g. -Model foo.gguf)
# the parent directory would otherwise be an empty string and the directory
# scan below would fail on parameter binding.
$modelItem = Get-Item -LiteralPath $Model -Force
$Model = $modelItem.FullName

# ----------------------------------------------------------------------
# Detect mmproj (vision projector) in the model's directory.
# Any .gguf with "mmproj" in the name (case-insensitive) qualifies; the
# model file itself is excluded. Candidates are sorted by name so the
# choice is deterministic when a folder holds more than one projector.
# ----------------------------------------------------------------------
$modelDir = $modelItem.DirectoryName
$mmprojCandidate = Get-ChildItem -LiteralPath $modelDir -Filter "*.gguf" -File -ErrorAction SilentlyContinue |
    Where-Object { $_.Name -match '(?i)mmproj' -and $_.FullName -ne $Model } |
    Sort-Object Name |
    Select-Object -First 1
if ($mmprojCandidate) {
    $mmproj = $mmprojCandidate.FullName
    Write-Host "[qwen-moe-turbo] vision mmproj detected: $($mmprojCandidate.Name)" -ForegroundColor Cyan
}
# ----------------------------------------------------------------------
# Reasoning ON by default; override with -ReasoningOff
# ----------------------------------------------------------------------
$reasoningEnabled = -not $ReasoningOff
$reasoningArg = if ($reasoningEnabled) { "on" } else { "off" }
$mode = if ($reasoningEnabled) { "reasoning ON" } else { "reasoning OFF" }

# Chat-template kwargs — keep simple, let llama.cpp's --reasoning flag
# handle the on/off semantics. Layering enable_thinking + budget=0 on
# top caused weirdness during testing; the --reasoning flag alone is
# what's bench-validated.
#
# Quoting: under legacy native-argument passing (Windows PowerShell 5.1,
# PowerShell <= 7.2, or $PSNativeCommandArgumentPassing = 'Legacy') embedded
# double quotes must be escaped by hand as \" to survive. PowerShell 7.3+
# escapes them itself for .exe targets; hand-escaping there would hand the
# server literal backslashes and therefore invalid JSON.
$chatKwargsJson = '{"preserve_thinking": true}'
$nativeArgMode = Get-Variable -Name PSNativeCommandArgumentPassing -ValueOnly -ErrorAction SilentlyContinue
$needsManualEscape = (-not $nativeArgMode) -or ("$nativeArgMode" -eq 'Legacy')
$chatKwargs = if ($needsManualEscape) { $chatKwargsJson.Replace('"', '\"') } else { $chatKwargsJson }

# Passed unchanged as --reasoning-budget.
$effectiveBudget = $ReasoningBudget

# ----------------------------------------------------------------------
# Sampler defaults — empirically validated config (fork qwen-turbo.ps1 path):
# Coding (reasoning ON): 11/11 EventBus, 12/12 TaskScheduler at these params
# Qwen3 model card suggested presence=0.0 for coding, but the fork's
# presence=1.5 outperformed that on real coding harnesses; trusting the
# empirical result over the card recommendation.
# ----------------------------------------------------------------------
if ($reasoningEnabled) {
    $temp = 0.6; $topP = 0.95; $topK = 20; $presencePenalty = 1.5
} else {
    $temp = 0.7; $topP = 0.80; $topK = 20; $presencePenalty = 1.5
}
# Both LLAMA_ARG_* variables are set to "1" in this process's environment
# before the server is started.
foreach ($flagName in 'LLAMA_ARG_KV_UNIFIED', 'LLAMA_ARG_CACHE_IDLE_SLOTS') {
    Set-Item -Path "Env:$flagName" -Value "1"
}

# Startup banner: model name (file name without extension) plus the key settings.
$modelName = [System.IO.Path]::GetFileNameWithoutExtension($Model)
$bannerSettings = " ctx=$Context (fit-ctx 65536), ub=512, ctk/v=q8_0, $mode"
Write-Host ""
Write-Host "[qwen-moe-turbo] $modelName" -ForegroundColor Cyan
Write-Host $bannerSettings -ForegroundColor Gray
Write-Host ""
# ============================================================================
# Bench-validated config for RTX 5080 16GB + Qwen3.6-35B-A3B (mainline b8967)
#
# Validated A/B (server print_timing eval time on bill-splitter prompt):
#   default mainline:  63.0 t/s (--fit-ctx 196608, ub=1024)
#   --fit-ctx 65536:   68.1 t/s (+8% — smaller compute graph)
#   + ub=512:          78.9 t/s (+25% over default)
#   + ctk/ctv q8_0:    pending (~1.9 GB KV freed; expected ~+10-15%)
#   + spec ngram-mod:  77.7 t/s (-1.6%; ngram cache kept resetting)
#
# Sampler params from Qwen3.6-A3B model card recommendations:
#   - thinking + precise coding: temp=0.6 top_p=0.95 top_k=20 presence=0.0
#   - non-thinking + general:    temp=0.7 top_p=0.80 top_k=20 presence=1.5
#   (This script deliberately uses presence=1.5 in both modes.)
#
# NOTE(review): this section previously listed the following knobs as
# "returned to mainline defaults" or "removed", but the launch command in
# this script still passes every one of them. Either drop the flags from the
# command or keep this list in sync:
#   --fit-target (note said: was 32, default 1024 — more allocator headroom;
#                 the script now picks 128-1024 MiB by model size)
#   --prio 2 --prio-batch 2      (note said: was 2, default 0)
#   --poll 100                   (note said: was 100, default 50)
#   --repeat-penalty 1.00        (note said: no-op math)
#   --min-p 0.0                  (note said: no-op default)
#   --reasoning-budget-message   (note said: default suffices)
#
# Kept non-default but useful:
#   --no-mmap (full RAM load; faster repeat starts)
#
# DO NOT add: --spec-type (neutral-to-negative on this model),
#   --cache-type-k turbo3_tcq (beaten by f16 on Blackwell).
# ============================================================================

# Optional vision-projector arguments. Start from a real empty array: assigning
# the result of an if-expression whose branch is @() does not yield an array.
$mmprojArgs = @()
if ($mmproj) {
    $mmprojArgs = @("--mmproj", $mmproj)
}

# Alias passed to the server via --alias: the model file name without extension.
$alias = [System.IO.Path]::GetFileNameWithoutExtension($Model)
# ----------------------------------------------------------------------
# Adaptive fit-target by model size (16 GB VRAM card).
# Bigger models leave less GPU headroom → need more reserve for runtime
# allocator spikes (CUDA caching alloc, KV growth past fit-ctx, graph
# captures). Tiers calibrated for Qwen3.6-A3B family on RTX 5080:
#   < 14 GB  (Mini): margin 128 MiB — plenty of GPU room, pack tight
#   14-18 GB (Compact/I-Compact): 256 MiB — current default, validated
#   18-22 GB (Quality/I-Quality): 512 MiB — needs more allocator buffer
#   >= 22 GB (Balanced/I-Balanced/Q4_K_XL): 1024 MiB — heavy host offload,
#            don't squeeze GPU
# NOTE(review): an earlier comment said "Override with -FitTarget N", but this
# script declares no -FitTarget parameter; to change the value, edit the tiers.
# ----------------------------------------------------------------------

# Maps a model file size in GB to the --fit-target margin in MiB.
# Each lower bound is inclusive: exactly 14 GB falls in the 256 MiB tier.
function Get-FitTargetMiB {
    param([double]$SizeGB)
    if ($SizeGB -lt 14) { return 128 }
    if ($SizeGB -lt 18) { return 256 }
    if ($SizeGB -lt 22) { return 512 }
    return 1024
}

# -LiteralPath: a model file name containing [ or ] must not be read as a wildcard.
$modelSizeGB = (Get-Item -LiteralPath $Model).Length / 1GB
$fitTarget = Get-FitTargetMiB $modelSizeGB
Write-Host "[qwen-moe-turbo] model $([math]::Round($modelSizeGB,2)) GB -> --fit-target $fitTarget MiB" -ForegroundColor Gray
# Build the llama-server command line as an array and splat it. This avoids
# backtick line continuations, which silently break when a backtick is
# followed by trailing whitespace. The flags and values are the same ones the
# backtick form passed.
$serverArgs = @(
    '-m', $Model
    # Vision projector arguments, only when one was detected.
    if ($mmprojArgs) { $mmprojArgs }
    '--alias', $alias
    # NOTE: 0.0.0.0 makes the server listen on all network interfaces.
    '--host', '0.0.0.0', '--port', '8080'
    '--fit', 'on', '--fit-target', $fitTarget, '--fit-ctx', '65536'
    '-c', $Context
    '--parallel', '1'
    '--flash-attn', 'on'
    '--batch-size', '2048', '--ubatch-size', '512'
    '--threads', '12', '--threads-batch', '12'
    '-ctk', 'q8_0', '-ctv', 'q8_0'
    '--cache-ram', '-1'
    '--checkpoint-every-n-tokens', '32768'
    '--prio', '2', '--prio-batch', '2'
    '--poll', '100'
    '--no-mmap'
    '--jinja'
    '--reasoning', $reasoningArg
    '--reasoning-budget', $effectiveBudget
    '--reasoning-budget-message', 'Time to wrap up. Let me give my answer.'
    '--presence-penalty', $presencePenalty
    '--repeat-penalty', '1.00'
    '--chat-template-kwargs', $chatKwargs
    '--temp', $temp, '--top-p', $topP, '--top-k', $topK, '--min-p', '0.0'
)

& $llama @serverArgs

# Hand the server's exit status back to whoever invoked this launcher.
exit $LASTEXITCODE
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment