feat: GPU bakeoff — 3090 Ti vs V100 vs Strix Halo

Cross-host Gemma 4 throughput comparison across three architectures.
Harness at scripts/gpu-bakeoff/; writeup at
docs/reference/gpu-bakeoff-2026-04-20.md.

Key findings:
- RTX 3090 Ti wins decode decisively (128 tok/s on gemma4:26b MoE Q4,
  ~4.7× faster than gemma4:31b dense on the same card).
- AMD Strix Halo iGPU lands at ~42% of 3090 Ti decode on ~25% of the
  memory bandwidth (~1.7× the decode throughput per unit of bandwidth)
  — good SIMD utilization, especially for MoE.
- V100 numbers are DEGRADED: SDXL in CT 167 (ai-visualizer) occupies
  31 of the V100's 32 GB of VRAM, forcing the Gemma 4 models 95% onto
  CPU. An isolated V100 run requires evicting SDXL — left as follow-up.
- MoE vs dense is the dominant decode-throughput factor across all
  GPUs: gemma4:26b (~4 B active params) beats gemma4:31b (31.3 B
  active) by a similar ratio on every card tested (~4.7× on the
  3090 Ti, ~5.0–5.4× on the degraded V100).

Methodology: 1 warmup + 3 measurement runs per (host × model ×
prompt-length), Ollama's canonical timing fields, temp=0 greedy,
num_predict=256. All three Ollama servers accessed via HTTP (Strix
via Tailscale).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mortdecai
2026-04-20 05:45:26 -04:00
parent df5542f7d6
commit b6190357ba
20 changed files with 1483 additions and 0 deletions
@@ -0,0 +1,81 @@
{
"host": "pve197",
"gpu": "Tesla V100-PCIE-32GB",
"vram_gb": 32,
"model_alias": "gemma4:26b",
"model_tag": "gemma4:26b",
"prompt_key": "long",
"prompt_chars": 1614,
"num_predict": 256,
"num_ctx": 4096,
"runs": [
{
"prompt_tokens": 318,
"prompt_eval_ms": 118.0,
"prefill_tok_per_s": 2695.59,
"output_tokens": 256,
"eval_ms": 32720.5,
"decode_tok_per_s": 7.82,
"load_ms": 475.8,
"total_ms": 33548.1,
"harness_wall_s": 33.555,
"done_reason": "length"
},
{
"prompt_tokens": 318,
"prompt_eval_ms": 118.3,
"prefill_tok_per_s": 2689.01,
"output_tokens": 256,
"eval_ms": 31273.0,
"decode_tok_per_s": 8.19,
"load_ms": 492.5,
"total_ms": 32116.6,
"harness_wall_s": 32.123,
"done_reason": "length"
},
{
"prompt_tokens": 318,
"prompt_eval_ms": 117.3,
"prefill_tok_per_s": 2711.41,
"output_tokens": 256,
"eval_ms": 33434.9,
"decode_tok_per_s": 7.66,
"load_ms": 496.0,
"total_ms": 34298.7,
"harness_wall_s": 34.305,
"done_reason": "length"
}
],
"warmup": {
"prompt_tokens": 318,
"prompt_eval_ms": 3562.7,
"prefill_tok_per_s": 89.26,
"output_tokens": 256,
"eval_ms": 32215.7,
"decode_tok_per_s": 7.95,
"load_ms": 491.7,
"total_ms": 36521.3,
"harness_wall_s": 36.529,
"done_reason": "length"
},
"summary": {
"prefill_tok_per_s": {
"min": 2689.01,
"median": 2695.59,
"max": 2711.41,
"n": 3
},
"decode_tok_per_s": {
"min": 7.66,
"median": 7.82,
"max": 8.19,
"n": 3
},
"total_ms": {
"min": 32116.6,
"median": 33548.1,
"max": 34298.7,
"n": 3
}
}
}
@@ -0,0 +1,81 @@
{
"host": "pve197",
"gpu": "Tesla V100-PCIE-32GB",
"vram_gb": 32,
"model_alias": "gemma4:26b",
"model_tag": "gemma4:26b",
"prompt_key": "short",
"prompt_chars": 78,
"num_predict": 256,
"num_ctx": 4096,
"runs": [
{
"prompt_tokens": 27,
"prompt_eval_ms": 112.5,
"prefill_tok_per_s": 240.05,
"output_tokens": 256,
"eval_ms": 30919.5,
"decode_tok_per_s": 8.28,
"load_ms": 531.1,
"total_ms": 31828.4,
"harness_wall_s": 31.832,
"done_reason": "length"
},
{
"prompt_tokens": 27,
"prompt_eval_ms": 113.6,
"prefill_tok_per_s": 237.6,
"output_tokens": 256,
"eval_ms": 30399.9,
"decode_tok_per_s": 8.42,
"load_ms": 479.4,
"total_ms": 31242.1,
"harness_wall_s": 31.246,
"done_reason": "length"
},
{
"prompt_tokens": 27,
"prompt_eval_ms": 111.0,
"prefill_tok_per_s": 243.16,
"output_tokens": 256,
"eval_ms": 30712.9,
"decode_tok_per_s": 8.34,
"load_ms": 483.2,
"total_ms": 31552.8,
"harness_wall_s": 31.557,
"done_reason": "length"
}
],
"warmup": {
"prompt_tokens": 27,
"prompt_eval_ms": 843.7,
"prefill_tok_per_s": 32.0,
"output_tokens": 256,
"eval_ms": 30499.4,
"decode_tok_per_s": 8.39,
"load_ms": 5877.7,
"total_ms": 37664.4,
"harness_wall_s": 37.668,
"done_reason": "length"
},
"summary": {
"prefill_tok_per_s": {
"min": 237.6,
"median": 240.05,
"max": 243.16,
"n": 3
},
"decode_tok_per_s": {
"min": 8.28,
"median": 8.34,
"max": 8.42,
"n": 3
},
"total_ms": {
"min": 31242.1,
"median": 31552.8,
"max": 31828.4,
"n": 3
}
}
}
@@ -0,0 +1,81 @@
{
"host": "pve197",
"gpu": "Tesla V100-PCIE-32GB",
"vram_gb": 32,
"model_alias": "gemma4:31b",
"model_tag": "gemma4:31b-it-q4_K_M",
"prompt_key": "long",
"prompt_chars": 1614,
"num_predict": 256,
"num_ctx": 4096,
"runs": [
{
"prompt_tokens": 318,
"prompt_eval_ms": 728.7,
"prefill_tok_per_s": 436.37,
"output_tokens": 256,
"eval_ms": 163511.0,
"decode_tok_per_s": 1.57,
"load_ms": 495.0,
"total_ms": 164970.4,
"harness_wall_s": 164.977,
"done_reason": "length"
},
{
"prompt_tokens": 318,
"prompt_eval_ms": 682.8,
"prefill_tok_per_s": 465.71,
"output_tokens": 256,
"eval_ms": 168727.1,
"decode_tok_per_s": 1.52,
"load_ms": 545.3,
"total_ms": 170207.4,
"harness_wall_s": 170.214,
"done_reason": "length"
},
{
"prompt_tokens": 318,
"prompt_eval_ms": 950.0,
"prefill_tok_per_s": 334.75,
"output_tokens": 256,
"eval_ms": 163102.9,
"decode_tok_per_s": 1.57,
"load_ms": 507.9,
"total_ms": 164801.8,
"harness_wall_s": 164.809,
"done_reason": "length"
}
],
"warmup": {
"prompt_tokens": 318,
"prompt_eval_ms": 3883.3,
"prefill_tok_per_s": 81.89,
"output_tokens": 256,
"eval_ms": 172199.4,
"decode_tok_per_s": 1.49,
"load_ms": 528.0,
"total_ms": 176864.8,
"harness_wall_s": 176.871,
"done_reason": "length"
},
"summary": {
"prefill_tok_per_s": {
"min": 334.75,
"median": 436.37,
"max": 465.71,
"n": 3
},
"decode_tok_per_s": {
"min": 1.52,
"median": 1.57,
"max": 1.57,
"n": 3
},
"total_ms": {
"min": 164801.8,
"median": 164970.4,
"max": 170207.4,
"n": 3
}
}
}
@@ -0,0 +1,81 @@
{
"host": "pve197",
"gpu": "Tesla V100-PCIE-32GB",
"vram_gb": 32,
"model_alias": "gemma4:31b",
"model_tag": "gemma4:31b-it-q4_K_M",
"prompt_key": "short",
"prompt_chars": 78,
"num_predict": 256,
"num_ctx": 4096,
"runs": [
{
"prompt_tokens": 27,
"prompt_eval_ms": 665.6,
"prefill_tok_per_s": 40.56,
"output_tokens": 256,
"eval_ms": 164631.1,
"decode_tok_per_s": 1.55,
"load_ms": 512.6,
"total_ms": 166062.7,
"harness_wall_s": 166.067,
"done_reason": "length"
},
{
"prompt_tokens": 27,
"prompt_eval_ms": 660.3,
"prefill_tok_per_s": 40.89,
"output_tokens": 256,
"eval_ms": 159594.3,
"decode_tok_per_s": 1.6,
"load_ms": 523.6,
"total_ms": 161012.3,
"harness_wall_s": 161.016,
"done_reason": "length"
},
{
"prompt_tokens": 27,
"prompt_eval_ms": 887.8,
"prefill_tok_per_s": 30.41,
"output_tokens": 256,
"eval_ms": 167584.3,
"decode_tok_per_s": 1.53,
"load_ms": 486.8,
"total_ms": 169188.9,
"harness_wall_s": 169.194,
"done_reason": "length"
}
],
"warmup": {
"prompt_tokens": 27,
"prompt_eval_ms": 6642.4,
"prefill_tok_per_s": 4.06,
"output_tokens": 256,
"eval_ms": 173530.1,
"decode_tok_per_s": 1.48,
"load_ms": 20142.1,
"total_ms": 200836.5,
"harness_wall_s": 200.841,
"done_reason": "length"
},
"summary": {
"prefill_tok_per_s": {
"min": 30.41,
"median": 40.56,
"max": 40.89,
"n": 3
},
"decode_tok_per_s": {
"min": 1.53,
"median": 1.55,
"max": 1.6,
"n": 3
},
"total_ms": {
"min": 161012.3,
"median": 166062.7,
"max": 169188.9,
"n": 3
}
}
}