{
"scraped_at": "2026-04-27T14:48:37Z",
"presets": {
"RTX 5090 (32 GB)": {
"rows": [
{
"id": "cmofapunx0008l20407bq5ln9",
"modelRevision": "main",
"promptTokens": 34,
"outputTokens": 1499,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": 54,
"tokSOut": 190,
"tokSTotal": null,
"peakVramGb": 30,
"createdAt": "2026-04-26T04:56:21.117Z",
"notes": "TurboQuant2 fork (TheTom/llama-cpp-turboquant). cache-type-k q8_0, cache-type-v q8_0, flash-attn on, ctx 262144, thinking enabled (Qwen3 reasoning mode), mlock, no-mmap, prio 2",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8965",
"quantization": "UD-Q4_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmoevbq6o000njo04gjks2ldt",
"name": "Hath",
"username": "claudehath",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofps6zq0010l7040imyvvtx",
"modelRevision": "main",
"promptTokens": 21,
"outputTokens": 1000,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": 108,
"tokSOut": 97.1,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T11:58:04.647Z",
"notes": "Abliterated model with MTP (3 speculative tokens), FP8 KV cache. Decode measured with 1000-token output (MTP steady-state). TTFT from warm prefix cache. GPU running at 600W max TDP.",
"model": {
"hfId": "sakamakismile/Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP",
"displayName": "Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "huihui-ai/Huihui-Qwen3.6-27B-abliterated",
"displayName": "Huihui-Qwen3.6-27B-abliterated"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "v0.19.2rc1.dev134+gfe9c3d6c5",
"quantization": "NVFP4",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve --kv-cache-dtype fp8 --max-model-len 131072 --max-num-batched-tokens 4096 --enable-prefix-caching --max-num-seqs 4 --gpu-memory-utilization 0.90 --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}' --reasoning-parser qwen3 --tool-call-parser qwen3_coder --trust-remote-code --served-model-name Qwen3.6-27B --default-chat-template-kwargs '{\"preserve_thinking\": true}' --enable-auto-tool-choice"
},
"user": {
"id": "cmoessqbj0009jo04ovjn5szr",
"name": "Rohit Paul",
"username": "rohit",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofqbobm0016l504s1pd2x5c",
"modelRevision": "main",
"promptTokens": 21,
"outputTokens": 1000,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": 112,
"tokSOut": 94.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T12:13:13.570Z",
"notes": "Abliterated model with MTP (3 speculative tokens), FP8 KV cache. Decode measured with 1000-token output (MTP steady-state). TTFT from warm prefix cache. GPU running at 400W max TDP. Average of 6 runs.",
"model": {
"hfId": "sakamakismile/Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP",
"displayName": "Huihui-Qwen3.6-27B-abliterated-NVFP4-MTP",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "huihui-ai/Huihui-Qwen3.6-27B-abliterated",
"displayName": "Huihui-Qwen3.6-27B-abliterated"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "v0.19.2rc1.dev134+gfe9c3d6c5",
"quantization": "NVFP4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve --kv-cache-dtype fp8 --max-model-len 131072 --max-num-batched-tokens 4096 --enable-prefix-caching --max-num-seqs 4 --gpu-memory-utilization 0.90 --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}' --reasoning-parser qwen3 --tool-call-parser qwen3_coder --trust-remote-code --served-model-name Qwen3.6-27B --default-chat-template-kwargs '{\"preserve_thinking\": true}' --enable-auto-tool-choice"
},
"user": {
"id": "cmoessqbj0009jo04ovjn5szr",
"name": "Rohit Paul",
"username": "rohit",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoevn4cu0016l804vaasia9h",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": 459,
"tokSOut": 89,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:54:19.471Z",
"notes": null,
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 99,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server --model Qwen3.6-27B-Q4_K_M.gguf --n-gpu-layers 99 --ctx-size 262144 --cache-type-k q8_0 --cache-type-v q8_0 --flash-attn on --parallel 1 --batch-size 4096 --ubatch-size 4096 --temp 1.0 --top-k 20 --top-p 0.95 --swa-full --no-context-shift --no-mmap --mlock"
},
"user": {
"id": "cmoevbq6o000njo04gjks2ldt",
"name": "Hath",
"username": "claudehath",
"verified": false,
"verifiedAt": null
},
"rank": 4,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoevgkv6000iib04cmisudtv",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": 459,
"tokSOut": 89,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:49:14.274Z",
"notes": null,
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmoevbq6o000njo04gjks2ldt",
"name": "Hath",
"username": "claudehath",
"verified": false,
"verifiedAt": null
},
"rank": 5,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoevcpkp000sl804pb3yoj0c",
"modelRevision": "main",
"promptTokens": 128,
"outputTokens": 128,
"contextLength": 131072,
"batchSize": 4,
"ttftMs": 258.05,
"tokSOut": 57.54,
"tokSTotal": 57.54,
"peakVramGb": 28.1,
"createdAt": "2026-04-25T21:46:13.753Z",
"notes": "vLLM on Unraid RTX 5090 (600W). NVFP4 quant, fp8 KV cache, prefix caching, 131K context. Updated after power limit increase \u2014 ~13% faster output, ~17% faster TTFT.",
"model": {
"hfId": "sakamakismile/Qwen3.6-27B-NVFP4",
"displayName": "Qwen3.6-27B-NVFP4",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 5090",
"gpuCount": 1,
"vramGb": 32,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen 7 3700X",
"ramGb": 64,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev134+gfe9c3d6c5",
"quantization": "NVFP4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve sakamakismile/Qwen3.6-27B-NVFP4 --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --kv-cache-dtype fp8 --trust-remote-code --max-model-len 131072 --max-num-batched-tokens 4096 --enable-prefix-caching --max-num-seqs 4 --gpu-memory-utilization 0.90"
},
"user": {
"id": "cmoessqbj0009jo04ovjn5szr",
"name": "Rohit Paul",
"username": "rohit",
"verified": false,
"verifiedAt": null
},
"rank": 6,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 6
},
"RTX 5080 (16 GB)": {
"rows": [],
"total": 0
},
"RTX 4090 (24 GB)": {
"rows": [
{
"id": "cmofe5y0g0003l704etimxv7c",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 64000,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 165.06,
"tokSTotal": null,
"peakVramGb": 23.69,
"createdAt": "2026-04-26T06:32:50.801Z",
"notes": null,
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 4090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "i9 14900k",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "build_info: b8925-0adede866",
"quantization": "Q4_K_M",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "./llama-server -hf unsloth/Qwen3.6-35B-A3B-GGUF:UD-Q4_K_XL"
},
"user": {
"id": "cmofdq6250000la04xxal87ej",
"name": "adeeb aldkheel",
"username": "adeebaldkheel",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof7f8y90002ju04kejdop3a",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 157.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:24:07.569Z",
"notes": null,
"model": {
"hfId": "unsloth/Qwen3.5-35B-A3B-GGUF",
"displayName": "Qwen3.5-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 4090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Intel Core Ultra 9 285K",
"ramGb": 64,
"os": "Fedora 43"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "UD-Q4_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof85if50007l4048uhps6jx",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 46.69,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:44:32.897Z",
"notes": "BigBeast TurboQuant turbo3 KV, Qwen3.5-27B Q4_K_M on RTX 4090. pp512 3021 tok/s, tg128 46.69 tok/s. Source: ~/aurora_docs/BIGBEAST_TURBO3_BENCHMARKS.md.",
"model": {
"hfId": "unsloth/Qwen3.5-27B-GGUF",
"displayName": "Qwen3.5-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 4090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Intel Core Ultra 9 285K",
"ramGb": 64,
"os": "Fedora 43"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof7sd4b0002l40473uvzk0u",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 44,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:34:19.499Z",
"notes": "BigBeast single RTX 4090, Qwen3.5-27B dense Q4_0, Q4_1 KV due to 256K VRAM fit. Prompt processing 1105 tok/s. Source: ~/aurora_docs/QWEN35_LOCAL_BENCHMARKS.md.",
"model": {
"hfId": "unsloth/Qwen3.5-27B-GGUF",
"displayName": "Qwen3.5-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 4090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Intel Core Ultra 9 285K",
"ramGb": 64,
"os": "Fedora 43"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_0",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 4,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofbmhhd0007jt04k1tmw4hb",
"modelRevision": "main",
"promptTokens": 2048,
"outputTokens": 128,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 41.82,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T05:21:43.681Z",
"notes": "BigBeast RTX 4090 Qwen3.6-27B UD-Q4_K_XL GGUF, q4_1/q4_1 KV, flash attention, llama.cpp build 8892. Batched bench B=1: pp2048 2981 tok/s, tg128 41.82 tok/s. Source: /home/radar/llama.cpp/logs/bench/qwen36-27b-q4-batched-4090-20260422_215014.md on bigbeast-fedora.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 4090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Intel Core Ultra 9 285K",
"ramGb": 64,
"os": "Fedora 43"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "UD-Q4_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 5,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 5
},
"RTX 4080 (16 GB)": {
"rows": [],
"total": 0
},
"RTX 4070 Ti (12 GB)": {
"rows": [
{
"id": "cmofvuf7v0005jv046xvcrah0",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 65536,
"batchSize": 1,
"ttftMs": 248,
"tokSOut": 77.4,
"tokSTotal": 85,
"peakVramGb": 7,
"createdAt": "2026-04-26T14:47:46.315Z",
"notes": "Qwen3.5-9B-Q4_K_M (nouveau mod\u00e8le non encore sur HF) - M\u00e9triques r\u00e9elles depuis logs - RTX 4070 SUPER 12GB - TurboQuant turbo2",
"model": {
"hfId": "Qwen/Qwen2.5-7B-Instruct",
"displayName": "Qwen2.5-7B-Instruct",
"family": "Qwen",
"params": 7,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen2.5-7B",
"displayName": "Qwen2.5-7B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 4070 SUPER",
"gpuCount": 1,
"vramGb": 12,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen (CachyOS)",
"ramGb": 32,
"os": "CachyOS Linux"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 99,
"kvCacheDtype": "turbo2",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-bench -m Qwen3.5-9B-Q4_K_M.gguf -ngl 99 -fa 1 -pg 65536,128 -ctk turbo2 -ctv turbo2"
},
"user": {
"id": "cmoft7gd6000ekz04kdqf80s2",
"name": "le",
"username": "leclampin",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 1
},
"RTX 3090 (24 GB)": {
"rows": [
{
"id": "cmof1zrjo0003h204re6e99ao",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 170.5,
"tokSTotal": null,
"peakVramGb": 22.5,
"createdAt": "2026-04-26T00:52:07.092Z",
"notes": "Quant: Intel/Qwen3.6-35B-A3B-int4-AutoRound (group_size=128, sym, W4A16, repo currently gated on HF \u2014 used while it was public). Requires our pad-Marlin patch (vLLM PR #40361, OPEN) for TP=2 \u2014 without it Marlin rejects size_n=32 sub-tile shards. Measured 2026-04-20 at 230W power cap on consumer RTX 3090s, no NVLink bridge.",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev21+g893611813",
"quantization": "INT4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp16",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve Intel/Qwen3.6-35B-A3B-int4-AutoRound --tensor-parallel-size 2 --quantization auto_round --dtype float16 --max-model-len 32768 --gpu-memory-utilization 0.92 --max-num-seqs 8 --kv-cache-dtype auto --trust-remote-code --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --enable-chunked-prefill"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogpa6m10002k004199sqmb4",
"modelRevision": "82d411acf4a06cfb8d9b073a5211bf410bfc29bf",
"promptTokens": 30,
"outputTokens": 500,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 150.0733108123318,
"tokSTotal": 154.707,
"peakVramGb": 19.525,
"createdAt": "2026-04-27T04:31:50.521Z",
"notes": "DFlash speculative decoding benchmark submitted by Codex after two warmup requests against llama-server /v1/chat/completions. Recorded request used temperature=0, max_tokens=500, prompt: Write a Python binary search tree implementation with insert, search, delete, and traversal methods. DFlash acceptance on recorded run: 452 accepted draft tokens out of 693 draft tokens. This is a prompt-sensitive high-acceptance coding workload; earlier open-ended prose prompt on the same setup was much slower. Server launched with -c 4096; DFlash recurrent backup makes usable slot n_ctx 2048.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8945-30759dfa0",
"quantization": "UD-Q4_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 1,
"gpuLayers": 99,
"kvCacheDtype": null,
"attentionBackend": "flash_attn",
"flashAttn": true,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "/home/joao/buun-llama-cpp/build-cuda/bin/llama-server -m Qwen3.6-27B-UD-Q4_K_XL.gguf -md dflash-draft-3.6-q4_k_m.gguf --spec-type dflash -dev CUDA1 -devd CUDA1 -sm none -ngl 99 -ngld 99 -np 1 -c 4096 -cd 256 -fa on -b 512 -ub 128 --draft-max 16 --draft-min 1 --temp 0 --reasoning off --host 127.0.0.1 --port 8081 --no-webui"
},
"user": {
"id": "cmofelkpt0005l704vd0viq0a",
"name": "Jo\u00e3o Vieira",
"username": "joaosump",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogqkco20007l704rhj8sq83",
"modelRevision": "main",
"promptTokens": 117,
"outputTokens": 512,
"contextLength": 1024,
"batchSize": 1,
"ttftMs": 710,
"tokSOut": 140.28,
"tokSTotal": 146.6,
"peakVramGb": 22,
"createdAt": "2026-04-27T05:07:44.546Z",
"notes": "DFlash speculative decoding via Lucebox-Hub test_dflash binary (custom C++/CUDA on top of Luce-Org/llama.cpp@luce-dflash ggml fork). Block-diffusion draft from z-lab/Qwen3.6-27B-DFlash (BF16 safetensors, 3.46 GB, 2026-04-26 z-lab snapshot). DDTree tree-verify budget=22, chain_seed=1, fa_window=2048, KV f16. Bench: scripts/bench_he.py --n-gen 512 --ddtree-budget 22, mean of 10 HumanEval-style code-completion prompts (single-stream, batch=1). Per-prompt range 75.9-203.4 tok/s; peak sum_product 203.37 tok/s. Mean acceptance length 9.22, accept rate 57.7%. AR baseline same binary on same hardware: 35.79 tok/s (3.92x speedup). vLLM FP8 production single-stream on this rig: 44.36 tok/s (3.16x speedup). Critical CMake flags: -DCMAKE_CUDA_ARCHITECTURES=86 -DGGML_CUDA_GRAPHS=ON -DCMAKE_BUILD_TYPE=Release. Defaults compile sm_52 + graphs OFF, costs 3x.",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen 9 9950X",
"ramGb": 32,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "lucebox-hub b6ffab4a9",
"quantization": "Q4_K_M",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "build/test_dflash Qwen3.6-27B-Q4_K_M.gguf z-lab/Qwen3.6-27B-DFlash --fast-rollback --ddtree --ddtree-budget=22 --max-ctx=1024"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogixabp0004kw0453ux7bjd",
"modelRevision": "main",
"promptTokens": 22,
"outputTokens": 1000,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 122,
"tokSOut": 140.06,
"tokSTotal": null,
"peakVramGb": 21.05,
"createdAt": "2026-04-27T01:33:51.109Z",
"notes": "Quant: Intel/Qwen3.6-35B-A3B-Int4-AutoRound (HF gated, base hfId used). Genesis v7.48 (P67 multi-query Triton kernel + P67b spec-verify routing + P78 capture-guard). DFlash N=4 is the throughput sweet spot of an N-sweep on this stack (N=3 134, N=4 140, N=5 137 TPS). Mean acceptance length 2.44, avg draft accept 36.0%, per-position accept 0.70 / 0.43 / 0.23 / 0.10. Required on Ampere SM 8.6: --dtype bfloat16 (vllm#40334 dtype mismatch), FP16 KV cache (FA2 only backend that handles non-causal + head_size=256). Bench: 800-word essay prompt \u00d7 1000 max_tokens \u00d7 temp 0.6 streaming, 5 measured runs after 3 warmup, CV 4.3%.",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen ",
"ramGb": 96,
"os": "windows"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": null
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp16",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "vllm serve Qwen/Qwen3.6-35B-A3B --tensor-parallel-size 2 --quantization auto_round --dtype bfloat16 --max-model-len 32768 --gpu-memory-utilization 0.90 --max-num-seqs 1 --speculative-config '{\"method\":\"dflash\",\"num_speculative_tokens\":4}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 4,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof9h1970008l704j9dc7oo7",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 148.56,
"tokSOut": 140.01,
"tokSTotal": 280.03,
"peakVramGb": 22.15,
"createdAt": "2026-04-26T04:21:30.140Z",
"notes": "Qwen 3.6 35B-A3B MoE (3B active per token, sparse activation). Mixed-pair rig: RTX 3090 Ti (slot 1, x16) + RTX 3090 FE (slot 2, x4); TP=2 still gated by FE bandwidth. Single-stream 512/512 via `vllm bench serve` at concurrency=1, 20 prompts. Median TPOT 6.86 ms (~146 t/s decode). Required `--max-num-batched-tokens 4096` (default 2048 < hybrid Mamba block size 2096 \u2192 boot assertion). Prefix caching + torch.compile + CUDAGraphs engaged.",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B-FP8",
"displayName": "Qwen3.6-35B-A3B-FP8",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1",
"quantization": "fp8",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve Qwen/Qwen3.6-35B-A3B-FP8 --tensor-parallel-size 2 --gpu-memory-utilization 0.88 --max-model-len 32768 --max-num-seqs 2 --max-num-batched-tokens 4096 --kv-cache-dtype fp8 --dtype auto --enable-prefix-caching --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 5,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdju0x0006ju041pympxzf",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 138.01,
"tokSTotal": 3335.33,
"peakVramGb": null,
"createdAt": "2026-04-26T06:15:39.202Z",
"notes": "2x RTX 3090 NVLink, Ornstein 3.6-35B-A3B MoE (3B active params), all layers on GPU, flash attention. llama-bench pp512 tg128.",
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B-GGUF",
"displayName": "Ornstein3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8934",
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 6,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogj4tyd0001ic0446kgjl3h",
"modelRevision": "main",
"promptTokens": 22,
"outputTokens": 1000,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 120,
"tokSOut": 134.4,
"tokSTotal": null,
"peakVramGb": 21,
"createdAt": "2026-04-27T01:39:43.141Z",
"notes": "Quant: Intel/Qwen3.6-35B-A3B-Int4-AutoRound (HF gated, base hfId used). Genesis v7.48 (P67/P67b/P78). DFlash N=3 datapoint of an N-sweep (N=3 134, N=4 140, N=5 137 TPS). N=3 has highest avg draft accept rate (43.9%) but lowest AL (2.32) \u2192 less throughput than N=4. Per-position accept 0.71 / 0.42 / 0.21. Same hardware/patches as the N=4 entry. Bench: 800-word essay \u00d7 1000 max_tokens \u00d7 temp 0.6 streaming, 5 measured runs after 3 warmup, CV 2.7%.",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "Ryzen ",
"ramGb": 96,
"os": "windows"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": null
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp16",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "vllm serve Qwen3.6-35B-A3B-AutoRound-INT4 --tensor-parallel-size 2 --dtype bfloat16 --max-model-len 32768 --gpu-memory-utilization 0.90 --speculative-config '{\"method\":\"dflash\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 7,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogsu1xc000mjv04iforumvd",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": 122.57,
"tokSOut": 133.01,
"tokSTotal": 266.01,
"peakVramGb": 7,
"createdAt": "2026-04-27T06:11:16.416Z",
"notes": "Qwen 3-8B Q4_K_M via llama-server on single RTX 3090 Ti (slot 1 Gen4 x16). Bench: vllm bench serve openai-chat random 512/512 num-prompts 20 concurrency 1 --ignore-eos. TPOT 7.29 ms, ITL 7.34 ms \u2014 8B Q4 is HBM-bandwidth-saturated on 3090 Ti at ~135 t/s decode. KV q8_0, ngl 999, fa on, no spec decode. ~5 GB on disk, ~7 GB peak VRAM. **+51% output throughput vs M3 Ultra entry (88.25 t/s) on the same model+quant** \u2014 Ti's 1008 GB/s HBM2 + tensor cores beat M3 Ultra's unified-memory bandwidth on dense 8B inference.",
"model": {
"hfId": "Qwen/Qwen3-8B",
"displayName": "Qwen3-8B",
"family": "Qwen",
"params": 8,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3-8B-Base",
"displayName": "Qwen3-8B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen 9 9950X",
"ramGb": 32,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1-6217b49 (CUDA)",
"quantization": "Q4_K_M",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3-8B-Q4_K_M.gguf --alias qwen3-8b-q4 -ngl 999 -fa 1 --no-mmap -c 8192 -ctk q8_0 -ctv q8_0 -b 2048 -ub 2048 --metrics --jinja --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0.0 --host 127.0.0.1 --port 8003"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 8,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogsauq70006k40418jb5vob",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 65536,
"batchSize": 1,
"ttftMs": 212.55,
"tokSOut": 131.58,
"tokSTotal": 263.17,
"peakVramGb": 17,
"createdAt": "2026-04-27T05:56:20.624Z",
"notes": "Qwen 3.6 35B-A3B MoE (3B active) UD-Q6_K_XL via llama-server (Unsloth Dynamic 2.0). Dual RTX 3090 (Ti slot 1 Gen4 x16 + Dell slot 2 Gen4 x4), default layer split. Bench: vllm bench serve openai-chat random 512/512 num-prompts 20 concurrency 1 --ignore-eos. TPOT 7.20 ms \u2014 MoE 3B-active sparse activation gives sharply lower per-token latency than dense FP8 27B (13.34 ms TPOT). Total throughput 263 t/s. Quant choice matches Bosgame M5 production sweet spot (validated against Hermes agent harness \u2014 8/8 tool-call passes, clean format compliance). Thinking off via server-side --chat-template-kwargs '{\"enable_thinking\":false}'. Use case: HTML/forms/quick-codegen mode in opencode toggle (vllm-mode dense|moe). VRAM 17 GB per card across both = ~34 GB total (Q6_K_XL 32 GB on disk). Prefix caching off, KV q8_0.",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
        "isMoE": true,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090 Ti",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen 9 9950X",
"ramGb": 32,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1-6217b49 (CUDA)",
"quantization": "UD-Q6_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf --alias qwen3.6-35b-a3b-q6 -ngl 999 -fa 1 --no-mmap -c 65536 --cache-reuse 256 -ctk q8_0 -ctv q8_0 --no-context-shift --parallel 4 -b 2048 -ub 2048 --metrics --jinja --chat-template-kwargs '{\"enable_thinking\":false}' --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0.0 --host 0.0.0.0 --port 8000"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 9,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogjbets0008kw045ktjd4zx",
"modelRevision": "main",
"promptTokens": 22,
"outputTokens": 1000,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 125.94,
"tokSTotal": null,
"peakVramGb": 10.1,
"createdAt": "2026-04-27T01:44:50.128Z",
"notes": "Tom Turney's TurboQuant+ fork of llama.cpp (commit b3401eaf9). Asymmetric KV: q8_0 keys + turbo4 values per @no_stp_on_snek's twitter post. 2\u00d7 RTX 3090 layer-split, 50/50 tensor-split. Replication of his 2\u00d7 GTX 1080 Ti benchmark \u2014 Pascal got 36 TPS at 128K, Ampere gets 125.94 TPS = 3.5\u00d7 faster on same model + KV scheme. CV 0.0% across 5 runs. Combined with our 27B Dense Q4_K_M = 37.78 TPS bench, MoE-vs-dense advantage on Ampere = 3.3\u00d7 (vs Pascal's 2.6\u00d7). Bench: 800-word essay \u00d7 1000 max_tokens \u00d7 temp 0.6 streaming.",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
        "isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
        "cpu": "Ryzen",
"ramGb": 96,
        "os": "Windows"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "Tom-TurboQuant-fork-b3401eaf9",
"quantization": "Q3_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q8_0/turbo4 (asymmetric)",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server --model Qwen3.6-35B-A3B-UD-Q3_K_XL.gguf --ctx-size 131072 --n-gpu-layers 999 --split-mode layer --tensor-split 0.5,0.5 --cache-type-k q8_0 --cache-type-v turbo4 --batch-size 2048 --ubatch-size 512 --threads 8"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 10,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof7ls5a0003l604jnromcue",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 124.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:29:12.382Z",
"notes": "BigBeast single RTX 3090, Qwen3.5-35B-A3B MoE, Q8_0 KV, single slot. Prompt processing 811 tok/s. Source: ~/aurora_docs/QWEN35_LOCAL_BENCHMARKS.md on bigbeast-fedora.",
"model": {
"hfId": "unsloth/Qwen3.5-35B-A3B-GGUF",
"displayName": "Qwen3.5-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
        "isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "UD-Q4_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 11,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof29ivq000ajr047dp6behr",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 1048576,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 124,
"tokSTotal": null,
"peakVramGb": 22.5,
"createdAt": "2026-04-26T00:59:42.423Z",
"notes": "Qwen3.6-35B-A3B (MoE, 3B-active) at 1M context window. Layer-split across 2\u00d7 RTX 3090, llama.cpp turboquant fork (Tom Jobbins' fork supporting q4_0 KV cache + extended ctx). Short-prompt 124 TPS sustained at full 1M ctx. Loaded-context generation 37 TPS @ 128K. UD-Q4_K_XL = Unsloth Dynamic Quantization (variable-precision Q4_K_M derivative). Measured 2026-04-19 at 230W power cap, no NVLink bridge.",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
        "isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "turboquant-fork",
"quantization": "UD-Q4_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf --ctx-size 1048576 --n-gpu-layers 999 --split-mode layer -ctk q4_0 -ctv q4_0 -fa on --threads 16"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 12,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogt0ksa000wjv04oqlbcm4w",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": 155.81,
"tokSOut": 119.05,
"tokSTotal": 238.09,
"peakVramGb": 8,
"createdAt": "2026-04-27T06:16:20.795Z",
"notes": "Qwen 3.5-9B unsloth Dynamic 2.0 UD-Q4_K_XL via llama-server on single RTX 3090 Ti (slot 1 Gen4 x16). Bench: vllm bench serve openai-chat random 512/512 num-prompts 20 concurrency 1 --ignore-eos. TPOT 8.11 ms. KV q8_0, ngl 999, fa on, no spec decode. ~6 GB on disk (5.96 GB), ~8 GB peak VRAM. Model is from Qwen 3.5 Small family (0.8B/2B/4B/9B) released Feb 2026.",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen 9 9950X",
"ramGb": 32,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1-6217b49 (CUDA)",
"quantization": "UD-Q4_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.5-9B-UD-Q4_K_XL.gguf --alias qwen3.5-9b-q4 -ngl 999 -fa 1 --no-mmap -c 8192 -ctk q8_0 -ctv q8_0 -b 2048 -ub 2048 --metrics --jinja --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0.0 --host 127.0.0.1 --port 8003"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 13,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofbm4i90004jt04iovaovs7",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 110.11,
"tokSTotal": 3842.59,
"peakVramGb": null,
"createdAt": "2026-04-26T05:21:26.865Z",
"notes": "2x RTX 3090 NVLink, all layers on GPU, flash attention. llama-bench pp512 tg128.",
"model": {
"hfId": "unsloth/Qwen3.5-9B-GGUF",
"displayName": "Qwen3.5-9B-GGUF",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8934",
"quantization": "Q4_K_S",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 14,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoemsu520001kv046904a8an",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 138000,
"batchSize": 1,
"ttftMs": 224,
"tokSOut": 102,
"tokSTotal": null,
"peakVramGb": 28,
"createdAt": "2026-04-25T17:46:49.622Z",
"notes": null,
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "unsloth/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": "CUDA"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "LMstudio"
},
"user": {
"id": "cmo3v1l6p0000jk04cplvn9g5",
"name": "Lotto",
"username": "Lottolottolotto",
"verified": false,
"verifiedAt": null
},
"rank": 15,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogcx9tt0002jr04w918m9mh",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 256,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 479.7,
"tokSOut": 88.96,
"tokSTotal": 234.02,
"peakVramGb": 22.56,
"createdAt": "2026-04-26T22:45:52.769Z",
"notes": "MTP speculative decoding (3 tokens). Greedy decode, ignore_eos=true. Median of 5 runs after 2 warmup; greedy; ignore_eos=true. Actual prompt tokens (server): 527. Decode tok/s stddev: 0.05.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev215+g32e45636e",
"quantization": "INT4",
"backend": null
},
"engineFlags": {
"tensorParallel": 1,
"gpuLayers": null,
"kvCacheDtype": "fp8_e5m2",
"attentionBackend": "FLASHINFER",
"flashAttn": true,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound \\\n --tensor-parallel-size 1 --quantization auto_round --dtype float16 \\\n --max-model-len 32768 --gpu-memory-utilization 0.95 --max-num-seqs 1 \\\n --kv-cache-dtype fp8_e5m2 --trust-remote-code \\\n --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder \\\n --enable-prefix-caching --enable-chunked-prefill \\\n --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoga3m060002l704h3s1zjue",
"name": "moussaba",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 16,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoggj8vc0003le04bd623hh7",
"modelRevision": "main",
"promptTokens": 4096,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 1968.7,
"tokSOut": 85.22,
"tokSTotal": 580.15,
"peakVramGb": 22.56,
"createdAt": "2026-04-27T00:26:56.808Z",
"notes": "MTP speculative decoding (3 tokens). Greedy decode, ignore_eos=true. Median of 5 runs after 2 warmup; greedy; ignore_eos=true. Actual prompt tokens (server): 4111. Decode tok/s stddev: 0.06.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev215+g32e45636e",
"quantization": "INT4",
"backend": null
},
"engineFlags": {
"tensorParallel": 1,
"gpuLayers": null,
"kvCacheDtype": "fp8_e5m2",
"attentionBackend": "FLASHINFER",
"flashAttn": true,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound \\\n --tensor-parallel-size 1 --quantization auto_round --dtype float16 \\\n --max-model-len 32768 --gpu-memory-utilization 0.95 --max-num-seqs 1 \\\n --kv-cache-dtype fp8_e5m2 --trust-remote-code \\\n --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder \\\n --enable-prefix-caching --enable-chunked-prefill \\\n --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoga3m060002l704h3s1zjue",
"name": "moussaba",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 17,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof2ilht0008lb04dw1zuqwm",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 185000,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 78.2,
"tokSTotal": null,
"peakVramGb": 21.7,
"createdAt": "2026-04-26T01:06:45.713Z",
"notes": "Qwen3.6-27B (dense, Qwen3-Next hybrid) with Lorbus INT4 AutoRound + native MTP head preserved. DFlash speculative decoding via z-lab/Qwen3.6-27B-DFlash draft (BF16, ~3.5 GB), N=5 speculative tokens. Vision (MoonViT) enabled. Code-prompt peak 128 TPS (quicksort) vs narrative 78 TPS reported here (steady-state on 800-word essay). All 7 functional checks pass: tools, streaming, thinking, recall ladder 10K/30K/60K/90K. Requires --dtype bfloat16 (workaround for vLLM PR #40334, OPEN). Requires our pad-Marlin patch (vLLM PR #40361, OPEN) for TP=2 INT4 sub-tile shards. Measured 2026-04-25 at 230W power cap, no NVLink bridge.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp16",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound --tensor-parallel-size 2 --quantization auto_round --dtype bfloat16 --max-model-len 185000 --gpu-memory-utilization 0.95 --max-num-seqs 1 --max-num-batched-tokens 8192 --trust-remote-code --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --enable-chunked-prefill --speculative-config '{\"method\":\"dflash\",\"model\":\"/path/to/qwen3.6-27b-dflash\",\"num_speculative_tokens\":5}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 18,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof2rbxq000djr04m4s9umwu",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 71,
"tokSTotal": null,
"peakVramGb": 22.4,
"createdAt": "2026-04-26T01:13:33.231Z",
"notes": "Qwen3.6-27B (dense, Qwen3-Next hybrid), Lorbus INT4 AutoRound, full 262K context. fp8_e5m2 KV cache. Native MTP head spec-decode (num_speculative_tokens=3, ~92/81/67% per-position acceptance). Vision (MoonViT) + tools + thinking + recall ladder all working. Code peak 89 TPS on quicksort vs narrative 71 TPS reported here. KV pool 168K tokens \u2192 2.36\u00d7 concurrency at full 262K. Concurrent throughput: ~119 TPS at 2 streams, ~257 TPS at 4, ~385 TPS at 8 (short prompts). Loaded-context generation 36 TPS @ 100K, 28 TPS @ 200K. Requires our pad-Marlin patch (vLLM PR #40361, OPEN) for TP=2 INT4 sub-tile shards. Measured 2026-04-25 at 230W power cap, no NVLink bridge.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp8_e5m2",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound --tensor-parallel-size 2 --quantization auto_round --dtype float16 --max-model-len 262144 --gpu-memory-utilization 0.92 --max-num-seqs 2 --max-num-batched-tokens 8192 --kv-cache-dtype fp8_e5m2 --trust-remote-code --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --enable-chunked-prefill --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 19,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogs42490005lb04f8a8wm4k",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": 465.38,
"tokSOut": 70.32,
"tokSTotal": 140.64,
"peakVramGb": 22,
"createdAt": "2026-04-27T05:51:03.609Z",
"notes": "Qwen 3.6 27B FP8 dense + MTP n=3 speculative decoding (FP8 weights ship native mtp.layers.0.* \u2014 verified 2026-04-27, mtp_num_hidden_layers=1 in text_config). vLLM 0.19.1 --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'. Dual RTX 3090 (Ti slot 1 Gen4 x16 + Dell slot 2 Gen4 x4, hardware-detected as same gpuName for dedupe). Bench: vllm bench serve openai-chat random 512/512 num-prompts 20 concurrency 1 --ignore-eos --extra-body chat_template_kwargs.enable_thinking=false. MTP acceptance 55.02% / avg length 2.65 / per-position P0=75%/P1=54%/P2=36%. TPOT 13.34 ms. **1.59\u00d7 output throughput vs prior FP8 single-stream (44.17 t/s, no spec decode)**, zero quality cost (same FP8 weights, just MTP draft path). Tool calls verified working (qwen3_coder parser + qwen3 reasoning parser). Thinking-off is the production default \u2014 bench verified that thinking-on causes 'silent finish' (model exhausts max_tokens drafting in reasoning channel without emitting visible content) on 5/6 prompt categories.",
"model": {
"hfId": "Qwen/Qwen3.6-27B-FP8",
"displayName": "Qwen3.6-27B-FP8",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090 Ti",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD Ryzen 9 9950X",
"ramGb": 32,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1 + MTP n=3",
"quantization": "FP8",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Qwen3.6-27B-FP8 --tensor-parallel-size 2 --gpu-memory-utilization 0.88 --max-model-len 131072 --max-num-seqs 2 --kv-cache-dtype fp8 --enable-prefix-caching --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3 --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 20,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof36q3i000njr04topmfpwo",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 63.8,
"tokSTotal": null,
"peakVramGb": 22,
"createdAt": "2026-04-26T01:25:31.422Z",
"notes": "Qwen3.6-27B (dense, Qwen3-Next hybrid), Lorbus INT4 AutoRound, single RTX 3090. fp8_e5m2 KV cache. Native MTP head spec-decode (num_speculative_tokens=3). All features working: vision (MoonViT), tools, thinking, recall ladder. Code peak 79.7 TPS vs narrative 63.8 TPS reported here. Best single-card 27B config with all features verified end-to-end. ~14% slower than the same config on dual-card TP=2 because batch=1 decode is memory-bandwidth-bound \u2014 single 3090 has half the bandwidth of TP=2 dual-3090. KV pool 20,800 tokens at 32K (max-num-seqs=1). Companion config to the dual-card recipe at github.com/noonghunna/qwen36-dual-3090. Measured 2026-04-25 at 230W power cap.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 1,
"gpuLayers": null,
"kvCacheDtype": "fp8_e5m2",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound --tensor-parallel-size 1 --quantization auto_round --dtype float16 --max-model-len 32768 --gpu-memory-utilization 0.95 --max-num-seqs 1 --kv-cache-dtype fp8_e5m2 --trust-remote-code --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --enable-chunked-prefill --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 21,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoft2ti5000lii040g3yxthl",
"modelRevision": "main",
"promptTokens": 69,
"outputTokens": 512,
"contextLength": 16384,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 59.8,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T13:30:19.230Z",
"notes": "2x RTX 3090 NVLink, TP=2, MTP speculative decoding (3 draft tokens, 55.8% acceptance), enforce-eager, AWQ INT4 + BF16 MTP head. Marlin INT4 kernels native on Ampere. Post-reboot clean GPU state.",
"model": {
"hfId": "hampsonw/Qwen3.6-27B-AWQ-BF16-INT4-mtp-bf16",
"displayName": "Qwen3.6-27B-AWQ-BF16-INT4-mtp-bf16",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1",
"quantization": "AWQ_INT4",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 22,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof2z3n20006l90452twxyjy",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 58.2,
"tokSTotal": null,
"peakVramGb": 24.1,
"createdAt": "2026-04-26T01:19:35.726Z",
"notes": "Qwen3.6-27B (dense, Qwen3-Next hybrid), Lorbus INT4 AutoRound, full 262K context, MTP n=3, vision. KV cache: TurboQuant 3-bit (turboquant_3bit_nc) via Genesis v7.14 (https://github.com/Sandermage/genesis-vllm-patches) \u2014 P65 cudagraph downgrade for spec-decode fixes vllm#40880 (MTP \u00d7 TurboQuant \u00d7 cudagraph corruption). KV pool: 1,498,464 tokens (9\u00d7 the fp8 default of 168K). Concurrency at full 262K: 4.59\u00d7 streams (vs fp8 default 2.36\u00d7). Code peak 69 TPS vs narrative 58 TPS reported here. ~25% per-stream TPS regression vs fp8 (P65 forces cudagraph_mode=PIECEWISE for spec-decode \u2192 eager continuation), gained back via concurrency above ~3 streams. Consumer-Ampere note: gpu-mem 0.85 + max-num-batched-tokens 4128 to leave activation headroom (Sandermage runs A5000 32GB at 0.92, doesn't hit this). Requires Marlin pad PR #40361 (OPEN) + our patch_tolist_cudagraph.py. Measured 2026-04-25.",
"model": {
"hfId": "Lorbus/Qwen3.6-27B-int4-AutoRound",
"displayName": "Qwen3.6-27B-int4-AutoRound",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.2rc1.dev205+g07351e088",
"quantization": "INT4",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "turboquant_3bit_nc",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": true,
"commandSnippet": "vllm serve Lorbus/Qwen3.6-27B-int4-AutoRound --tensor-parallel-size 2 --quantization auto_round --dtype float16 --max-model-len 262144 --gpu-memory-utilization 0.85 --max-num-seqs 4 --max-num-batched-tokens 4128 --kv-cache-dtype turboquant_3bit_nc --trust-remote-code --reasoning-parser qwen3 --enable-auto-tool-choice --tool-call-parser qwen3_coder --enable-prefix-caching --enable-chunked-prefill --speculative-config '{\"method\":\"mtp\",\"num_speculative_tokens\":3}'"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 23,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofauhkl0006js04hlnh28qt",
"modelRevision": "main",
"promptTokens": 69,
"outputTokens": 1024,
"contextLength": 16384,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 55.2,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T04:59:57.429Z",
"notes": "2x RTX 3090 NVLink, TP=2, MTP speculative decoding (3 draft tokens, 49.5% acceptance), enforce-eager, AWQ INT4 weights with BF16 MTP head",
"model": {
"hfId": "hampsonw/Qwen3.6-27B-AWQ-BF16-INT4-mtp-bf16",
"displayName": "Qwen3.6-27B-AWQ-BF16-INT4-mtp-bf16",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1",
"quantization": "AWQ",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 24,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogp33aw0004jl045g4sbd9e",
"modelRevision": "82d411acf4a06cfb8d9b073a5211bf410bfc29bf",
"promptTokens": 20,
"outputTokens": 500,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 49.43213842343312,
"tokSTotal": 50.90984157774624,
"peakVramGb": 35.502,
"createdAt": "2026-04-27T04:26:19.640Z",
"notes": "Submitted by Codex after two warmup requests against llama-server /v1/chat/completions. Benchmark request used temperature=0, max_tokens=500, prompt: Please explain to me the main concepts of quantum physics. Peak VRAM observed during run: about 18.32 GiB on GPU0 and 17.18 GiB on GPU1. Server default had Qwen thinking enabled; output tokens in recorded response were reasoning_content.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
        "cpu": "Ryzen",
"ramGb": 96,
"os": "windows"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8938-b1a5bd4e0",
"quantization": "Q4_K_M",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": null,
"attentionBackend": "flash_attn",
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -hf unsloth/Qwen3.6-27B-GGUF:Q4_K_M --host 0.0.0.0 --port 8080 -ngl 999 -fa on --jinja --no-mmap --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0.00 --presence-penalty 1.50 --split-mode tensor -np 1"
},
"user": {
"id": "cmofelkpt0005l704vd0viq0a",
"name": "Jo\u00e3o Vieira",
"username": "joaosump",
"verified": false,
"verifiedAt": null
},
"rank": 25,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofb22uy0004jx04u73naj2s",
"modelRevision": "main",
"promptTokens": 69,
"outputTokens": 1024,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 48.5,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T05:05:51.610Z",
"notes": "2x RTX 3090 NVLink, TP=2, MTP speculative decoding (2 draft tokens, 60.7% acceptance), enforce-eager. FP8 emulated via Marlin on Ampere.",
"model": {
"hfId": "Qwen/Qwen3.6-27B-FP8",
"displayName": "Qwen3.6-27B-FP8",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1",
"quantization": "fp8",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 26,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof8rjey000hl804orjkwuhd",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": 421.33,
"tokSOut": 44.17,
"tokSTotal": 88.34,
"peakVramGb": 22,
"createdAt": "2026-04-26T04:01:40.618Z",
"notes": "Mixed-pair rig: RTX 3090 Ti (slot 1, x16) + RTX 3090 FE (slot 2, x4). TP=2 throughput gated by the slower FE card \u2014 the Ti's compute headroom is mostly stranded. Single-stream 512/512 measured via `vllm bench serve` at concurrency=1, 20 prompts. Prefix caching + torch.compile + CUDAGraphs (FULL_AND_PIECEWISE) all engaged.",
"model": {
"hfId": "Qwen/Qwen3.6-27B-FP8",
"displayName": "Qwen3.6-27B-FP8",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "vllm",
"engineVersion": "0.19.1",
"quantization": "fp8",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": null,
"kvCacheDtype": "fp8",
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "vllm serve Qwen/Qwen3.6-27B-FP8 --tensor-parallel-size 2 --gpu-memory-utilization 0.88 --max-model-len 262144 --max-num-seqs 2 --kv-cache-dtype fp8 --dtype auto --hf-overrides '{\"language_model_only\": true}' --enable-prefix-caching --enable-auto-tool-choice --tool-call-parser qwen3_coder --reasoning-parser qwen3"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 27,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof8c3e30003l80461yycsyd",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 40.85,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:49:40.012Z",
"notes": "BigBeast TurboQuant turbo3 KV, Qwen3.5-27B Q4_K_M on RTX 3090. pp512 1384 tok/s, tg128 40.85 tok/s. Source: ~/aurora_docs/BIGBEAST_TURBO3_BENCHMARKS.md.",
"model": {
"hfId": "unsloth/Qwen3.5-27B-GGUF",
"displayName": "Qwen3.5-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 28,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofelzzf0005ie04osbwxjd9",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 462.98,
"tokSOut": 40.34,
"tokSTotal": 80.71,
"peakVramGb": 19.8,
"createdAt": "2026-04-26T06:45:19.851Z",
"notes": "PC dual-RTX-3090 (Ti+FE), llama.cpp -sm row TP=2. Qwen 3.6 35B-A3B MoE (3B active) at unsloth UD-Q6_K_XL (30 GB). 512/512 single-stream, 20 prompts. KV q8_0. Notable: M5 Strix Halo runs same model at 50.36 t/s (1.25x faster). For sparse MoE single-stream via llama.cpp, dual-card row-split all-reduce overhead exceeds Strix Halo unified-memory advantage. PC vLLM FP8 (different stack): 140 t/s \u2014 vLLM MoE kernel wins decisively.",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
        "isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8870",
"quantization": "UD-Q6_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf -ngl 999 -fa 1 --no-mmap --mlock -c 32768 -sm row -ts 1,1 -ctk q8_0 -ctv q8_0 -b 4096 -ub 4096 --parallel 1"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 29,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofaw63i0001l504b6245kbr",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 38.901,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T05:01:15.871Z",
"notes": "BigBeast RTX 3090 Gemma 4 31B Q4_K_M mainline q4_0/q4_0 KV. Prefill: 512=1408.09, 100K=649.885, 131072=560.072, 150K=516.029 tok/s; decode 38.901 tok/s. Source: ~/aurora_docs/GEMMA4_31B_3090_KV_RESULTS_2026-04-10.csv.",
"model": {
"hfId": "unsloth/gemma-4-31B-it-GGUF",
"displayName": "gemma-4-31B-it-GGUF",
"family": "Gemma",
"params": 31,
"isMoE": false,
"baseModel": {
"hfId": "google/gemma-4-31B-it",
"displayName": "gemma-4-31B-it"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 30,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoh3cued0004js0482vbtb3k",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 256,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": 813.9,
"tokSOut": 38.78,
"tokSTotal": 105.97,
"peakVramGb": 21.58,
"createdAt": "2026-04-27T11:05:49.286Z",
"notes": "llama.cpp with Unsloth Dynamic UD-Q4_K_XL GGUF (mixed K-quants, ~5.24 BPW). flash-attn on, q8_0 KV cache, 4 parallel slots (--parallel auto), --kv-unified shared KV pool. No speculative decoding. Greedy decode, ignore_eos=true. Median of 5 runs after 2 warmup; greedy; ignore_eos=true. Actual prompt tokens (server): 527. Decode tok/s stddev: 0.04.",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8892-0d0764dfd",
"quantization": "UD-Q4_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 65,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "./llama.cpp/llama-server \\\n --model unsloth/Qwen3.6-27B-GGUF/Qwen3.6-27B-UD-Q4_K_XL.gguf \\\n --alias \"unsloth/Qwen3.6-27B\" \\\n --host 0.0.0.0 --port 8001 \\\n --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 \\\n --kv-unified \\\n --cache-type-k q8_0 --cache-type-v q8_0 \\\n --flash-attn on --fit on \\\n --ctx-size 131072"
},
"user": {
"id": "cmoga3m060002l704h3s1zjue",
"name": "moussaba",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 31,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofb2r0p0001kv04p9ar0tg5",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 38.733,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T05:06:22.922Z",
"notes": "BigBeast RTX 3090 Gemma 4 31B Q4_K_M turboquant q4_0/turbo3 KV. Prefill: 512=1390.32, 100K=646.249, 131072=556.740, 150K=513.469 tok/s; decode 38.733 tok/s. Source: ~/aurora_docs/GEMMA4_31B_3090_KV_RESULTS_2026-04-10.csv.",
"model": {
"hfId": "unsloth/gemma-4-31B-it-GGUF",
"displayName": "gemma-4-31B-it-GGUF",
"family": "Gemma",
"params": 31,
"isMoE": false,
"baseModel": {
"hfId": "google/gemma-4-31B-it",
"displayName": "gemma-4-31B-it"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 32,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofb8rpy0008jx04r6ydaaf8",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 38.41,
"tokSTotal": 1290.87,
"peakVramGb": null,
"createdAt": "2026-04-26T05:11:03.766Z",
"notes": "2x RTX 3090 NVLink, all layers on GPU, flash attention, Q4_0 KV cache. llama-bench pp512 tg128.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8934",
"quantization": "Q4_K_P",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 33,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmof7yxtg0003l804v9nek86b",
"modelRevision": "main",
"promptTokens": 0,
"outputTokens": 0,
"contextLength": 2048,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 38.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T03:39:26.260Z",
"notes": "BigBeast single RTX 3090, Qwen3.5-27B dense Q4_0, Q4_1 KV. Prompt processing 692 tok/s. Source: ~/aurora_docs/QWEN35_LOCAL_BENCHMARKS.md.",
"model": {
"hfId": "unsloth/Qwen3.5-27B-GGUF",
"displayName": "Qwen3.5-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": null,
"quantization": "Q4_0",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmof4nith0000i9049jn28jo6",
"name": "Bermont",
"username": "basecampbernie",
"verified": false,
"verifiedAt": null
},
"rank": 34,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoh3k1zb000fl704qcvewppz",
"modelRevision": "main",
"promptTokens": 4096,
"outputTokens": 512,
"contextLength": 131072,
"batchSize": 1,
"ttftMs": 3782.9,
"tokSOut": 37.88,
"tokSTotal": 267.68,
"peakVramGb": 21.6,
"createdAt": "2026-04-27T11:11:25.703Z",
"notes": "llama.cpp with Unsloth Dynamic UD-Q4_K_XL GGUF (mixed K-quants, ~5.24 BPW). flash-attn on, q8_0 KV cache, 4 parallel slots (--parallel auto), --kv-unified shared KV pool. No speculative decoding. Greedy decode, ignore_eos=true. Median of 5 runs after 2 warmup; greedy; ignore_eos=true. Actual prompt tokens (server): 4111. Decode tok/s stddev: 0.04.",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8892-0d0764dfd",
"quantization": "UD-Q4_K_XL",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 65,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "./llama.cpp/llama-server \\\n --model unsloth/Qwen3.6-27B-GGUF/Qwen3.6-27B-UD-Q4_K_XL.gguf \\\n --alias \"unsloth/Qwen3.6-27B\" \\\n --host 0.0.0.0 --port 8001 \\\n --temp 0.6 --top-p 0.95 --top-k 20 --min-p 0.00 \\\n --kv-unified \\\n --cache-type-k q8_0 --cache-type-v q8_0 \\\n --flash-attn on --fit on \\\n --ctx-size 131072"
},
"user": {
"id": "cmoga3m060002l704h3s1zjue",
"name": "moussaba",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 35,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogjhzqv0004jv04sl792yp8",
"modelRevision": "main",
"promptTokens": 22,
"outputTokens": 1000,
"contextLength": 65536,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 37.78,
"tokSTotal": null,
"peakVramGb": 10.2,
"createdAt": "2026-04-27T01:49:57.175Z",
"notes": "Tom Turney's TurboQuant+ fork of llama.cpp. Asymmetric KV: q8_0 keys + turbo4 values per @no_stp_on_snek's twitter post. 2\u00d7 RTX 3090 layer-split, 50/50 tensor-split. Pascal (2\u00d7 1080 Ti) got 14 TPS at 65K; Ampere gets 37.78 = 2.7\u00d7 faster. CV 0.1% across 5 runs (essentially deterministic). Combined with the 35B-A3B Q3_K_XL bench (125.94 TPS), MoE-vs-dense advantage = 3.3\u00d7 on Ampere (vs Pascal's 2.6\u00d7). Bench: 800-word essay \u00d7 1000 max_tokens \u00d7 temp 0.6 streaming.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
        "cpu": "Ryzen",
"ramGb": 96,
"os": "windows"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "Tom-TurboQuant-fork-b3401eaf9",
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 999,
"kvCacheDtype": "q8_0/turbo4 (asymmetric)",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server --model Qwen3.6-27B-Q4_K_M.gguf --ctx-size 65536 --n-gpu-layers 999 --split-mode layer --tensor-split 0.5,0.5 --cache-type-k q8_0 --cache-type-v turbo4 --batch-size 2048 --ubatch-size 512 --threads 8"
},
"user": {
"id": "cmoetzsh3000el704f9wkttae",
"name": "Wasif Basharat",
"username": "wasifb",
"verified": false,
"verifiedAt": null
},
"rank": 36,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoerh82s0001l404jm347jzo",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 73728,
"batchSize": 1,
"ttftMs": 374,
"tokSOut": 36.37,
"tokSTotal": null,
"peakVramGb": 21,
"createdAt": "2026-04-25T19:57:45.893Z",
"notes": "Short-reply variant of cmoer7qzw000ale04gs1qkc1n. Same config, 512 prompt \u2192 128 output. Quick-dispatch tool-call profile. See main entry cmoer7qzw000ale04gs1qkc1n for full DeltaNet stability config.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1540",
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 99,
"kvCacheDtype": "bf16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m gary-qwen36-27b-v4-Q4_K_M.gguf -ngl 99 -c 73728 -np 1 --cache-type-k bf16 --cache-type-v bf16 --jinja --flash-attn on --batch-size 2048 --ubatch-size 512 --threads 8 --threads-batch 16 --temp 0.1 --reasoning-format deepseek --reasoning-budget 1024 --no-context-shift --ctx-checkpoints 0 --checkpoint-every-n-tokens -1"
},
"user": {
"id": "cmoequ7db0000lb04vgu9qwft",
"name": "Tim",
"username": "SMOK33Y3",
"verified": false,
"verifiedAt": null
},
"rank": 37,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoffvmc00001l504493iorau",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 73728,
"batchSize": 1,
"ttftMs": 347,
"tokSOut": 35.71,
"tokSTotal": null,
"peakVramGb": 21,
"createdAt": "2026-04-26T07:20:48.337Z",
"notes": "Production config running a domain-specific fine-tune of unsloth/Qwen3.6-27B-GGUF Q4_K_M for native tool-calling. Hard-won hybrid-DeltaNet stability config after a 14-crash debugging series: paired --ctx-checkpoints 0 + --checkpoint-every-n-tokens -1 (full-disable; 0 alone trips create-then-erase via while size>=0), no --mmproj for text-only serving (#21690/#19980 interaction bugs), no LLAMA_ATTN_ROT_DISABLE (Qwen3.5-only env, interferes with 3.6 DeltaNet kernels), --no-context-shift mandatory, --reasoning-budget 1024 to cap unbounded <think> (one earlier crash was unbounded reasoning at 20K context fill). nginx reverse-proxy on :8034 -> llama-server :8033 with -np 1 to serialize concurrent callers. 3+ days production-clean. PCIe Gen 3 + Resizable BAR disabled in BIOS for POST stability. Build 1540 (b76429a69). Companion entries: cmoerh82s0001l404jm347jzo (512->128), cmoerpri70001i904utd6uxna (8192->512) show decode rate degradation 36.37 -> 35.71 -> 32.65 as context grows (bf16 KV memory-bandwidth signature).",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1540",
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 99,
"kvCacheDtype": "bf16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m gary-qwen36-27b-v4-Q4_K_M.gguf -ngl 99 -c 73728 -np 1 --cache-type-k bf16 --cache-type-v bf16 --jinja --flash-attn on --batch-size 2048 --ubatch-size 512 --threads 8 --threads-batch 16 --temp 0.1 --reasoning-format deepseek --reasoning-budget 1024 --no-context-shift --ctx-checkpoints 0 --checkpoint-every-n-tokens -1"
},
"user": {
"id": "cmoequ7db0000lb04vgu9qwft",
"name": "Tim",
"username": "SMOK33Y3",
"verified": false,
"verifiedAt": null
},
"rank": 38,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoerpri70001i904utd6uxna",
"modelRevision": "main",
"promptTokens": 8192,
"outputTokens": 512,
"contextLength": 73728,
"batchSize": 1,
"ttftMs": 7416,
"tokSOut": 32.65,
"tokSTotal": null,
"peakVramGb": 21,
"createdAt": "2026-04-25T20:04:24.320Z",
"notes": "Larger-prompt variant of cmoer7qzw000ale04gs1qkc1n. 8K prompt -> 512 output. Prefill ~1104 tok/s. Decode ~9% slower than 512-prompt baseline (32.65 vs 35.71 tok/s) due to KV scan cost growing with context. See main entry for full DeltaNet stability config.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "NVIDIA GeForce RTX 3090 Ti",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b1540",
"quantization": "Q4_K_M",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 99,
"kvCacheDtype": "bf16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m gary-qwen36-27b-v4-Q4_K_M.gguf -ngl 99 -c 73728 -np 1 --cache-type-k bf16 --cache-type-v bf16 --jinja --flash-attn on --batch-size 2048 --ubatch-size 512 --threads 8 --threads-batch 16 --temp 0.1 --reasoning-format deepseek --reasoning-budget 1024 --no-context-shift --ctx-checkpoints 0 --checkpoint-every-n-tokens -1"
},
"user": {
"id": "cmoequ7db0000lb04vgu9qwft",
"name": "Tim",
"username": "SMOK33Y3",
"verified": false,
"verifiedAt": null
},
"rank": 39,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofbffzj000ujx04vi0k6a8v",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 8192,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 26.82,
"tokSTotal": 1411.78,
"peakVramGb": null,
"createdAt": "2026-04-26T05:16:15.151Z",
"notes": "2x RTX 3090 NVLink, all layers on GPU, flash attention, Q4_0 KV cache. llama-bench pp512 tg128.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8934",
"quantization": "Q8_0",
"backend": null
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": ""
},
"user": {
"id": "cmofar9nt000il204u66pwhcx",
"name": "jalopy1",
"username": null,
"verified": false,
"verifiedAt": null
},
"rank": 40,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdrmzl000dla042zyn4ihi",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 1416.14,
"tokSOut": 24.16,
"tokSTotal": 47.38,
"peakVramGb": 14.2,
"createdAt": "2026-04-26T06:21:43.330Z",
"notes": "PC dual-RTX-3090 (Ti slot 1 + FE slot 2), llama.cpp dual-card row-split TP=2 (-sm row -ts 1,1). Qwen 3.6 27B DENSE at unsloth UD-Q4_K_XL (17 GB). Single-stream 512/512 vllm bench serve, 20 prompts. KV q8_0. 2.1x faster than M5 same model (24.16 vs 11.46 t/s) \u2014 discrete GPU bandwidth wins on dense.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8870",
"quantization": "UD-Q4_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-27B-UD-Q4_K_XL.gguf -ngl 999 -fa 1 --no-mmap --mlock -c 32768 -sm row -ts 1,1 -ctk q8_0 -ctv q8_0 -b 4096 -ub 4096 --parallel 1"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 41,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofe2x2b0002jv04nd165jxz",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 1531.55,
"tokSOut": 22.23,
"tokSTotal": 43.4,
"peakVramGb": 17.9,
"createdAt": "2026-04-26T06:30:29.603Z",
"notes": "PC dual-RTX-3090 (Ti+FE), llama.cpp -sm row TP=2. Qwen 3.6 27B DENSE at unsloth UD-Q6_K_XL (24 GB). Single-stream 512/512 vllm bench serve, 20 prompts. KV q8_0. PC dense 27B quant ladder via llama.cpp: Q4 24.16 / Q6 22.23 t/s.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8870",
"quantization": "UD-Q6_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-27B-UD-Q6_K_XL.gguf -ngl 999 -fa 1 --no-mmap --mlock -c 32768 -sm row -ts 1,1 -ctk q8_0 -ctv q8_0 -b 4096 -ub 4096 --parallel 1"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 42,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofefde70002ie04ag5nc5pd",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 1658.53,
"tokSOut": 19.22,
"tokSTotal": 38.18,
"peakVramGb": 22.1,
"createdAt": "2026-04-26T06:40:10.640Z",
"notes": "PC dual-RTX-3090 (Ti+FE), llama.cpp -sm row TP=2. Qwen 3.6 27B DENSE at unsloth UD-Q8_K_XL (33 GB). 512/512 single-stream, 20 prompts. KV q8_0. Full PC dense 27B llama.cpp ladder: Q4 24.16 / Q6 22.23 / Q8 19.22 t/s. For comparison, vLLM FP8 same model: 44.17 t/s \u2014 vLLM kernel + FP8 ~2x faster than llama.cpp Q8.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8870",
"quantization": "UD-Q8_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m Qwen3.6-27B-UD-Q8_K_XL.gguf -ngl 999 -fa 1 --no-mmap --mlock -c 32768 -sm row -ts 1,1 -ctk q8_0 -ctv q8_0 -b 4096 -ub 4096 --parallel 1"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 43,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofeyn1i000gl204hiwvdqj0",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 32768,
"batchSize": 1,
"ttftMs": 2263.37,
"tokSOut": 14.38,
"tokSTotal": 33.5,
"peakVramGb": 20.7,
"createdAt": "2026-04-26T06:55:09.606Z",
"notes": "PC dual-RTX-3090 (Ti+FE), llama.cpp -sm row TP=2. Gemma 4 31B IT at unsloth UD-Q6_K_XL (26 GB). 512/512 single-stream, 20 prompts. KV q8_0. Bench used Qwen tokenizer for synthetic prompts (Gemma tokenizer is gated) \u2014 output token count is server-reported, throughput accurate; TTFT slightly inflated due to vocab mismatch. Gemma 4 dense ~25% slower than Qwen 3.6 27B dense Q6 on same rig (14.38 vs 22.23 t/s).",
"model": {
"hfId": "unsloth/gemma-4-31B-it-GGUF",
"displayName": "gemma-4-31B-it-GGUF",
"family": "Gemma",
"params": 31,
"isMoE": false,
"baseModel": {
"hfId": "google/gemma-4-31B-it",
"displayName": "gemma-4-31B-it"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RTX 3090",
"gpuCount": 2,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": "AMD EPYC 7543",
"ramGb": 128,
"os": "Ubuntu 24.04"
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8870",
"quantization": "UD-Q6_K_XL",
"backend": "cuda"
},
"engineFlags": {
"tensorParallel": 2,
"gpuLayers": 999,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "llama-server -m gemma-4-31B-it-UD-Q6_K_XL.gguf -ngl 999 -fa 1 --no-mmap --mlock -c 32768 -sm row -ts 1,1 -ctk q8_0 -ctv q8_0 -b 4096 -ub 4096 --parallel 1"
},
"user": {
"id": "cmof7gocp0015gv041vcnngqj",
"name": "Skiipy",
"username": "Skiipy",
"verified": false,
"verifiedAt": null
},
"rank": 44,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 44
},
"RTX 3080 (10 GB)": {
"rows": [],
"total": 0
},
"RTX 3060 (12 GB)": {
"rows": [],
"total": 0
},
"A100 (80 GB)": {
"rows": [],
"total": 0
},
"A100 (40 GB)": {
"rows": [],
"total": 0
},
"H100 (80 GB)": {
"rows": [],
"total": 0
},
"L40S (48 GB)": {
"rows": [],
"total": 0
},
"T4 (16 GB)": {
"rows": [],
"total": 0
},
"RX 7900 XTX (24 GB)": {
"rows": [
{
"id": "cmofzk74w0002ij042txvvkx5",
"modelRevision": "main",
"promptTokens": 27,
"outputTokens": 184,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.53,
"tokSOut": 575.25,
"tokSTotal": null,
"peakVramGb": 8.07421875,
"createdAt": "2026-04-26T16:31:47.744Z",
"notes": "hipfire @ e659452 (master post-PR #51 + #52 series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n chatml-wrapped + explicit empty <think></think> for thinking-off\n prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 571.2\u2013576.0\n per-run tok/s: [571.23, 575.25, 576.04]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 13.077\naccept_rate (median): 0.872\nprefill: 31.1ms (868.3 tok/s)\nttft (excl warmup): 75.5ms = prefill + first cycle\nvram: 8268 MB used / 24560 MB total\nnatural EOS at 184 tokens \u2014 production-shape bounded code (no loop)",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+e659452",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdp1ve0003i904tw2yf6bs",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 336.98,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:19:42.650Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 7.5\naccept_rate (median): 0.5\nruns all: [336.98, 343.32, 314.93]",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkl1gm0006l4045txzjtoc",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 322.6,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:32:32.806Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 322.5\u2013346.0\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 7.067\naccept_rate (median): 0.471\nall runs tok/s: [346.0, 322.47, 322.6]",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofyatrj0002jv04r8064iqg",
"modelRevision": "main",
"promptTokens": 27,
"outputTokens": 157,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 154.7,
"tokSOut": 250.25,
"tokSTotal": null,
"peakVramGb": 18.2890625,
"createdAt": "2026-04-26T15:56:30.896Z",
"notes": "hipfire @ 3945bb2 (master post-PR #51 loop-break + ngram_block series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n chatml-wrapped + explicit empty <think></think> for thinking-off\n prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 249.6\u2013251.0\n per-run tok/s: [249.56, 250.25, 251.02]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 13.182\naccept_rate (median): 0.879\nprefill: 81.3ms (332.0 tok/s)\nttft (excl warmup): 154.7ms = prefill + first cycle\nvram: 18728 MB used / 24560 MB total\nnatural EOS at 157 tokens \u2014 production-shape bounded code (no loop)",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+3945bb2",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 4,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkrpp0000hic04ymyzumfg",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 201.11,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:37:44.148Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 201.0\u2013201.1\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 10.545\naccept_rate (median): 0.703\nall runs tok/s: [201.04, 201.11, 201.11]",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 5,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofe2efg000kla04k5h7rto0",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 181.96,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:30:05.453Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 9.417\naccept_rate (median): 0.628\nruns all: [171.34, 181.96, 200.56]",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 6,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkye0g0009l404ap3uadyl",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 154.62,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:42:55.600Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 66.4\u2013155.9\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.611\naccept_rate (median): 0.374\nall runs tok/s: [66.38, 154.62, 155.92]",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-35b-a3b.mq4 --draft qwen35-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 7,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofefr0j000ula04chaak8ou",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 140.59,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:40:28.291Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 4.905\naccept_rate (median): 0.327\nruns all: [74.91, 160.26, 140.59]",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-35b-a3b.mq4 --draft qwen35-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 8,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofe92tt000pla04ivnjv4jd",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.4,
"tokSOut": 135.9,
"tokSTotal": 125.9,
"peakVramGb": 22.12890625,
"createdAt": "2026-04-26T06:35:17.010Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 326.1 tok/s\npp512: 322.5 tok/s\npp1024: 320.8 tok/s\npp2048: 317.5 tok/s\nvram_loaded: 22660 MB",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 9,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofezrun000cie04z5y0b83e",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.5,
"tokSOut": 135.3,
"tokSTotal": 125.3,
"peakVramGb": 22.12890625,
"createdAt": "2026-04-26T06:56:02.495Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 325.4 tok/s\npp512: 322.0 tok/s\npp1024: 320.4 tok/s\npp2048: 316.8 tok/s\nvram_loaded: 22660 MB",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.6:35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 10,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofgd9iz000ml4055xsdgvuy",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.6,
"tokSOut": 134,
"tokSTotal": 124.2,
"peakVramGb": 22.13,
"createdAt": "2026-04-26T07:34:31.547Z",
"notes": "hipfire @ 0.1.8-alpha+f16eceb (master post-perf-recovery, PR #47)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3 (median reported)\ndecode mode: AR (autoregressive, no spec-decode)\nkv_cache: asym3 (3-bit rotated K + Q8 V; 5.5x vs fp32)\nprompt_normalize: true (default since 2026-04-26)\npp128: 328.1 tok/s\npp512: 324.6 tok/s\npp1024: 323.1 tok/s\npp2048: 318.9 tok/s\nvram_loaded: 22660 MB\nQuantized in-house: hipfire-quantize MQ4 (FWHT-rotated 4-bit, group=256)",
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "unsloth/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench ornstein-3.6-35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 11,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdidib0002ju04tsntnze1",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 48.1,
"tokSOut": 122.3,
"tokSTotal": 116.9,
"peakVramGb": 5.662109375,
"createdAt": "2026-04-26T06:14:31.139Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 602.8 tok/s\npp512: 591.4 tok/s\npp1024: 587.8 tok/s\npp2048: 580.0 tok/s\nvram_loaded: 5798 MB",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:9b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 12,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofet3j80005l2042r92gywi",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 118.16,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:50:51.045Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.632\naccept_rate (median): 0.375\nruns all: [118.19, 118.14, 118.16]",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-27b.mq4 --draft qwen36-27b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 13,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofl528t000nl4045c6nx68j",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 118.14,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:48:06.941Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 117.5\u2013118.4\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.632\naccept_rate (median): 0.375\nall runs tok/s: [118.14, 117.48, 118.44]",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-27b.mq4 --draft qwen36-27b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 14,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoh721ve000kla04zc0ki9gs",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 104.53,
"tokSTotal": null,
"peakVramGb": 15.68,
"createdAt": "2026-04-27T12:49:24.218Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q8_0/q8_0 KV; all-GPU no -ot needed (15.68 GiB fits 24 GB VRAM); gen flat 104-106 t/s tg128-tg2048; b/ub 1024 wins prefill over b/ub 512 (pp4096: 3035 vs 2353); b/ub 2048 catastrophic cliff (pp4096=653, -78%)",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q3_K_XL",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0 -sm row"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 15,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetv7zn0004l704ocb4a949",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 94.5,
"tokSTotal": null,
"peakVramGb": 15.63,
"createdAt": "2026-04-25T21:04:38.195Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; FA=0 required (Gemma-4 head_dim=512 exceeds FA2 limit; FA=1 degrades gen 50-70% on ROCm); f16/f16 KV mandatory with FA=0",
"model": {
"hfId": "Jackrong/Gemopus-4-26B-A4B-it-GGUF",
"displayName": "Gemopus-4-26B-A4B-it-GGUF",
"family": "Gemma",
"params": 26,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": false,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k f16 --cache-type-v f16 --no-flash-attn"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 16,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeu1wus0003l60472784yfr",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 94.3,
"tokSTotal": null,
"peakVramGb": 15.25,
"createdAt": "2026-04-25T21:09:50.356Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; FA=0 required on ROCm; f16/f16 KV only (quantized V requires FA); UD unsloth dynamic quant",
"model": {
"hfId": "unsloth/gemma-4-26B-A4B-it-GGUF",
"displayName": "gemma-4-26B-A4B-it-GGUF",
"family": "Gemma",
"params": 26,
"isMoE": false,
"baseModel": {
"hfId": "google/gemma-4-26B-A4B-it",
"displayName": "gemma-4-26B-A4B-it"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": false,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k f16 --cache-type-v f16 --no-flash-attn"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 17,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetb4n1000jjz04aol3g3h5",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 93.2,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T20:49:00.733Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 4096; f16/f16 KV cache",
"model": {
"hfId": "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-GGUF",
"displayName": "Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-GGUF",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 4096 -ub 4096 --cache-type-k f16 --cache-type-v f16"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 18,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoff6g560019la04hewfjrll",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 68.6,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T07:01:13.915Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 1.222\naccept_rate (median): 0.081\nruns all: [48.08, 68.6, 68.65]",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-35b-a3b.mq4 --draft qwen36-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 19,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeum17e0003jo04iijt0vvl",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 62.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:25:29.115Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q8_0/q8_0 KV; MoE A3B; gen perfectly flat (Qwen3.6 DeltaNet arch); Q4_K_S beats Q4_K_M on all metrics",
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B-GGUF",
"displayName": "Ornstein3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 20,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeusq0f0007l804y0oofytc",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 57.6,
"tokSTotal": null,
"peakVramGb": 19.45,
"createdAt": "2026-04-25T21:30:41.200Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q4_0/q4_0 KV; -ot cutoff=33 (layers 0-32 GPU / 33-39 CPU) gives 3x better prefill vs baseline; UD unsloth dynamic quant; MoE A3B",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 33,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q4_0 --cache-type-v q4_0 -sm row -ot \"blk\\.(3[3-9])\\.ffn_.*_exps=CPU\""
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 21,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeufbnr000ql604wctafc0s",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 1024,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 47.8,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:20:16.072Z",
"notes": "llama-bench on ROCm (hip-radeon); tg512 metric; b/ub 1024; q4_0/q4_0 KV; AWQ 4-bit re-quanted to Q4_K_M GGUF; MoE A3B",
"model": {
"hfId": "cyberjuju/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit-Q4_K_M-GGUF",
"displayName": "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit-Q4_K_M-GGUF",
"family": "Qwen",
"params": 30,
"isMoE": true,
"baseModel": {
"hfId": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit",
"displayName": "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 22,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeu8m92000fl604nt2wtvkj",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 1024,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 46.9,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:15:03.206Z",
"notes": "llama-bench on ROCm (hip-radeon); tg512 metric; b/ub 1024; q8_0/q8_0 KV; MoE A3B (30B total / 3B active)",
"model": {
"hfId": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-GGUF",
"displayName": "Qwen3-Coder-30B-A3B-Instruct-GGUF",
"family": "Qwen",
"params": 30,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
"displayName": "Qwen3-Coder-30B-A3B-Instruct"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 23,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdvq7c0001l40461v8g3z7",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 117.8,
"tokSOut": 43.7,
"tokSTotal": 42,
"peakVramGb": 15.10546875,
"createdAt": "2026-04-26T06:24:54.120Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 223.5 tok/s\npp512: 215.7 tok/s\npp1024: 213.3 tok/s\npp2048: 210.9 tok/s\nvram_loaded: 15468 MB",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:27b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 24,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofemf840014la04lhmues5d",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 117.6,
"tokSOut": 43.6,
"tokSTotal": 41.9,
"peakVramGb": 15.10546875,
"createdAt": "2026-04-26T06:45:39.604Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 224.2 tok/s\npp512: 216.5 tok/s\npp1024: 214.3 tok/s\npp2048: 211.1 tok/s\nvram_loaded: 15468 MB",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.6:27b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 25,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogm5x010002jv040zuze5n3",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 33.31,
"tokSTotal": null,
"peakVramGb": 13.47,
"createdAt": "2026-04-27T03:04:32.594Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b=1024/ub=512; q8_0/q8_0 KV; gen perfectly flat tg128-tg2048 (33.31->33.08 t/s, -0.7%); DeltaNet hybrid arch; b=1024/ub=512 beats b=2048/ub=512 on prefill (951 vs 943 pp512)",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q3_K_XL",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 512 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 26,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoethtq9000pjz045yguc0ld",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 28.6,
"tokSTotal": null,
"peakVramGb": 14.51,
"createdAt": "2026-04-25T20:54:13.185Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; q4_0/q4_0 KV; gen perfectly flat tg128-tg2048",
"model": {
"hfId": "Jackrong/Qwopus3.5-27B-v3-GGUF",
"displayName": "Qwopus3.5-27B-v3-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "unsloth/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 27,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetoim3000sjo04hd67abew",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 28.5,
"tokSTotal": null,
"peakVramGb": 14.76,
"createdAt": "2026-04-25T20:59:25.371Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b=1024/ub=512; q4_0/q4_0 KV; GatedDeltaNet+GQA hybrid arch; gen perfectly flat",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 512 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 28,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 28
},
"RX 7900 XT (20 GB)": {
"rows": [
{
"id": "cmofzk74w0002ij042txvvkx5",
"modelRevision": "main",
"promptTokens": 27,
"outputTokens": 184,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.53,
"tokSOut": 575.25,
"tokSTotal": null,
"peakVramGb": 8.07421875,
"createdAt": "2026-04-26T16:31:47.744Z",
"notes": "hipfire @ e659452 (master post-PR #51 + #52 series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n chatml-wrapped + explicit empty <think></think> for thinking-off\n prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 571.2\u2013576.0\n per-run tok/s: [571.23, 575.25, 576.04]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 13.077\naccept_rate (median): 0.872\nprefill: 31.1ms (868.3 tok/s)\nttft (excl warmup): 75.5ms = prefill + first cycle\nvram: 8268 MB used / 24560 MB total\nnatural EOS at 184 tokens \u2014 production-shape bounded code (no loop)",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+e659452",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdp1ve0003i904tw2yf6bs",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 336.98,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:19:42.650Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 7.5\naccept_rate (median): 0.5\nruns all: [336.98, 343.32, 314.93]",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkl1gm0006l4045txzjtoc",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 322.6,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:32:32.806Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 322.5\u2013346.0\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 7.067\naccept_rate (median): 0.471\nall runs tok/s: [346.0, 322.47, 322.6]",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-9b.mq4 --draft qwen35-9b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofyatrj0002jv04r8064iqg",
"modelRevision": "main",
"promptTokens": 27,
"outputTokens": 157,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 154.7,
"tokSOut": 250.25,
"tokSTotal": null,
"peakVramGb": 18.2890625,
"createdAt": "2026-04-26T15:56:30.896Z",
"notes": "hipfire @ 3945bb2 (master post-PR #51 loop-break + ngram_block series)\nprompt: merge_sort thinking-OFF (md5=253c7ac50857fe6d0e10fb0d2c5e35c0)\n chatml-wrapped + explicit empty <think></think> for thinking-off\n prompt file: benchmarks/prompts/merge_sort_thinking_off.txt\nruns: 3 (median reported); range 249.6\u2013251.0\n per-run tok/s: [249.56, 250.25, 251.02]\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 13.182\naccept_rate (median): 0.879\nprefill: 81.3ms (332.0 tok/s)\nttft (excl warmup): 154.7ms = prefill + first cycle\nvram: 18728 MB used / 24560 MB total\nnatural EOS at 157 tokens \u2014 production-shape bounded code (no loop)",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+3945bb2",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/merge_sort_thinking_off.txt) --max 256 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 4,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkrpp0000hic04ymyzumfg",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 201.11,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:37:44.148Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 201.0\u2013201.1\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 10.545\naccept_rate (median): 0.703\nall runs tok/s: [201.04, 201.11, 201.11]",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 5,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofe2efg000kla04k5h7rto0",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 181.96,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:30:05.453Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 9.417\naccept_rate (median): 0.628\nruns all: [171.34, 181.96, 200.56]",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-27b.mq4 --draft qwen35-27b-dflash.mq4 --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 6,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofkye0g0009l404ap3uadyl",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 154.62,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:42:55.600Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 66.4\u2013155.9\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.611\naccept_rate (median): 0.374\nall runs tok/s: [66.38, 154.62, 155.92]",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-35b-a3b.mq4 --draft qwen35-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 7,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofefr0j000ula04chaak8ou",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 140.59,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:40:28.291Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 4.905\naccept_rate (median): 0.327\nruns all: [74.91, 160.26, 140.59]",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.5-35b-a3b.mq4 --draft qwen35-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 8,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofe92tt000pla04ivnjv4jd",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.4,
"tokSOut": 135.9,
"tokSTotal": 125.9,
"peakVramGb": 22.12890625,
"createdAt": "2026-04-26T06:35:17.010Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 326.1 tok/s\npp512: 322.5 tok/s\npp1024: 320.8 tok/s\npp2048: 317.5 tok/s\nvram_loaded: 22660 MB",
"model": {
"hfId": "Qwen/Qwen3.5-35B-A3B",
"displayName": "Qwen3.5-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.5-35B-A3B-Base",
"displayName": "Qwen3.5-35B-A3B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 9,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofezrun000cie04z5y0b83e",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.5,
"tokSOut": 135.3,
"tokSTotal": 125.3,
"peakVramGb": 22.12890625,
"createdAt": "2026-04-26T06:56:02.495Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 325.4 tok/s\npp512: 322.0 tok/s\npp1024: 320.4 tok/s\npp2048: 316.8 tok/s\nvram_loaded: 22660 MB",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.6:35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 10,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofgd9iz000ml4055xsdgvuy",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 75.6,
"tokSOut": 134,
"tokSTotal": 124.2,
"peakVramGb": 22.13,
"createdAt": "2026-04-26T07:34:31.547Z",
"notes": "hipfire @ 0.1.8-alpha+f16eceb (master post-perf-recovery, PR #47)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3 (median reported)\ndecode mode: AR (autoregressive, no spec-decode)\nkv_cache: asym3 (3-bit rotated K + Q8 V; 5.5x vs fp32)\nprompt_normalize: true (default since 2026-04-26)\npp128: 328.1 tok/s\npp512: 324.6 tok/s\npp1024: 323.1 tok/s\npp2048: 318.9 tok/s\nvram_loaded: 22660 MB\nQuantized in-house: hipfire-quantize MQ4 (FWHT-rotated 4-bit, group=256)",
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "unsloth/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench ornstein-3.6-35b-a3b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 11,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdidib0002ju04tsntnze1",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 48.1,
"tokSOut": 122.3,
"tokSTotal": 116.9,
"peakVramGb": 5.662109375,
"createdAt": "2026-04-26T06:14:31.139Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 602.8 tok/s\npp512: 591.4 tok/s\npp1024: 587.8 tok/s\npp2048: 580.0 tok/s\nvram_loaded: 5798 MB",
"model": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B-Base",
"displayName": "Qwen3.5-9B-Base"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:9b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 12,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofet3j80005l2042r92gywi",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 118.16,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T06:50:51.045Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.632\naccept_rate (median): 0.375\nruns all: [118.19, 118.14, 118.16]",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-27b.mq4 --draft qwen36-27b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 13,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofl528t000nl4045c6nx68j",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 118.14,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T09:48:06.941Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported); range 117.5\u2013118.4\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 5.632\naccept_rate (median): 0.375\nall runs tok/s: [118.14, 117.48, 118.44]",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-27b.mq4 --draft qwen36-27b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 14,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoh721ve000kla04zc0ki9gs",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 104.53,
"tokSTotal": null,
"peakVramGb": 15.68,
"createdAt": "2026-04-27T12:49:24.218Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q8_0/q8_0 KV; all-GPU no -ot needed (15.68 GiB fits 24 GB VRAM); gen flat 104-106 t/s tg128-tg2048; b/ub 1024 wins prefill over b/ub 512 (pp4096: 3035 vs 2353); b/ub 2048 catastrophic cliff (pp4096=653, -78%)",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q3_K_XL",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0 -sm row"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 15,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetv7zn0004l704ocb4a949",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 94.5,
"tokSTotal": null,
"peakVramGb": 15.63,
"createdAt": "2026-04-25T21:04:38.195Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; FA=0 required (Gemma-4 head_dim=512 exceeds FA2 limit; FA=1 degrades gen 50-70% on ROCm); f16/f16 KV mandatory with FA=0",
"model": {
"hfId": "Jackrong/Gemopus-4-26B-A4B-it-GGUF",
"displayName": "Gemopus-4-26B-A4B-it-GGUF",
"family": "Gemma",
"params": 26,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": false,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k f16 --cache-type-v f16 --no-flash-attn"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 16,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeu1wus0003l60472784yfr",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 94.3,
"tokSTotal": null,
"peakVramGb": 15.25,
"createdAt": "2026-04-25T21:09:50.356Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; FA=0 required on ROCm; f16/f16 KV only (quantized V requires FA); UD unsloth dynamic quant",
"model": {
"hfId": "unsloth/gemma-4-26B-A4B-it-GGUF",
"displayName": "gemma-4-26B-A4B-it-GGUF",
"family": "Gemma",
"params": 26,
"isMoE": false,
"baseModel": {
"hfId": "google/gemma-4-26B-A4B-it",
"displayName": "gemma-4-26B-A4B-it"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": false,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k f16 --cache-type-v f16 --no-flash-attn"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 17,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetb4n1000jjz04aol3g3h5",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 93.2,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T20:49:00.733Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 4096; f16/f16 KV cache",
"model": {
"hfId": "Jackrong/Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-GGUF",
"displayName": "Qwen3.5-9B-Claude-4.6-Opus-Reasoning-Distilled-GGUF",
"family": "Qwen",
"params": 9,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.5-9B",
"displayName": "Qwen3.5-9B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "f16",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 4096 -ub 4096 --cache-type-k f16 --cache-type-v f16"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 18,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoff6g560019la04hewfjrll",
"modelRevision": "main",
"promptTokens": 220,
"outputTokens": 120,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 68.6,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-26T07:01:13.915Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery, PR #47)\nprompt: LRU cache PEP-8 strict (md5=df5dedc8040c, see benchmarks/prompts/lru_cache_pep8_strict.txt)\nruns: 3 (median reported)\ndecode mode: DFlash speculative (block_size=16)\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\n\u03c4 (median): 1.222\naccept_rate (median): 0.081\nruns all: [48.08, 68.6, 68.65]",
"model": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": true,
"mtpEnabled": false,
"commandSnippet": "./target/release/examples/dflash_spec_demo --target qwen3.6-35b-a3b.mq4 --draft qwen36-35b-a3b-dflash-mq4.hfq --prompt $(cat benchmarks/prompts/lru_cache_pep8_strict.txt) --max 120 --no-chatml --kv-mode asym3"
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 19,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeum17e0003jo04iijt0vvl",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 62.4,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:25:29.115Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q8_0/q8_0 KV; MoE A3B; gen perfectly flat (Qwen3.6 DeltaNet arch); Q4_K_S beats Q4_K_M on all metrics",
"model": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B-GGUF",
"displayName": "Ornstein3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "DJLougen/Ornstein3.6-35B-A3B",
"displayName": "Ornstein3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 20,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeusq0f0007l804y0oofytc",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 57.6,
"tokSTotal": null,
"peakVramGb": 19.45,
"createdAt": "2026-04-25T21:30:41.200Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 1024; q4_0/q4_0 KV; -ot cutoff=33 (layers 0-32 GPU / 33-39 CPU) gives 3x better prefill vs baseline; UD unsloth dynamic quant; MoE A3B",
"model": {
"hfId": "unsloth/Qwen3.6-35B-A3B-GGUF",
"displayName": "Qwen3.6-35B-A3B-GGUF",
"family": "Qwen",
"params": 35,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": 33,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q4_0 --cache-type-v q4_0 -sm row -ot \"blk\\.(3[3-9])\\.ffn_.*_exps=CPU\""
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 21,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeufbnr000ql604wctafc0s",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 1024,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 47.8,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:20:16.072Z",
"notes": "llama-bench on ROCm (hip-radeon); tg512 metric; b/ub 1024; q4_0/q4_0 KV; AWQ 4-bit re-quanted to Q4_K_M GGUF; MoE A3B",
"model": {
"hfId": "cyberjuju/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit-Q4_K_M-GGUF",
"displayName": "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit-Q4_K_M-GGUF",
"family": "Qwen",
"params": 30,
"isMoE": false,
"baseModel": {
"hfId": "cyankiwi/Qwen3-30B-A3B-Instruct-2507-AWQ-4bit",
"displayName": "Qwen3-30B-A3B-Instruct-2507-AWQ-4bit"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 22,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoeu8m92000fl604nt2wtvkj",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 512,
"contextLength": 1024,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 46.9,
"tokSTotal": null,
"peakVramGb": null,
"createdAt": "2026-04-25T21:15:03.206Z",
"notes": "llama-bench on ROCm (hip-radeon); tg512 metric; b/ub 1024; q8_0/q8_0 KV; MoE A3B (30B total / 3B active)",
"model": {
"hfId": "lmstudio-community/Qwen3-Coder-30B-A3B-Instruct-GGUF",
"displayName": "Qwen3-Coder-30B-A3B-Instruct-GGUF",
"family": "Qwen",
"params": 30,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
"displayName": "Qwen3-Coder-30B-A3B-Instruct"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8639",
"quantization": "Q4_K_M",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 1024 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 23,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofdvq7c0001l40461v8g3z7",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 117.8,
"tokSOut": 43.7,
"tokSTotal": 42,
"peakVramGb": 15.10546875,
"createdAt": "2026-04-26T06:24:54.120Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 223.5 tok/s\npp512: 215.7 tok/s\npp1024: 213.3 tok/s\npp2048: 210.9 tok/s\nvram_loaded: 15468 MB",
"model": {
"hfId": "Qwen/Qwen3.5-27B",
"displayName": "Qwen3.5-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.5:27b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 24,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofemf840014la04lhmues5d",
"modelRevision": "main",
"promptTokens": 20,
"outputTokens": 128,
"contextLength": 4096,
"batchSize": 1,
"ttftMs": 117.6,
"tokSOut": 43.6,
"tokSTotal": 41.9,
"peakVramGb": 15.10546875,
"createdAt": "2026-04-26T06:45:39.604Z",
"notes": "hipfire @ f16eceb (master post-perf-recovery)\nprompt: \"Explain the theory of general relativity in simple terms.\"\nruns: 3\ndecode mode: AR\nkv_cache: asym3\nprompt_normalize: true (default since 2026-04-26)\npp128: 224.2 tok/s\npp512: 216.5 tok/s\npp1024: 214.3 tok/s\npp2048: 211.1 tok/s\nvram_loaded: 15468 MB",
"model": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": null
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "hipfire",
"engineVersion": "0.1.8-alpha+f16eceb",
"quantization": "MQ4",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "hipfire bench qwen3.6:27b --runs 3 \"Explain the theory of general relativity in simple terms.\""
},
"user": {
"id": "cmoeye1gq0000le04ie2kqs58",
"name": "Kaden Schutt",
"username": "schuttdev",
"verified": false,
"verifiedAt": null
},
"rank": 25,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmogm5x010002jv040zuze5n3",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 33.31,
"tokSTotal": null,
"peakVramGb": 13.47,
"createdAt": "2026-04-27T03:04:32.594Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b=1024/ub=512; q8_0/q8_0 KV; gen perfectly flat tg128-tg2048 (33.31->33.08 t/s, -0.7%); DeltaNet hybrid arch; b=1024/ub=512 beats b=2048/ub=512 on prefill (951 vs 943 pp512)",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q3_K_XL",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q8_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 512 --cache-type-k q8_0 --cache-type-v q8_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 26,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoethtq9000pjz045yguc0ld",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 28.6,
"tokSTotal": null,
"peakVramGb": 14.51,
"createdAt": "2026-04-25T20:54:13.185Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b/ub 512; q4_0/q4_0 KV; gen perfectly flat tg128-tg2048",
"model": {
"hfId": "Jackrong/Qwopus3.5-27B-v3-GGUF",
"displayName": "Qwopus3.5-27B-v3-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "unsloth/Qwen3.5-27B",
"displayName": "Qwen3.5-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 512 -ub 512 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 27,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmoetoim3000sjo04hd67abew",
"modelRevision": "main",
"promptTokens": 512,
"outputTokens": 128,
"contextLength": 640,
"batchSize": 1,
"ttftMs": null,
"tokSOut": 28.5,
"tokSTotal": null,
"peakVramGb": 14.76,
"createdAt": "2026-04-25T20:59:25.371Z",
"notes": "llama-bench on ROCm (hip-radeon); tg128 metric; b=1024/ub=512; q4_0/q4_0 KV; GatedDeltaNet+GQA hybrid arch; gen perfectly flat",
"model": {
"hfId": "unsloth/Qwen3.6-27B-GGUF",
"displayName": "Qwen3.6-27B-GGUF",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "DISCRETE_GPU",
"gpuName": "AMD Radeon RX 7900 XTX",
"gpuCount": 1,
"vramGb": 24,
"chipVendor": null,
"chipFamily": null,
"chipVariant": null,
"unifiedMemoryGb": null,
"cpu": null,
"ramGb": null,
"os": null
},
"engine": {
"engineName": "llama.cpp",
"engineVersion": "b8778",
"quantization": "Q4_K_S",
"backend": "rocm"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": "q4_0",
"attentionBackend": null,
"flashAttn": true,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "-b 1024 -ub 512 --cache-type-k q4_0 --cache-type-v q4_0"
},
"user": {
"id": "cmoer1mdc0000le044537gzwp",
"name": "GMAzrael",
"username": "itgoatee",
"verified": false,
"verifiedAt": null
},
"rank": 28,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 28
},
"MI300X (192 GB)": {
"rows": [],
"total": 0
},
"Apple M4 Max (128 GB)": {
"rows": [],
"total": 0
},
"Apple M4 Max (64 GB)": {
"rows": [
{
"id": "cmoh6rfig000mju04hf34f6re",
"modelRevision": "3f8ff65551b3bfda2d8ccfadc091bab9ce066905",
"promptTokens": 0,
"outputTokens": 512,
"contextLength": 262144,
"batchSize": 1,
"ttftMs": 105.75,
"tokSOut": 83.353,
"tokSTotal": null,
"peakVramGb": 22,
"createdAt": "2026-04-27T12:41:08.681Z",
"notes": "Automated local benchmark on Mac Studio. Runner: Node v25.9.0 + @mlx-node/lm 0.0.7. Command was run from ~/mlx-node-qwen36-test. Submitted tokSOut/ttftMs use warm runs 2-3 average; run 1 is included below as post-load warmup. Raw output: run=1 ttft_ms=1572.6 elapsed_s=7.419 tokens=509 tok_s=68.608; run=2 ttft_ms=118.8 elapsed_s=6.046 tokens=512 tok_s=84.689; run=3 ttft_ms=92.7 elapsed_s=6.243 tokens=512 tok_s=82.017. Smoke run output quality check passed. Memory measurement from held Node process RSS: before_load=62.6 MiB, after_load=22498.5 MiB, after_generate=21825.8 MiB. Peak memory field is process RSS approximation for unified memory, not vendor VRAM telemetry. Prompt token count was not reported by the runner, so promptTokens is set to 0.",
"model": {
"hfId": "Brooooooklyn/Qwen3.6-35B-A3B-UD-Q8_K_XL-mlx",
"displayName": "Qwen3.6-35B-A3B-UD-Q8_K_XL-mlx",
"family": "Qwen",
"params": 35,
"isMoE": true,
"baseModel": {
"hfId": "Qwen/Qwen3.6-35B-A3B",
"displayName": "Qwen3.6-35B-A3B"
}
},
"hardware": {
"hwClass": "UNIFIED",
"gpuName": null,
"gpuCount": 1,
"vramGb": null,
"chipVendor": "Apple",
"chipFamily": "M4",
"chipVariant": "M4 Max",
"unifiedMemoryGb": 64,
"cpu": "Apple M4 Max 16-core CPU / 40-core GPU",
"ramGb": null,
"os": "macOS 26.4 build 25E246"
},
"engine": {
"engineName": "mlx",
"engineVersion": "@mlx-node/lm 0.0.7",
"quantization": "Q8_K_XL",
"backend": "metal"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "cd ~/mlx-node-qwen36-test && MAX_NEW_TOKENS=512 RUNS=3 npm run bench"
},
"user": {
"id": "cmoffffz20001l70489iwo6hj",
"name": "shikharpant",
"username": "shikhar",
"verified": false,
"verifiedAt": null
},
"rank": 1,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofi32rs000rjx04be81ns5r",
"modelRevision": "main",
"promptTokens": 128,
"outputTokens": 6000,
"contextLength": 6128,
"batchSize": 1,
"ttftMs": 676,
"tokSOut": 18.435,
"tokSTotal": 18.786,
"peakVramGb": 25.895,
"createdAt": "2026-04-26T08:22:35.465Z",
"notes": "MLX benchmark Trial 1. TTFT is estimated from prompt_tps and generation_tps, not directly reported by mlx_lm.benchmark. Model config uses 4-bit affine group size 64 with selected 8-bit tensors.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-UD-MLX-4bit",
"displayName": "Qwen3.6-27B-UD-MLX-4bit",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "UNIFIED",
"gpuName": null,
"gpuCount": 1,
"vramGb": null,
"chipVendor": "Apple",
"chipFamily": "M4",
"chipVariant": "M4 Max",
"unifiedMemoryGb": 64,
"cpu": "Apple M4 Max 16-core CPU / 40-core GPU",
"ramGb": null,
"os": "macOS 26.4 build 25E246"
},
"engine": {
"engineName": "mlx",
"engineVersion": "mlx-lm 0.31.3 / mlx 0.31.2 / mlx-metal 0.31.2",
"quantization": "int4",
"backend": "metal"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "mlx_lm.benchmark --model unsloth/Qwen3.6-27B-UD-MLX-4bit --prompt-tokens 128 --generation-tokens 6000"
},
"user": {
"id": "cmoffffz20001l70489iwo6hj",
"name": "shikharpant",
"username": "shikhar",
"verified": false,
"verifiedAt": null
},
"rank": 2,
"reactionCounts": {},
"myEmoji": null
},
{
"id": "cmofia1vo0009l7048w39brkk",
"modelRevision": "main",
"promptTokens": 128,
"outputTokens": 6000,
"contextLength": 6128,
"batchSize": 1,
"ttftMs": 676,
"tokSOut": 18.434,
"tokSTotal": 18.786,
"peakVramGb": 25.896,
"createdAt": "2026-04-26T08:28:00.900Z",
"notes": "MLX benchmark 5-trial average. Trials: generation_tps 18.435, 18.433, 18.437, 18.421, 18.445; prompt_tps avg 205.970; generation_tps avg 18.434; peak_memory avg 25.896 GB. TTFT is estimated from avg prompt_tps and generation_tps because mlx_lm.benchmark does not directly report TTFT. Model config uses 4-bit affine group size 64 with selected 8-bit tensors.",
"model": {
"hfId": "unsloth/Qwen3.6-27B-UD-MLX-4bit",
"displayName": "Qwen3.6-27B-UD-MLX-4bit",
"family": "Qwen",
"params": 27,
"isMoE": false,
"baseModel": {
"hfId": "Qwen/Qwen3.6-27B",
"displayName": "Qwen3.6-27B"
}
},
"hardware": {
"hwClass": "UNIFIED",
"gpuName": null,
"gpuCount": 1,
"vramGb": null,
"chipVendor": "Apple",
"chipFamily": "M4",
"chipVariant": "M4 Max",
"unifiedMemoryGb": 64,
"cpu": "Apple M4 Max 16-core CPU / 40-core GPU",
"ramGb": null,
"os": "macOS 26.4 build 25E246"
},
"engine": {
"engineName": "mlx",
"engineVersion": "mlx-lm 0.31.3 / mlx 0.31.2 / mlx-metal 0.31.2",
"quantization": "int4",
"backend": "metal"
},
"engineFlags": {
"tensorParallel": null,
"gpuLayers": null,
"kvCacheDtype": null,
"attentionBackend": null,
"flashAttn": null,
"specDecoding": false,
"mtpEnabled": false,
"commandSnippet": "mlx_lm.benchmark --model unsloth/Qwen3.6-27B-UD-MLX-4bit --prompt-tokens 128 --generation-tokens 6000"
},
"user": {
"id": "cmoffffz20001l70489iwo6hj",
"name": "shikharpant",
"username": "shikhar",
"verified": false,
"verifiedAt": null
},
"rank": 3,
"reactionCounts": {},
"myEmoji": null
}
],
"total": 3
},
"Apple M4 Pro (48 GB)": {
"rows": [],
"total": 0
},
"Apple M4 Pro (24 GB)": {
"rows": [],
"total": 0
},
"Apple M3 Max (128 GB)": {
"rows": [],
"total": 0
},
"Apple M3 Max (96 GB)": {
"rows": [],
"total": 0
},
"Apple M2 Ultra (192 GB)": {
"rows": [],
"total": 0
},
"Apple M2 Max (96 GB)": {
"rows": [],
"total": 0
},
"Apple M2 Pro (32 GB)": {
"rows": [],
"total": 0
},
"Apple M1 Max (64 GB)": {
"rows": [],
"total": 0
},
"CPU Only": {
"rows": [],
"total": 0
}
}
}