Skip to content

Commit 873bad5

Browse files
evilsocket and claude
committed
bench(vulkan): add Vulkan-specific benchmarks for Steam Deck optimization
GPU GEMV vs CPU matmul at Qwen3-0.6B model sizes, dispatch overhead, elementwise ops, and full MLP pass benchmarks. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d483cca commit 873bad5

2 files changed

Lines changed: 159 additions & 0 deletions

File tree

cake-core/benches/bench_vulkan.rs

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/// Vulkan backend benchmarks — GPU GEMV vs CPU matmul at model-realistic sizes,
2+
/// dispatch overhead, upload/download costs, and elementwise ops.
3+
///
4+
/// Run on Steam Deck: `cargo bench -p cake-core --features vulkan -- vulkan`
5+
6+
use cake_core::backends::{ComputeBackend, VulkanBackend};
7+
use candle_core::{DType, Device, Tensor};
8+
9+
fn vk() -> VulkanBackend {
10+
VulkanBackend::new().expect("Vulkan backend required for these benchmarks")
11+
}
12+
13+
fn cpu_tensor(shape: &[usize], seed: u64) -> Tensor {
14+
super::bench_helpers::make_tensor(shape, seed)
15+
}
16+
17+
// ── Dispatch overhead ────────────────────────────────────────────────
18+
19+
#[divan::bench]
20+
fn vulkan_dispatch_overhead(bencher: divan::Bencher) {
21+
let backend = vk();
22+
// Tiny tensor to measure fixed dispatch cost (not data transfer)
23+
let a = cpu_tensor(&[1, 1, 16], 1000);
24+
let b = cpu_tensor(&[1, 1, 16], 1001);
25+
bencher.bench_local(|| backend.silu_mul(&a, &b).unwrap());
26+
}
27+
28+
// ── GPU GEMV vs CPU matmul at model sizes ────────────────────────────
29+
// Qwen3-0.6B: hidden=1024, intermediate=3072, head_dim=128
30+
// QKV: (1,1024) × (1024,4096), O: (1,1024) × (1024,1024)
31+
// gate_up: (1,1024) × (1024,6144), down: (1,3072) × (3072,1024)
32+
33+
#[divan::bench(args = [1024, 4096, 6144])]
34+
fn vulkan_gemv_1024xN(bencher: divan::Bencher, n: usize) {
35+
let backend = vk();
36+
let a = cpu_tensor(&[1, 1024], 1100);
37+
let b = cpu_tensor(&[1024, n], 1101);
38+
bencher.bench_local(|| backend.matmul(&a, &b).unwrap());
39+
}
40+
41+
#[divan::bench]
42+
fn vulkan_gemv_3072x1024(bencher: divan::Bencher) {
43+
let backend = vk();
44+
let a = cpu_tensor(&[1, 3072], 1200);
45+
let b = cpu_tensor(&[3072, 1024], 1201);
46+
bencher.bench_local(|| backend.matmul(&a, &b).unwrap());
47+
}
48+
49+
#[divan::bench(args = [1024, 4096, 6144])]
50+
fn cpu_gemv_1024xN(bencher: divan::Bencher, n: usize) {
51+
let a = cpu_tensor(&[1, 1024], 1100);
52+
let b = cpu_tensor(&[1024, n], 1101);
53+
bencher.bench_local(|| a.matmul(&b).unwrap());
54+
}
55+
56+
#[divan::bench]
57+
fn cpu_gemv_3072x1024(bencher: divan::Bencher) {
58+
let a = cpu_tensor(&[1, 3072], 1200);
59+
let b = cpu_tensor(&[3072, 1024], 1201);
60+
bencher.bench_local(|| a.matmul(&b).unwrap());
61+
}
62+
63+
// ── GPU GEMM (prefill) at model sizes ────────────────────────────────
64+
65+
#[divan::bench(args = [8, 32, 64])]
66+
fn vulkan_gemm_Mx1024x4096(bencher: divan::Bencher, m: usize) {
67+
let backend = vk();
68+
let a = cpu_tensor(&[m, 1024], 1300);
69+
let b = cpu_tensor(&[1024, 4096], 1301);
70+
bencher.bench_local(|| backend.matmul(&a, &b).unwrap());
71+
}
72+
73+
#[divan::bench(args = [8, 32, 64])]
74+
fn cpu_gemm_Mx1024x4096(bencher: divan::Bencher, m: usize) {
75+
let a = cpu_tensor(&[m, 1024], 1300);
76+
let b = cpu_tensor(&[1024, 4096], 1301);
77+
bencher.bench_local(|| a.matmul(&b).unwrap());
78+
}
79+
80+
// ── Elementwise ops at model sizes ───────────────────────────────────
81+
82+
#[divan::bench(args = [1024, 3072])]
83+
fn vulkan_silu_mul(bencher: divan::Bencher, size: usize) {
84+
let backend = vk();
85+
let gate = cpu_tensor(&[1, 1, size], 1400);
86+
let up = cpu_tensor(&[1, 1, size], 1401);
87+
bencher.bench_local(|| backend.silu_mul(&gate, &up).unwrap());
88+
}
89+
90+
#[divan::bench(args = [1024, 3072])]
91+
fn cpu_silu_mul(bencher: divan::Bencher, size: usize) {
92+
let gate = cpu_tensor(&[1, 1, size], 1400);
93+
let up = cpu_tensor(&[1, 1, size], 1401);
94+
bencher.bench_local(|| {
95+
(candle_nn::ops::silu(&gate).unwrap() * &up).unwrap()
96+
});
97+
}
98+
99+
#[divan::bench(args = [1024, 3072])]
100+
fn vulkan_add3(bencher: divan::Bencher, size: usize) {
101+
let backend = vk();
102+
let a = cpu_tensor(&[1, 1, size], 1500);
103+
let b = cpu_tensor(&[1, 1, size], 1501);
104+
let c = cpu_tensor(&[1, 1, size], 1502);
105+
bencher.bench_local(|| backend.add3(&a, &b, &c).unwrap());
106+
}
107+
108+
// ── RMS norm (CPU-only in current backend) ───────────────────────────
109+
110+
#[divan::bench(args = [1024, 3072])]
111+
fn vulkan_rms_norm_gated(bencher: divan::Bencher, size: usize) {
112+
let backend = vk();
113+
let x = cpu_tensor(&[1, 1, size], 1600);
114+
let z = cpu_tensor(&[1, 1, size], 1601);
115+
let w = Tensor::ones(size, DType::F32, &Device::Cpu).unwrap();
116+
bencher.bench_local(|| backend.rms_norm_gated(&x, &z, &w, 1e-6).unwrap());
117+
}
118+
119+
#[divan::bench(args = [1024, 3072])]
120+
fn vulkan_add_rms_norm(bencher: divan::Bencher, size: usize) {
121+
let backend = vk();
122+
let a = cpu_tensor(&[1, 1, size], 1700);
123+
let b = cpu_tensor(&[1, 1, size], 1701);
124+
let w = Tensor::ones(size, DType::F32, &Device::Cpu).unwrap();
125+
bencher.bench_local(|| backend.add_rms_norm(&a, &b, &w, 1e-6).unwrap());
126+
}
127+
128+
// ── Full MLP pass (gate_up + silu_mul + down) ────────────────────────
129+
130+
#[divan::bench]
131+
fn vulkan_mlp_full(bencher: divan::Bencher) {
132+
let backend = vk();
133+
let x = cpu_tensor(&[1, 1024], 1800);
134+
let gate_up_w = cpu_tensor(&[6144, 1024], 1801);
135+
let down_w = cpu_tensor(&[1024, 3072], 1802);
136+
bencher.bench_local(|| {
137+
let fused = backend.matmul(&x, &gate_up_w.t().unwrap()).unwrap();
138+
let gate = fused.narrow(1, 0, 3072).unwrap().contiguous().unwrap();
139+
let up = fused.narrow(1, 3072, 3072).unwrap().contiguous().unwrap();
140+
let act = backend.silu_mul(&gate, &up).unwrap();
141+
backend.matmul(&act, &down_w.t().unwrap()).unwrap()
142+
});
143+
}
144+
145+
#[divan::bench]
146+
fn cpu_mlp_full(bencher: divan::Bencher) {
147+
let x = cpu_tensor(&[1, 1024], 1800);
148+
let gate_up_w = cpu_tensor(&[6144, 1024], 1801);
149+
let down_w = cpu_tensor(&[1024, 3072], 1802);
150+
bencher.bench_local(|| {
151+
let fused = x.matmul(&gate_up_w.t().unwrap()).unwrap();
152+
let gate = fused.narrow(1, 0, 3072).unwrap();
153+
let up = fused.narrow(1, 3072, 3072).unwrap();
154+
let act = (candle_nn::ops::silu(&gate).unwrap() * &up).unwrap();
155+
act.matmul(&down_w.t().unwrap()).unwrap()
156+
});
157+
}

cake-core/benches/benchmarks.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,5 @@ mod bench_flux;
2121
mod bench_vibevoice;
2222
#[cfg(feature = "luxtts")]
2323
mod bench_luxtts;
24+
#[cfg(feature = "vulkan")]
25+
mod bench_vulkan;

0 commit comments

Comments
 (0)