grafos_observe/
cache_metrics.rs

//! Cache-specific observability metrics for LLM inference KV caches.
//!
//! Defines the 12 metric names from Section 7.5 of the design document and
//! provides a [`CacheMetrics`] struct that aggregates all cache-related
//! counters, gauges, and histograms. The [`CacheMetrics::global()`] singleton
//! can be used by `KvCacheManager` and other cache infrastructure to emit metrics.
7
8use crate::metrics::{MetricCounter, MetricGauge, MetricHistogram};
9
10// ---------------------------------------------------------------------------
11// Metric name constants
12// ---------------------------------------------------------------------------
13
/// Name of the histogram tracking prefill latency (microseconds).
pub const CACHE_PREFILL_LATENCY_US: &str = "cache/prefill_latency_us";

/// Name of the histogram tracking time to first token (microseconds).
pub const CACHE_FIRST_TOKEN_LATENCY_US: &str = "cache/first_token_latency_us";

/// Name of the histogram tracking steady-state per-token decode latency
/// (microseconds).
pub const CACHE_DECODE_LATENCY_US: &str = "cache/decode_latency_us";

/// Name of the cache-hit counter; labeled by cache_class.
pub const CACHE_HIT_TOTAL: &str = "cache/hit_total";

/// Name of the cache-miss counter; labeled by cache_class.
pub const CACHE_MISS_TOTAL: &str = "cache/miss_total";

/// Name of the resident-bytes gauge; labeled by tier (VRAM, DRAM, CXL).
pub const CACHE_RESIDENT_BYTES: &str = "cache/resident_bytes";

/// Name of the counter of bytes spilled between tiers; labeled by
/// (src_tier, dst_tier).
pub const CACHE_SPILL_BYTES_TOTAL: &str = "cache/spill_bytes_total";

/// Name of the counter of bytes warmed between tiers; labeled by
/// (src_tier, dst_tier).
pub const CACHE_WARMUP_BYTES_TOTAL: &str = "cache/warmup_bytes_total";

/// Name of the attach-failure counter; labeled by reason.
pub const CACHE_ATTACH_FAILURE_TOTAL: &str = "cache/attach_failure_total";

/// Name of the cache-fork counter.
pub const CACHE_FORK_TOTAL: &str = "cache/fork_total";

/// Name of the cache-reclaim counter; labeled by cause (expired, revoked,
/// evicted).
pub const CACHE_RECLAIM_TOTAL: &str = "cache/reclaim_total";

/// Name of the counter of decode requests placed far from their cache
/// (low cache_locality score).
pub const CACHE_DECODE_FAR_FROM_CACHE: &str = "cache/decode_far_from_cache";

/// Every cache metric name, collected in a fixed-size array of 12 entries so
/// the full set can be validated in one pass.
///
/// Entries appear in the same order as the constants are declared above.
pub const ALL_CACHE_METRIC_NAMES: [&str; 12] = [
    CACHE_PREFILL_LATENCY_US,
    CACHE_FIRST_TOKEN_LATENCY_US,
    CACHE_DECODE_LATENCY_US,
    CACHE_HIT_TOTAL,
    CACHE_MISS_TOTAL,
    CACHE_RESIDENT_BYTES,
    CACHE_SPILL_BYTES_TOTAL,
    CACHE_WARMUP_BYTES_TOTAL,
    CACHE_ATTACH_FAILURE_TOTAL,
    CACHE_FORK_TOTAL,
    CACHE_RECLAIM_TOTAL,
    CACHE_DECODE_FAR_FROM_CACHE,
];
65
66// ---------------------------------------------------------------------------
67// CacheMetrics
68// ---------------------------------------------------------------------------
69
70/// Aggregated cache observability metrics.
71///
72/// Provides counters, gauges, and histograms for the 12 cache metrics defined
73/// in the design document Section 7.5. Access the process-wide singleton via
74/// [`CacheMetrics::global()`].
75///
76/// For labeled metrics (e.g. hit_total by cache_class, resident_bytes by tier),
77/// per-label tracking is left to the caller — the counters here provide the
78/// aggregate emission point. Callers should use the metric name constants
79/// along with labels when exporting to Prometheus or OTLP.
80pub struct CacheMetrics {
81    /// Histogram: prefill latency in microseconds.
82    pub prefill_latency: MetricHistogram,
83    /// Histogram: time to first token in microseconds.
84    pub first_token_latency: MetricHistogram,
85    /// Histogram: steady-state per-token decode latency in microseconds.
86    pub decode_latency: MetricHistogram,
87    /// Counter: total cache hits (across all cache classes).
88    pub hit_total: MetricCounter,
89    /// Counter: total cache misses (across all cache classes).
90    pub miss_total: MetricCounter,
91    /// Gauge: total resident bytes (across all tiers).
92    pub resident_bytes: MetricGauge,
93    /// Counter: total bytes spilled between tiers.
94    pub spill_bytes_total: MetricCounter,
95    /// Counter: total bytes warmed between tiers.
96    pub warmup_bytes_total: MetricCounter,
97    /// Counter: total attach failures.
98    pub attach_failure_total: MetricCounter,
99    /// Counter: total cache forks.
100    pub fork_total: MetricCounter,
101    /// Counter: total cache reclaims (expired + revoked + evicted).
102    pub reclaim_total: MetricCounter,
103    /// Counter: decode requests placed far from cache.
104    pub decode_far_from_cache: MetricCounter,
105}
106
107impl Default for CacheMetrics {
108    fn default() -> Self {
109        Self::new()
110    }
111}
112
113impl CacheMetrics {
114    /// Create a new metrics instance with all values at zero.
115    pub const fn new() -> Self {
116        Self {
117            prefill_latency: MetricHistogram::new(),
118            first_token_latency: MetricHistogram::new(),
119            decode_latency: MetricHistogram::new(),
120            hit_total: MetricCounter::new(),
121            miss_total: MetricCounter::new(),
122            resident_bytes: MetricGauge::new(),
123            spill_bytes_total: MetricCounter::new(),
124            warmup_bytes_total: MetricCounter::new(),
125            attach_failure_total: MetricCounter::new(),
126            fork_total: MetricCounter::new(),
127            reclaim_total: MetricCounter::new(),
128            decode_far_from_cache: MetricCounter::new(),
129        }
130    }
131
132    /// Access the global cache metrics instance.
133    pub fn global() -> &'static CacheMetrics {
134        static INSTANCE: CacheMetrics = CacheMetrics::new();
135        &INSTANCE
136    }
137
138    // -----------------------------------------------------------------------
139    // Emission helpers — called by KvCacheManager and inference infrastructure
140    // -----------------------------------------------------------------------
141
142    /// Record a cache creation: increments resident_bytes by the given amount.
143    pub fn record_cache_created(&self, logical_bytes: u64) {
144        self.resident_bytes
145            .set(self.resident_bytes.get() + logical_bytes as i64);
146    }
147
148    /// Record a cache hit for the given cache class.
149    pub fn record_cache_hit(&self) {
150        self.hit_total.inc();
151    }
152
153    /// Record a cache miss for the given cache class.
154    pub fn record_cache_miss(&self) {
155        self.miss_total.inc();
156    }
157
158    /// Record a cache attach failure.
159    pub fn record_attach_failure(&self) {
160        self.attach_failure_total.inc();
161    }
162
163    /// Record a cache spill: increments spill_bytes_total.
164    pub fn record_cache_spill(&self, bytes_moved: u64) {
165        self.spill_bytes_total.add(bytes_moved);
166    }
167
168    /// Record a cache warmup: increments warmup_bytes_total.
169    pub fn record_cache_warmup(&self, bytes_moved: u64) {
170        self.warmup_bytes_total.add(bytes_moved);
171    }
172
173    /// Record a cache destruction/eviction: decrements resident_bytes and
174    /// increments reclaim_total.
175    pub fn record_cache_reclaimed(&self, logical_bytes: u64) {
176        self.reclaim_total.inc();
177        self.resident_bytes
178            .set(self.resident_bytes.get() - logical_bytes as i64);
179    }
180
181    /// Record a cache fork: increments fork_total and resident_bytes.
182    pub fn record_cache_forked(&self, logical_bytes: u64) {
183        self.fork_total.inc();
184        self.resident_bytes
185            .set(self.resident_bytes.get() + logical_bytes as i64);
186    }
187
188    /// Record a prefill latency observation.
189    pub fn record_prefill_latency(&self, latency_us: u64) {
190        self.prefill_latency.observe(latency_us);
191    }
192
193    /// Record a first-token latency observation.
194    pub fn record_first_token_latency(&self, latency_us: u64) {
195        self.first_token_latency.observe(latency_us);
196    }
197
198    /// Record a decode latency observation.
199    pub fn record_decode_latency(&self, latency_us: u64) {
200        self.decode_latency.observe(latency_us);
201    }
202
203    /// Record a decode request placed far from cache.
204    pub fn record_decode_far_from_cache(&self) {
205        self.decode_far_from_cache.inc();
206    }
207}
208
209// ---------------------------------------------------------------------------
210// Tests
211// ---------------------------------------------------------------------------
212
#[cfg(test)]
mod tests {
    use super::*;

    /// No two entries of `ALL_CACHE_METRIC_NAMES` may be equal.
    #[test]
    fn all_metric_names_are_distinct() {
        let names = ALL_CACHE_METRIC_NAMES;
        for (i, earlier) in names.iter().enumerate() {
            // Only compare against later entries so each pair is checked once.
            for (j, later) in names.iter().enumerate().skip(i + 1) {
                assert_ne!(
                    earlier, later,
                    "metric names at indices {} and {} collide: {}",
                    i, j, earlier
                );
            }
        }
    }

    /// Every metric name lives under the `cache/` namespace.
    #[test]
    fn all_metric_names_have_cache_prefix() {
        ALL_CACHE_METRIC_NAMES.iter().for_each(|name| {
            assert!(
                name.starts_with("cache/"),
                "metric name {} does not start with 'cache/'",
                name
            );
        });
    }

    /// The name table has exactly 12 entries.
    #[test]
    fn exactly_twelve_metrics() {
        let count = ALL_CACHE_METRIC_NAMES.len();
        assert_eq!(count, 12);
    }

    /// Two calls to `global()` hand back the same address.
    #[test]
    fn cache_metrics_global_is_singleton() {
        let first = CacheMetrics::global();
        let second = CacheMetrics::global();
        assert!(core::ptr::eq(first, second));
    }

    /// Creating caches accumulates into `resident_bytes`.
    #[test]
    fn record_cache_created_increments_resident_bytes() {
        let metrics = CacheMetrics::new();

        metrics.record_cache_created(4096);
        assert_eq!(metrics.resident_bytes.get(), 4096);

        metrics.record_cache_created(1024);
        assert_eq!(metrics.resident_bytes.get(), 5120);
    }

    /// Hits and misses are counted independently.
    #[test]
    fn record_cache_hit_and_miss() {
        let metrics = CacheMetrics::new();
        for _ in 0..2 {
            metrics.record_cache_hit();
        }
        metrics.record_cache_miss();

        assert_eq!(metrics.hit_total.get(), 2);
        assert_eq!(metrics.miss_total.get(), 1);
    }

    /// Spill and warmup byte totals accumulate separately.
    #[test]
    fn record_cache_spill_and_warmup() {
        let metrics = CacheMetrics::new();
        for spilled in [1024, 2048] {
            metrics.record_cache_spill(spilled);
        }
        metrics.record_cache_warmup(512);

        assert_eq!(metrics.spill_bytes_total.get(), 3072);
        assert_eq!(metrics.warmup_bytes_total.get(), 512);
    }

    /// Reclaiming lowers `resident_bytes` and bumps `reclaim_total`.
    #[test]
    fn record_cache_reclaimed_decrements_resident() {
        let metrics = CacheMetrics::new();
        metrics.record_cache_created(8192);
        metrics.record_cache_reclaimed(4096);

        assert_eq!(metrics.resident_bytes.get(), 4096);
        assert_eq!(metrics.reclaim_total.get(), 1);
    }

    /// Forking bumps `fork_total` and raises `resident_bytes`.
    #[test]
    fn record_cache_forked() {
        let metrics = CacheMetrics::new();
        metrics.record_cache_forked(2048);

        assert_eq!(metrics.fork_total.get(), 1);
        assert_eq!(metrics.resident_bytes.get(), 2048);
    }

    /// Each attach failure is counted once.
    #[test]
    fn record_attach_failure() {
        let metrics = CacheMetrics::new();
        for _ in 0..2 {
            metrics.record_attach_failure();
        }

        assert_eq!(metrics.attach_failure_total.get(), 2);
    }

    /// Each latency helper feeds its own histogram.
    #[test]
    fn record_latencies() {
        let metrics = CacheMetrics::new();
        metrics.record_prefill_latency(1000);
        metrics.record_first_token_latency(500);
        metrics.record_decode_latency(50);

        assert_eq!(metrics.prefill_latency.count(), 1);
        assert_eq!(metrics.first_token_latency.count(), 1);
        assert_eq!(metrics.decode_latency.count(), 1);
    }

    /// A far-from-cache decode is counted once per call.
    #[test]
    fn record_decode_far_from_cache() {
        let metrics = CacheMetrics::new();
        metrics.record_decode_far_from_cache();

        assert_eq!(metrics.decode_far_from_cache.get(), 1);
    }
}