grafos_observe/
metrics.rs

//! Core metrics types for fabric observability.
//!
//! All types are built on `core` atomics and work in `no_std` environments.
//! The [`FabricMetrics`] singleton collects system-wide counters, gauges, and
//! latency histograms.
5
6use core::sync::atomic::{AtomicI64, AtomicU64, Ordering};
7
/// Monotonically increasing counter (atomic u64).
///
/// Counters only go up between resets — use them for totals like
/// "operations completed" or "bytes transferred". Every operation is a single
/// relaxed atomic, so a counter can be shared between threads, but it does not
/// order any surrounding memory accesses.
///
/// Additions wrap: adding past `u64::MAX` wraps around to zero rather than
/// panicking or saturating.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricCounter;
///
/// let c = MetricCounter::new();
/// c.inc();
/// c.add(10);
/// assert_eq!(c.get(), 11);
///
/// let prev = c.reset();
/// assert_eq!(prev, 11);
/// assert_eq!(c.get(), 0);
/// ```
#[derive(Debug)]
pub struct MetricCounter {
    value: AtomicU64,
}

impl Default for MetricCounter {
    /// Equivalent to [`MetricCounter::new`]: the counter starts at zero.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricCounter {
    /// Create a new counter starting at zero.
    ///
    /// `const`, so it can initialise a `static`.
    pub const fn new() -> Self {
        Self {
            value: AtomicU64::new(0),
        }
    }

    /// Increment the counter by one.
    pub fn inc(&self) {
        self.value.fetch_add(1, Ordering::Relaxed);
    }

    /// Increment the counter by `n` (wrapping on overflow).
    pub fn add(&self, n: u64) {
        self.value.fetch_add(n, Ordering::Relaxed);
    }

    /// Read the current counter value.
    pub fn get(&self) -> u64 {
        self.value.load(Ordering::Relaxed)
    }

    /// Reset the counter to zero. Returns the previous value.
    ///
    /// The read and the reset are one atomic swap, so no increment that
    /// races with the reset is lost: it is either part of the returned value
    /// or counted after the reset.
    pub fn reset(&self) -> u64 {
        self.value.swap(0, Ordering::Relaxed)
    }
}
65
/// Current-value gauge (atomic i64).
///
/// Gauges go up and down — use them for values like "active leases"
/// or "connections open". Every operation is a single relaxed atomic.
///
/// Increments and decrements wrap on overflow of the `i64` range rather than
/// panicking or saturating.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricGauge;
///
/// let g = MetricGauge::new();
/// g.inc();
/// g.inc();
/// g.dec();
/// assert_eq!(g.get(), 1);
///
/// g.set(-5);
/// assert_eq!(g.get(), -5);
/// ```
#[derive(Debug)]
pub struct MetricGauge {
    value: AtomicI64,
}

impl Default for MetricGauge {
    /// Equivalent to [`MetricGauge::new`]: the gauge starts at zero.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricGauge {
    /// Create a new gauge starting at zero.
    ///
    /// `const`, so it can initialise a `static`.
    pub const fn new() -> Self {
        Self {
            value: AtomicI64::new(0),
        }
    }

    /// Increment the gauge by one.
    pub fn inc(&self) {
        self.value.fetch_add(1, Ordering::Relaxed);
    }

    /// Decrement the gauge by one. The gauge may go negative.
    pub fn dec(&self) {
        self.value.fetch_sub(1, Ordering::Relaxed);
    }

    /// Set the gauge to an absolute value, overwriting whatever was there.
    pub fn set(&self, val: i64) {
        self.value.store(val, Ordering::Relaxed);
    }

    /// Read the current gauge value.
    pub fn get(&self) -> i64 {
        self.value.load(Ordering::Relaxed)
    }
}
123
/// Fixed-bucket latency histogram.
///
/// Tracks the distribution of values (typically durations in microseconds)
/// across predefined buckets. No allocator needed — the bucket array is
/// inline. Uses 10 buckets: 100us, 500us, 1ms, 5ms, 10ms, 50ms, 100ms,
/// 500ms, 1s, +Inf.
///
/// Buckets are cumulative: each bucket count includes all observations that
/// also fall into lower buckets. This matches the Prometheus histogram
/// convention. Bucket bounds are inclusive (`value <= bound`).
///
/// The sum, the total, and each bucket are separate relaxed atomics, so a
/// reader racing with [`observe`](Self::observe) may see an observation that
/// is only partially applied. The sum wraps on `u64` overflow.
///
/// # Examples
///
/// ```
/// use grafos_observe::MetricHistogram;
///
/// let h = MetricHistogram::new();
/// h.observe(50);     // 50us — lands in the <=100us bucket
/// h.observe(2_000);  // 2ms  — lands in the <=5ms bucket
///
/// assert_eq!(h.count(), 2);
/// assert_eq!(h.sum(), 2050);
/// assert_eq!(h.bucket_count(0), 1); // <=100us: only the 50us observation
/// assert_eq!(h.bucket_count(3), 2); // <=5ms: both observations
/// ```
#[derive(Debug)]
pub struct MetricHistogram {
    /// Bucket upper bounds in microseconds. The last bucket is +Inf (u64::MAX).
    bounds: [u64; Self::NUM_BUCKETS],
    /// Counts per bucket (cumulative — each bucket includes all lower buckets).
    counts: [AtomicU64; Self::NUM_BUCKETS],
    /// Sum of all observed values.
    sum: AtomicU64,
    /// Total number of observations.
    total: AtomicU64,
}

impl Default for MetricHistogram {
    /// Equivalent to [`MetricHistogram::new`]: empty, default bucket bounds.
    fn default() -> Self {
        Self::new()
    }
}

impl MetricHistogram {
    /// Number of histogram buckets.
    pub const NUM_BUCKETS: usize = 10;

    /// Bucket boundaries in microseconds, in ascending order.
    pub const BUCKET_BOUNDS: [u64; Self::NUM_BUCKETS] = [
        100,       // 100us
        500,       // 500us
        1_000,     // 1ms
        5_000,     // 5ms
        10_000,    // 10ms
        50_000,    // 50ms
        100_000,   // 100ms
        500_000,   // 500ms
        1_000_000, // 1s
        u64::MAX,  // +Inf
    ];

    /// Create a new histogram with the default bucket boundaries.
    ///
    /// `const`, so it can initialise a `static`.
    // The lint targets the `ZERO` template below. It is only used as an
    // array-repeat initialiser, where each element becomes its own atomic,
    // so the "interior-mutable const" hazard does not apply.
    #[allow(clippy::declare_interior_mutable_const)]
    pub const fn new() -> Self {
        // Repeating a const item keeps the array length tied to NUM_BUCKETS
        // instead of a hand-maintained list of initialisers.
        const ZERO: AtomicU64 = AtomicU64::new(0);
        Self {
            bounds: Self::BUCKET_BOUNDS,
            counts: [ZERO; Self::NUM_BUCKETS],
            sum: AtomicU64::new(0),
            total: AtomicU64::new(0),
        }
    }

    /// Record an observation (value in microseconds).
    ///
    /// Increments the count for every bucket whose bound is >= the value
    /// (cumulative histogram, matching Prometheus convention). Because the
    /// last bound is `u64::MAX`, every observation lands in the +Inf bucket.
    pub fn observe(&self, value_us: u64) {
        self.sum.fetch_add(value_us, Ordering::Relaxed);
        self.total.fetch_add(1, Ordering::Relaxed);
        for (bound, count) in self.bounds.iter().zip(self.counts.iter()) {
            if value_us <= *bound {
                count.fetch_add(1, Ordering::Relaxed);
            }
        }
    }

    /// Read the cumulative count for a specific bucket index.
    ///
    /// Returns `0` when `index` is out of range.
    pub fn bucket_count(&self, index: usize) -> u64 {
        self.counts
            .get(index)
            .map_or(0, |count| count.load(Ordering::Relaxed))
    }

    /// Read the bucket upper bound for a specific bucket index (in microseconds).
    ///
    /// Returns `u64::MAX` (the +Inf bound) when `index` is out of range.
    pub fn bucket_bound(&self, index: usize) -> u64 {
        self.bounds.get(index).copied().unwrap_or(u64::MAX)
    }

    /// Total number of observations.
    pub fn count(&self) -> u64 {
        self.total.load(Ordering::Relaxed)
    }

    /// Sum of all observed values (in microseconds).
    pub fn sum(&self) -> u64 {
        self.sum.load(Ordering::Relaxed)
    }
}
248
249/// Global fabric metrics registry.
250///
251/// Tracks system-wide counters and gauges for lease lifecycles,
252/// data-plane operations, and graph rewrites. Access the process-wide
253/// singleton via [`FabricMetrics::global()`].
254///
255/// # Examples
256///
257/// ```
258/// use grafos_observe::FabricMetrics;
259///
260/// let m = FabricMetrics::global();
261/// m.leases_total.inc();
262/// m.leases_active.inc();
263/// m.ops_total.add(5);
264/// m.bytes_read.add(1024);
265/// m.op_latency.observe(300);
266/// ```
267pub struct FabricMetrics {
268    /// Currently active leases (gauge — goes up on acquire, down on drop/expire).
269    pub leases_active: MetricGauge,
270    /// Total leases ever created (counter).
271    pub leases_total: MetricCounter,
272    /// Total leases that expired (counter).
273    pub leases_expired: MetricCounter,
274    /// Total leases that entered fenced state (counter).
275    pub leases_fenced: MetricCounter,
276    /// Total data-plane operations completed (counter).
277    pub ops_total: MetricCounter,
278    /// Total data-plane operations that failed (counter).
279    pub ops_errors: MetricCounter,
280    /// Total bytes read across all data-plane operations (counter).
281    pub bytes_read: MetricCounter,
282    /// Total bytes written across all data-plane operations (counter).
283    pub bytes_written: MetricCounter,
284    /// Total graph rewrites initiated (counter).
285    pub rewrites_total: MetricCounter,
286    /// Histogram of operation latencies in microseconds.
287    pub op_latency: MetricHistogram,
288    /// Total leases explicitly revoked (counter) — distinct from expired.
289    pub leases_revoked: MetricCounter,
290    /// Histogram of lease bind latency in microseconds.
291    pub bind_latency: MetricHistogram,
292    /// Histogram of lease renewal latency in microseconds.
293    pub renew_latency: MetricHistogram,
294    /// Histogram of lease revocation latency in microseconds.
295    pub revoke_latency: MetricHistogram,
296    /// Histogram of teardown execution latency in microseconds.
297    pub teardown_latency: MetricHistogram,
298    /// Total authentication failures (counter).
299    pub auth_failures: MetricCounter,
300    /// Total anti-replay cache rejections (counter).
301    pub replay_rejections: MetricCounter,
302    /// Total capability token validations (counter).
303    pub token_validations: MetricCounter,
304    /// Total capability token validation failures (counter).
305    pub token_failures: MetricCounter,
306    /// Total stale access attempts after revoke/expiry (counter).
307    pub stale_access_rejections: MetricCounter,
308    /// Histogram of control-plane operation latencies in microseconds.
309    pub control_latency: MetricHistogram,
310    /// Histogram of dataplane operation latencies in microseconds.
311    pub dataplane_latency: MetricHistogram,
312    /// Histogram of tasklet submit (full dispatch) latencies in microseconds.
313    pub tasklet_submit_latency: MetricHistogram,
314    /// Histogram of tasklet execution-only latencies in microseconds.
315    pub tasklet_exec_latency: MetricHistogram,
316    /// Total tasklet submissions received (counter).
317    pub tasklet_submits: MetricCounter,
318    /// Total tasklet executions that completed successfully (counter).
319    pub tasklet_completions: MetricCounter,
320    /// Total tasklet executions that failed (counter).
321    pub tasklet_failures: MetricCounter,
322    /// Histogram of tasklet wall-clock duration in microseconds.
323    pub tasklet_duration: MetricHistogram,
324    /// Total module cache hits (counter).
325    pub module_cache_hits: MetricCounter,
326    /// Total module cache misses (counter).
327    pub module_cache_misses: MetricCounter,
328    /// Total module cache stores (counter).
329    pub module_cache_stores: MetricCounter,
330    /// Total module cache hash mismatches (counter).
331    pub module_cache_hash_mismatches: MetricCounter,
332    // ── Phase 219 / slice 20 — audit-chain emit counters ──
333    //
334    // Per-AuditEventKind counters that bump every time a daemon /
335    // scheduler emits an audit record. SREs scrape these via
336    // Prometheus to see lifecycle event RATES (emits/sec) without
337    // tailing the chain itself. The breakdown by kind lets a
338    // dashboard distinguish "lease churn" (lots of allocated +
339    // released) from "instability" (lots of fenced + expired).
340    /// Total `CapabilityIssued` audit records emitted (counter).
341    pub audit_capability_issued: MetricCounter,
342    /// Total `CapabilityRevoked` audit records emitted (counter).
343    pub audit_capability_revoked: MetricCounter,
344    /// Total `LeaseAllocated` audit records emitted (counter).
345    pub audit_lease_allocated: MetricCounter,
346    /// Total `LeaseRenewed` audit records emitted (counter).
347    pub audit_lease_renewed: MetricCounter,
348    /// Total `LeaseReleased` audit records emitted (counter).
349    pub audit_lease_released: MetricCounter,
350    /// Total `LeaseExpired` audit records emitted (counter).
351    pub audit_lease_expired: MetricCounter,
352    /// Total `LeaseFenced` audit records emitted (counter).
353    pub audit_lease_fenced: MetricCounter,
354    /// Total `LeaseTorndown` audit records emitted (counter).
355    pub audit_lease_torndown: MetricCounter,
356    /// Total `AdmissionDecided` audit records emitted (counter).
357    pub audit_admission_decided: MetricCounter,
358    /// Total `Preempted` audit records emitted (counter).
359    pub audit_preempted: MetricCounter,
360    /// Total `DrainInitiated` audit records emitted (counter).
361    pub audit_drain_initiated: MetricCounter,
362    /// Total `ChainAnchored` audit records emitted (counter). A
363    /// non-zero value indicates the chain has been anchored at
364    /// least once — operators monitor this for key-rotation
365    /// boundaries.
366    pub audit_chain_anchored: MetricCounter,
367    /// Total `SoftModeEnabled` audit records emitted (counter).
368    /// **Should normally be ZERO in production**. Any non-zero
369    /// value means a weaker-mode toggle was enabled and warrants
370    /// operator attention.
371    pub audit_soft_mode_enabled: MetricCounter,
372    /// Phase 218–222 / slice 77 — total `TenantCreated` audit
373    /// records emitted (counter). Tenant-CRUD typed kinds were
374    /// added in Stage 2a of the audit-surface migration; producers
375    /// dual-write to both this typed counter and the older
376    /// enterprise_audit log during Stage 2.
377    pub audit_tenant_created: MetricCounter,
378    /// Phase 218–222 / slice 77 — total `TenantDeleted` audit
379    /// records emitted (counter).
380    pub audit_tenant_deleted: MetricCounter,
381    /// Phase 218–222 / slice 77 — total `TenantQuotaUpdated` audit
382    /// records emitted (counter).
383    pub audit_tenant_quota_updated: MetricCounter,
384    /// Phase 218–222 / slice 79 — total `ProviderConformanceRecorded`
385    /// audit records emitted (counter). Stage 2b of the audit-surface
386    /// migration: the orchestrator dual-writes this typed counter
387    /// alongside the older enterprise_audit `kind=admin,
388    /// outcome="provider_conformance_recorded"` row.
389    pub audit_provider_conformance_recorded: MetricCounter,
390    /// Phase 218–222 / slice 79 — total
391    /// `ProviderBootstrapTokenIssued` audit records emitted.
392    pub audit_provider_bootstrap_token_issued: MetricCounter,
393    /// Phase 218–222 / slice 79 — total `ProviderBootstrapExchanged`
394    /// audit records emitted.
395    pub audit_provider_bootstrap_exchanged: MetricCounter,
396    /// Phase 218–222 / slice 79 — total `ProviderCellIdentityIssued`
397    /// audit records emitted.
398    pub audit_provider_cell_identity_issued: MetricCounter,
399    /// Phase 218–222 / slice 79 — total `ProviderCellIdentityRotated`
400    /// audit records emitted.
401    pub audit_provider_cell_identity_rotated: MetricCounter,
402    /// Phase 218–222 / slice 79 — total `ProviderCellIdentityRevoked`
403    /// audit records emitted.
404    pub audit_provider_cell_identity_revoked: MetricCounter,
405    /// Phase 218–222 / slice 80 — total `BearerTokenIssued` audit
406    /// records emitted (counter). Stage 2c of the audit-surface
407    /// migration: cell-side and orchestrator-side mint paths dual-
408    /// write this typed counter alongside the older enterprise_audit
409    /// `kind=token, outcome="created"` row. Distinct from
410    /// `audit_capability_issued`: bearer tokens are HTTP
411    /// Authorization-shaped admin/tenant API keys (not lease-bound
412    /// capability tokens). Operators monitoring token churn or
413    /// alerting on suspicious mint cadence read this counter.
414    pub audit_bearer_token_issued: MetricCounter,
415    /// Phase 218–222 / slice 80 — total `BearerTokenRevoked` audit
416    /// records emitted (counter). Distinct from
417    /// `audit_capability_revoked` (lease-bound capability revoke).
418    /// Operators monitoring token revocation rate or alerting on
419    /// mass-revoke incidents read this counter.
420    pub audit_bearer_token_revoked: MetricCounter,
421    /// Phase 218–222 / slice 81 — total `SchedulerPromoted` audit
422    /// records emitted (counter). Stage 2 mop-up of the audit-surface
423    /// migration: the cell-side `handle_promote` dual-writes this
424    /// typed counter alongside the older enterprise_audit
425    /// `kind=admin, outcome="promoted"` row. A non-zero value (and
426    /// especially a sustained rate) is operator-relevant: each emit
427    /// represents a leadership transition.
428    pub audit_scheduler_promoted: MetricCounter,
429    /// Phase 218–222 / slice 81 — total `BillingRateCardInstalled`
430    /// audit records emitted (counter). Stage 2 mop-up: the only
431    /// Billing-category producer. Lifecycle-axis, low-volume.
432    /// Operators alerting on rate-card swaps (a billing-impacting
433    /// admin action) read this counter.
434    pub audit_billing_rate_card_installed: MetricCounter,
435    /// Slice 261 (EdgeRecord audit-chain integration arc) — total
436    /// `EdgeRewritten` audit records emitted (counter). One emit per
437    /// affected edge per committed rewrite (producer wiring in slice
438    /// 262). High-volume relative to other audit kinds because a
439    /// rewrite typically touches multiple edges; operators reading
440    /// this counter expect throughput proportional to graph mutation
441    /// rate.
442    pub audit_edge_rewritten: MetricCounter,
443    /// Total audit records emitted across all kinds (counter).
444    /// Equal to the sum of the per-kind counters; exposed
445    /// separately for cheap "total emit rate" dashboards.
446    pub audit_records_emitted: MetricCounter,
447
448    /// Phase 219 slice 23 — total audit records persisted to the
449    /// JSONL sink (counter). Bumps once per successful
450    /// `grafos_audit::write_record` call. Difference from
451    /// `audit_records_emitted` indicates JSONL lag or the daemon
452    /// running without `--audit-jsonl-path`.
453    pub audit_jsonl_writes: MetricCounter,
454    /// Phase 219 slice 23 — total audit JSONL writes that failed
455    /// (counter). Bumps once per `write_record` Err. Operator
456    /// attention warranted; chain integrity is preserved but the
457    /// downstream collector is missing records.
458    pub audit_jsonl_write_failed: MetricCounter,
459}
460
461impl Default for FabricMetrics {
462    fn default() -> Self {
463        Self::new()
464    }
465}
466
467impl FabricMetrics {
468    /// Create a new metrics instance with all values at zero.
469    pub const fn new() -> Self {
470        Self {
471            leases_active: MetricGauge::new(),
472            leases_total: MetricCounter::new(),
473            leases_expired: MetricCounter::new(),
474            leases_fenced: MetricCounter::new(),
475            ops_total: MetricCounter::new(),
476            ops_errors: MetricCounter::new(),
477            bytes_read: MetricCounter::new(),
478            bytes_written: MetricCounter::new(),
479            rewrites_total: MetricCounter::new(),
480            op_latency: MetricHistogram::new(),
481            leases_revoked: MetricCounter::new(),
482            bind_latency: MetricHistogram::new(),
483            renew_latency: MetricHistogram::new(),
484            revoke_latency: MetricHistogram::new(),
485            teardown_latency: MetricHistogram::new(),
486            auth_failures: MetricCounter::new(),
487            replay_rejections: MetricCounter::new(),
488            token_validations: MetricCounter::new(),
489            token_failures: MetricCounter::new(),
490            stale_access_rejections: MetricCounter::new(),
491            control_latency: MetricHistogram::new(),
492            dataplane_latency: MetricHistogram::new(),
493            tasklet_submit_latency: MetricHistogram::new(),
494            tasklet_exec_latency: MetricHistogram::new(),
495            tasklet_submits: MetricCounter::new(),
496            tasklet_completions: MetricCounter::new(),
497            tasklet_failures: MetricCounter::new(),
498            tasklet_duration: MetricHistogram::new(),
499            module_cache_hits: MetricCounter::new(),
500            module_cache_misses: MetricCounter::new(),
501            module_cache_stores: MetricCounter::new(),
502            module_cache_hash_mismatches: MetricCounter::new(),
503            audit_capability_issued: MetricCounter::new(),
504            audit_capability_revoked: MetricCounter::new(),
505            audit_lease_allocated: MetricCounter::new(),
506            audit_lease_renewed: MetricCounter::new(),
507            audit_lease_released: MetricCounter::new(),
508            audit_lease_expired: MetricCounter::new(),
509            audit_lease_fenced: MetricCounter::new(),
510            audit_lease_torndown: MetricCounter::new(),
511            audit_admission_decided: MetricCounter::new(),
512            audit_preempted: MetricCounter::new(),
513            audit_drain_initiated: MetricCounter::new(),
514            audit_chain_anchored: MetricCounter::new(),
515            audit_soft_mode_enabled: MetricCounter::new(),
516            audit_tenant_created: MetricCounter::new(),
517            audit_tenant_deleted: MetricCounter::new(),
518            audit_tenant_quota_updated: MetricCounter::new(),
519            audit_provider_conformance_recorded: MetricCounter::new(),
520            audit_provider_bootstrap_token_issued: MetricCounter::new(),
521            audit_provider_bootstrap_exchanged: MetricCounter::new(),
522            audit_provider_cell_identity_issued: MetricCounter::new(),
523            audit_provider_cell_identity_rotated: MetricCounter::new(),
524            audit_provider_cell_identity_revoked: MetricCounter::new(),
525            audit_bearer_token_issued: MetricCounter::new(),
526            audit_bearer_token_revoked: MetricCounter::new(),
527            audit_scheduler_promoted: MetricCounter::new(),
528            audit_billing_rate_card_installed: MetricCounter::new(),
529            audit_edge_rewritten: MetricCounter::new(),
530            audit_records_emitted: MetricCounter::new(),
531            audit_jsonl_writes: MetricCounter::new(),
532            audit_jsonl_write_failed: MetricCounter::new(),
533        }
534    }
535
536    /// Phase 219 slice 20 — bump the per-kind audit-emit counter
537    /// for `kind`, plus the overall `audit_records_emitted` counter.
538    /// Callers (typically `fabricbiosd`'s emit_*_audit helpers and
539    /// the tick thread's inline emit) call this on every successful
540    /// `assemble_record` so the Prometheus dashboard sees the rate.
541    /// The match arms cover every `AuditEventKind` variant — adding
542    /// a new variant in `grafos-core::policy_vocab` MUST also add
543    /// the corresponding counter and an arm here, otherwise the
544    /// new kind's emit rate is invisible.
545    pub fn count_audit_emit(&self, kind: grafos_core::AuditEventKind) {
546        use grafos_core::AuditEventKind as K;
547        let counter = match kind {
548            K::CapabilityIssued => &self.audit_capability_issued,
549            K::CapabilityRevoked => &self.audit_capability_revoked,
550            K::LeaseAllocated => &self.audit_lease_allocated,
551            K::LeaseRenewed => &self.audit_lease_renewed,
552            K::LeaseReleased => &self.audit_lease_released,
553            K::LeaseExpired => &self.audit_lease_expired,
554            K::LeaseFenced => &self.audit_lease_fenced,
555            K::LeaseTorndown => &self.audit_lease_torndown,
556            K::AdmissionDecided => &self.audit_admission_decided,
557            K::Preempted => &self.audit_preempted,
558            K::DrainInitiated => &self.audit_drain_initiated,
559            K::ChainAnchored => &self.audit_chain_anchored,
560            K::SoftModeEnabled => &self.audit_soft_mode_enabled,
561            K::TenantCreated => &self.audit_tenant_created,
562            K::TenantDeleted => &self.audit_tenant_deleted,
563            K::TenantQuotaUpdated => &self.audit_tenant_quota_updated,
564            K::ProviderConformanceRecorded => &self.audit_provider_conformance_recorded,
565            K::ProviderBootstrapTokenIssued => &self.audit_provider_bootstrap_token_issued,
566            K::ProviderBootstrapExchanged => &self.audit_provider_bootstrap_exchanged,
567            K::ProviderCellIdentityIssued => &self.audit_provider_cell_identity_issued,
568            K::ProviderCellIdentityRotated => &self.audit_provider_cell_identity_rotated,
569            K::ProviderCellIdentityRevoked => &self.audit_provider_cell_identity_revoked,
570            K::BearerTokenIssued => &self.audit_bearer_token_issued,
571            K::BearerTokenRevoked => &self.audit_bearer_token_revoked,
572            K::SchedulerPromoted => &self.audit_scheduler_promoted,
573            K::BillingRateCardInstalled => &self.audit_billing_rate_card_installed,
574            K::EdgeRewritten => &self.audit_edge_rewritten,
575        };
576        counter.inc();
577        self.audit_records_emitted.inc();
578    }
579
580    /// Access the global metrics instance.
581    pub fn global() -> &'static FabricMetrics {
582        static INSTANCE: FabricMetrics = FabricMetrics::new();
583        &INSTANCE
584    }
585}
586
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn counter_inc_and_add() {
        let c = MetricCounter::new();
        assert_eq!(c.get(), 0);
        c.inc();
        assert_eq!(c.get(), 1);
        c.add(10);
        assert_eq!(c.get(), 11);
    }

    #[test]
    fn counter_reset() {
        let c = MetricCounter::new();
        c.add(42);
        let prev = c.reset();
        assert_eq!(prev, 42);
        assert_eq!(c.get(), 0);
    }

    #[test]
    fn gauge_inc_dec_set() {
        let g = MetricGauge::new();
        assert_eq!(g.get(), 0);
        g.inc();
        g.inc();
        assert_eq!(g.get(), 2);
        g.dec();
        assert_eq!(g.get(), 1);
        g.set(100);
        assert_eq!(g.get(), 100);
        g.set(-5);
        assert_eq!(g.get(), -5);
    }

    /// `Default` must agree with `new()`: everything starts at zero.
    #[test]
    fn default_impls_start_at_zero() {
        assert_eq!(MetricCounter::default().get(), 0);
        assert_eq!(MetricGauge::default().get(), 0);

        let h = MetricHistogram::default();
        assert_eq!(h.count(), 0);
        assert_eq!(h.sum(), 0);
        for i in 0..MetricHistogram::NUM_BUCKETS {
            assert_eq!(h.bucket_count(i), 0);
        }
    }

    #[test]
    fn histogram_observe() {
        let h = MetricHistogram::new();
        // Observation of 50us should land in the 100us bucket (index 0) and all above.
        h.observe(50);
        assert_eq!(h.count(), 1);
        assert_eq!(h.sum(), 50);
        // Bucket 0 (<=100us) should have 1 count.
        assert_eq!(h.bucket_count(0), 1);
        // Bucket 9 (+Inf) should also have 1 count.
        assert_eq!(h.bucket_count(9), 1);
    }

    #[test]
    fn histogram_multiple_observations() {
        let h = MetricHistogram::new();
        h.observe(50); // <=100us bucket
        h.observe(200); // <=500us bucket
        h.observe(2_000); // <=5ms bucket
        h.observe(999_999); // <=1s bucket

        assert_eq!(h.count(), 4);
        assert_eq!(h.sum(), 50 + 200 + 2_000 + 999_999);

        // Bucket 0 (<=100us): only the 50us observation.
        assert_eq!(h.bucket_count(0), 1);
        // Bucket 1 (<=500us): 50 + 200.
        assert_eq!(h.bucket_count(1), 2);
        // Bucket 3 (<=5ms): 50 + 200 + 2000.
        assert_eq!(h.bucket_count(3), 3);
        // Bucket 8 (<=1s): all four.
        assert_eq!(h.bucket_count(8), 4);
        // +Inf: all four.
        assert_eq!(h.bucket_count(9), 4);
    }

    /// Bucket bounds are inclusive: a value equal to a bound belongs to
    /// that bucket, and one microsecond more falls through to the next.
    #[test]
    fn histogram_bounds_are_inclusive() {
        let h = MetricHistogram::new();
        h.observe(100);
        assert_eq!(h.bucket_count(0), 1);
        h.observe(101);
        assert_eq!(h.bucket_count(0), 1);
        assert_eq!(h.bucket_count(1), 2);
    }

    /// Values above the largest finite bound (1s) are counted only in +Inf.
    #[test]
    fn histogram_over_range_lands_only_in_inf() {
        let h = MetricHistogram::new();
        h.observe(1_000_001);
        assert_eq!(h.bucket_count(8), 0);
        assert_eq!(h.bucket_count(9), 1);

        let extreme = MetricHistogram::new();
        extreme.observe(u64::MAX);
        assert_eq!(extreme.bucket_count(8), 0);
        assert_eq!(extreme.bucket_count(9), 1);
        assert_eq!(extreme.count(), 1);
    }

    /// In-range `bucket_bound` lookups must mirror `BUCKET_BOUNDS`.
    #[test]
    fn histogram_bucket_bounds_match_table() {
        let h = MetricHistogram::new();
        for (i, &expected) in MetricHistogram::BUCKET_BOUNDS.iter().enumerate() {
            assert_eq!(h.bucket_bound(i), expected);
        }
    }

    #[test]
    fn histogram_out_of_bounds_index() {
        let h = MetricHistogram::new();
        assert_eq!(h.bucket_count(99), 0);
        assert_eq!(h.bucket_bound(99), u64::MAX);
    }

    #[test]
    fn fabric_metrics_global_is_singleton() {
        let m1 = FabricMetrics::global();
        let m2 = FabricMetrics::global();
        // Same address.
        assert!(core::ptr::eq(m1, m2));
    }

    #[test]
    fn fabric_metrics_fields() {
        let m = FabricMetrics::new();
        m.leases_active.inc();
        m.leases_total.inc();
        m.ops_total.add(5);
        m.bytes_read.add(1024);
        m.bytes_written.add(512);
        m.rewrites_total.inc();
        m.leases_expired.inc();
        m.leases_fenced.inc();
        m.ops_errors.inc();

        assert_eq!(m.leases_active.get(), 1);
        assert_eq!(m.leases_total.get(), 1);
        assert_eq!(m.ops_total.get(), 5);
        assert_eq!(m.bytes_read.get(), 1024);
        assert_eq!(m.bytes_written.get(), 512);
        assert_eq!(m.rewrites_total.get(), 1);
        assert_eq!(m.leases_expired.get(), 1);
        assert_eq!(m.leases_fenced.get(), 1);
        assert_eq!(m.ops_errors.get(), 1);

        m.leases_revoked.inc();
        m.auth_failures.inc();
        m.replay_rejections.inc();
        m.token_validations.add(10);
        m.token_failures.inc();
        m.stale_access_rejections.inc();
        m.bind_latency.observe(100);
        m.renew_latency.observe(200);
        m.revoke_latency.observe(300);
        m.teardown_latency.observe(400);

        assert_eq!(m.leases_revoked.get(), 1);
        assert_eq!(m.auth_failures.get(), 1);
        assert_eq!(m.replay_rejections.get(), 1);
        assert_eq!(m.token_validations.get(), 10);
        assert_eq!(m.token_failures.get(), 1);
        assert_eq!(m.stale_access_rejections.get(), 1);
        assert_eq!(m.bind_latency.count(), 1);
        assert_eq!(m.renew_latency.count(), 1);
        assert_eq!(m.revoke_latency.count(), 1);
        assert_eq!(m.teardown_latency.count(), 1);
    }

    /// Phase 219 slice 20 — count_audit_emit routes each
    /// AuditEventKind to the correct per-kind counter AND bumps
    /// the overall `audit_records_emitted`. A regression in the
    /// match arms (e.g., two arms pointing at the same counter, or
    /// an arm wired to the wrong field) would silently mis-route
    /// emits — this test pins each variant's counter mapping.
    #[test]
    fn count_audit_emit_routes_each_kind_to_correct_counter() {
        use grafos_core::AuditEventKind as K;
        let m = FabricMetrics::new();

        // Each kind bumps its specific counter and the overall total.
        // The table lists every variant known today. A variant added
        // later is caught by the compile-time exhaustiveness check on
        // the match in count_audit_emit, not by this loop; the loop
        // verifies the runtime mapping for every listed variant.
        let kinds_and_getters: &[(K, fn(&FabricMetrics) -> u64)] = &[
            (K::CapabilityIssued, |m| m.audit_capability_issued.get()),
            (K::CapabilityRevoked, |m| m.audit_capability_revoked.get()),
            (K::LeaseAllocated, |m| m.audit_lease_allocated.get()),
            (K::LeaseRenewed, |m| m.audit_lease_renewed.get()),
            (K::LeaseReleased, |m| m.audit_lease_released.get()),
            (K::LeaseExpired, |m| m.audit_lease_expired.get()),
            (K::LeaseFenced, |m| m.audit_lease_fenced.get()),
            (K::LeaseTorndown, |m| m.audit_lease_torndown.get()),
            (K::AdmissionDecided, |m| m.audit_admission_decided.get()),
            (K::Preempted, |m| m.audit_preempted.get()),
            (K::DrainInitiated, |m| m.audit_drain_initiated.get()),
            (K::ChainAnchored, |m| m.audit_chain_anchored.get()),
            (K::SoftModeEnabled, |m| m.audit_soft_mode_enabled.get()),
            (K::TenantCreated, |m| m.audit_tenant_created.get()),
            (K::TenantDeleted, |m| m.audit_tenant_deleted.get()),
            (K::TenantQuotaUpdated, |m| {
                m.audit_tenant_quota_updated.get()
            }),
            (K::ProviderConformanceRecorded, |m| {
                m.audit_provider_conformance_recorded.get()
            }),
            (K::ProviderBootstrapTokenIssued, |m| {
                m.audit_provider_bootstrap_token_issued.get()
            }),
            (K::ProviderBootstrapExchanged, |m| {
                m.audit_provider_bootstrap_exchanged.get()
            }),
            (K::ProviderCellIdentityIssued, |m| {
                m.audit_provider_cell_identity_issued.get()
            }),
            (K::ProviderCellIdentityRotated, |m| {
                m.audit_provider_cell_identity_rotated.get()
            }),
            (K::ProviderCellIdentityRevoked, |m| {
                m.audit_provider_cell_identity_revoked.get()
            }),
            (K::BearerTokenIssued, |m| m.audit_bearer_token_issued.get()),
            (K::BearerTokenRevoked, |m| {
                m.audit_bearer_token_revoked.get()
            }),
            (K::SchedulerPromoted, |m| m.audit_scheduler_promoted.get()),
            (K::BillingRateCardInstalled, |m| {
                m.audit_billing_rate_card_installed.get()
            }),
            (K::EdgeRewritten, |m| m.audit_edge_rewritten.get()),
        ];
        for (kind, getter) in kinds_and_getters.iter() {
            let before = getter(&m);
            m.count_audit_emit(*kind);
            let after = getter(&m);
            assert_eq!(
                after,
                before + 1,
                "kind {:?} did not bump its counter",
                kind
            );
        }
        // Overall counter equals the number of emits we made.
        assert_eq!(
            m.audit_records_emitted.get(),
            kinds_and_getters.len() as u64
        );
    }
}
grafos_observe/metrics.rs

grafos_observe/
metrics.rs