grafos_observe/
contract.rs

1//! Stable Phase 219 observability contract names.
2//!
3//! These constants mirror `docs/observability-metric-contract.md` and
4//! `docs/observability-event-trace-contract.md`. Producers should use
5//! these names instead of repeating string literals so metrics, events,
6//! traces, dashboards, and docs stay aligned.
7
/// Default production Prometheus labels must not use these fields.
///
/// They are intentionally high-cardinality identifiers and belong in
/// spans, events, audit records, scoped APIs, dashboard JSON, or
/// debug-only surfaces.
///
/// [`is_forbidden_default_metric_label`] checks a candidate label
/// against this list with an exact string comparison, so entries must
/// be spelled exactly as producers would emit them.
pub const FORBIDDEN_DEFAULT_METRIC_LABELS: &[&str] = &[
    // Lease and trace identifiers.
    "lease_id",
    "trace_id",
    "span_id",
    // Tenant identifiers.
    "tenant_id",
    "tenant_name",
    // Pod and service-account identifiers.
    "pod_uid",
    "pod_name",
    "service_account",
    // Resource, operation, and node identifiers.
    "logical_resource_name",
    "operation_hash",
    "node_id",
];
26
/// Existing aggregate metric families rendered by the current
/// `PrometheusExporter`.
///
/// Entries are bare family names; every name carries the `grafos_`
/// prefix.
pub const STABLE_METRIC_FAMILIES: &[&str] = &[
    // Lease families.
    "grafos_leases_active",
    "grafos_leases_total",
    "grafos_leases_expired_total",
    "grafos_leases_fenced_total",
    "grafos_leases_revoked_total",
    // Operation and byte-volume families.
    "grafos_ops_total",
    "grafos_ops_errors_total",
    "grafos_bytes_read_total",
    "grafos_bytes_written_total",
    // Latency families (`_us` suffix — presumably microseconds; confirm
    // against the metric contract doc).
    "grafos_op_latency_us",
    "grafos_bind_latency_us",
    "grafos_renew_latency_us",
    "grafos_revoke_latency_us",
    "grafos_teardown_latency_us",
    // Authentication, replay, token, and stale-access families.
    "grafos_auth_failures_total",
    "grafos_replay_rejections_total",
    "grafos_token_validations_total",
    "grafos_token_failures_total",
    "grafos_stale_access_rejections_total",
    // Tasklet families.
    "grafos_tasklet_submits_total",
    "grafos_tasklet_completions_total",
    "grafos_tasklet_failures_total",
    "grafos_tasklet_submit_latency_us",
    "grafos_tasklet_exec_latency_us",
    "grafos_tasklet_duration_us",
];
56
/// Required dimensional metric families for Phase 219 producers.
///
/// Entries are bare family names; every name carries the `grafos_`
/// prefix.
pub const PHASE_219_METRIC_FAMILIES: &[&str] = &[
    // Lease operation families.
    "grafos_lease_operations_total",
    "grafos_lease_operation_latency_us",
    // Scheduler admission, preemption, and placement families.
    "grafos_scheduler_admissions_total",
    "grafos_scheduler_admission_latency_us",
    "grafos_scheduler_pending_admissions",
    "grafos_scheduler_preemptions_total",
    "grafos_scheduler_preempted_capacity_bytes_total",
    "grafos_scheduler_placement_candidates_total",
    "grafos_scheduler_placement_latency_us",
    // Resource capacity, usage, and pressure families.
    "grafos_resource_capacity_bytes",
    "grafos_resource_used_bytes",
    "grafos_resource_pressure_ratio",
    // Failure-domain and replica families.
    "grafos_failure_domain_health",
    "grafos_replica_lag_records",
    // Economics families.
    "grafos_economics_generation_total",
    "grafos_economics_cap_rejections_total",
];
76
/// Required span names for Phase 219 admission-to-lease traces.
///
/// Names are dot-separated and start with either `grafos.` or
/// `fabricbios.`.
pub const PHASE_219_SPAN_NAMES: &[&str] = &[
    // `grafos.api` and `grafos.scheduler` spans.
    "grafos.api.request",
    "grafos.scheduler.admission",
    "grafos.scheduler.placement",
    "grafos.scheduler.preemption",
    "grafos.scheduler.lease_request",
    // `fabricbios.control` spans.
    "fabricbios.control.lease_alloc",
    "fabricbios.control.lease_renew",
    "fabricbios.control.lease_revoke",
    "fabricbios.control.capability_issue",
    // `fabricbios.data` spans.
    "fabricbios.data.fbmu",
    "fabricbios.data.fbbu",
    "fabricbios.data.gpu_session",
    // Remaining `grafos` spans: runtime, audit, and economics.
    "grafos.runtime.capability_cache",
    "grafos.audit.emit",
    "grafos.economics.publish_generation",
];
95
/// Required event names for Phase 219 lifecycle and decision streams.
///
/// Names are plain `snake_case` identifiers with no namespace prefix.
pub const PHASE_219_EVENT_NAMES: &[&str] = &[
    // Lease lifecycle and teardown events.
    "lease_acquired",
    "lease_dropped",
    "lease_expired",
    "lease_revoked",
    "lease_fenced",
    "teardown_failed",
    // Operation outcome events.
    "op_completed",
    "op_failed",
    // Admission, placement, preemption, and cross-state decision events.
    "admission_approved",
    "admission_denied",
    "placement_decision",
    "preemption_triggered",
    "cross_state_disagreement_resolved",
    // Authentication, replay, and token events.
    "auth_failed",
    "replay_rejected",
    "token_validation_failed",
    "token_minted",
    // Scheduler election and promotion events.
    "scheduler_election_lost",
    "scheduler_promotion_failed",
    "scheduler_stale_leader_detected",
    "scheduler_promoted",
];
120
121/// Return true when `label` is forbidden on default production metric
122/// families.
123pub fn is_forbidden_default_metric_label(label: &str) -> bool {
124    FORBIDDEN_DEFAULT_METRIC_LABELS.contains(&label)
125}
126
#[cfg(test)]
mod tests {
    use super::*;

    /// Panic, naming the first absent entry, unless every string in
    /// `required` is present in `contract`.
    fn assert_contains_all(contract: &[&str], required: &[&str]) {
        let first_missing = required
            .iter()
            .find(|needle| !contract.contains(*needle));
        if let Some(needle) = first_missing {
            panic!("observability contract is missing {needle}");
        }
    }

    /// A representative subset of the Phase 219 metric families stays listed.
    #[test]
    fn phase_219_metric_contract_contains_required_families() {
        let required = [
            "grafos_lease_operations_total",
            "grafos_scheduler_admissions_total",
            "grafos_scheduler_preemptions_total",
            "grafos_scheduler_placement_candidates_total",
            "grafos_resource_pressure_ratio",
            "grafos_failure_domain_health",
            "grafos_economics_generation_total",
        ];
        assert_contains_all(PHASE_219_METRIC_FAMILIES, &required);
    }

    /// A representative subset of the already-exported families stays listed.
    #[test]
    fn stable_metric_contract_contains_existing_exported_families() {
        let required = [
            "grafos_leases_active",
            "grafos_leases_total",
            "grafos_leases_revoked_total",
            "grafos_revoke_latency_us",
            "grafos_tasklet_duration_us",
        ];
        assert_contains_all(STABLE_METRIC_FAMILIES, &required);
    }

    /// Each sampled identifier is rejected as a default metric label.
    #[test]
    fn forbidden_metric_labels_cover_high_cardinality_identifiers() {
        let high_cardinality = [
            "lease_id",
            "trace_id",
            "tenant_id",
            "tenant_name",
            "pod_uid",
            "logical_resource_name",
            "operation_hash",
            "node_id",
        ];
        high_cardinality.iter().for_each(|label| {
            assert!(
                is_forbidden_default_metric_label(label),
                "{label} must stay out of default production metric labels"
            );
        });
    }

    /// The spans along the admission-to-lease path stay listed.
    #[test]
    fn phase_219_span_contract_contains_admission_to_lease_path() {
        let required = [
            "grafos.api.request",
            "grafos.scheduler.admission",
            "grafos.scheduler.placement",
            "grafos.scheduler.lease_request",
            "fabricbios.control.lease_alloc",
            "fabricbios.control.capability_issue",
            "grafos.audit.emit",
        ];
        assert_contains_all(PHASE_219_SPAN_NAMES, &required);
    }

    /// A representative subset of lifecycle and decision events stays listed.
    #[test]
    fn phase_219_event_contract_contains_lifecycle_and_decision_events() {
        let required = [
            "lease_acquired",
            "lease_revoked",
            "lease_fenced",
            "admission_denied",
            "placement_decision",
            "preemption_triggered",
            "cross_state_disagreement_resolved",
            "scheduler_promoted",
        ];
        assert_contains_all(PHASE_219_EVENT_NAMES, &required);
    }
}
219        );
220    }
221}