grafos_observe/
dataplane.rs

1//! Phase 219 data-plane span vocabulary.
2//!
3//! Producer-side data-plane operations (FBMU, FBBU, GPU session, runtime
4//! capability cache) emit canonical [`ResourceSpan`]s using the helpers
5//! in this module. Span names come from the [`PHASE_219_SPAN_NAMES`]
6//! contract; lease IDs are kept in the span body, never on Prometheus
7//! labels.
8//!
9//! The vocabulary lives here — not in `grafos-scheduler` — so that
10//! data-plane producers (e.g. `grafos-runtime`, `grafos-std`) can emit
11//! spans without depending on the scheduler crate. This aligns with the
12//! v1.1 design (observability is substrate-level, not policy-level) and
13//! with §3.3 invariant 2 (every edge has a trace context, so every edge
14//! realization is a candidate emission site).
15//!
16//! [`PHASE_219_SPAN_NAMES`]: crate::PHASE_219_SPAN_NAMES
17
18use alloc::string::ToString;
19
20use grafos_core::ResourceKind;
21
22use crate::event::OpType;
23use crate::export::emit_span;
24use crate::span::{ResourceSpan, SpanStatus};
25use crate::trace::{TraceContext, TraceContextError};
26
27fn resource_kind_label(kind: ResourceKind) -> &'static str {
28    match kind {
29        ResourceKind::Mem => "mem",
30        ResourceKind::Block => "block",
31        ResourceKind::Net => "net",
32        ResourceKind::Cpu => "cpu",
33        ResourceKind::Gpu => "gpu",
34        ResourceKind::GpuMem => "gpumem",
35        ResourceKind::Tasklet => "tasklet",
36    }
37}
38
39fn optional_trace_context(
40    traceparent: Option<&str>,
41) -> Result<Option<TraceContext>, TraceContextError> {
42    match traceparent {
43        Some(value) => TraceContext::from_w3c_string(value).map(Some),
44        None => Ok(None),
45    }
46}
47
/// Typed identity for the Phase 219 data-plane span families.
///
/// There is exactly one variant per data-plane entry in
/// [`PHASE_219_SPAN_NAMES`], which lets callers pick a family by enum
/// variant instead of passing span-name strings around.
///
/// [`PHASE_219_SPAN_NAMES`]: crate::PHASE_219_SPAN_NAMES
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataplaneSpanKind {
    /// FBMU memory data-plane operation (`fabricbios.data.fbmu`).
    Fbmu,
    /// FBBU block data-plane operation (`fabricbios.data.fbbu`).
    Fbbu,
    /// GPU session lifecycle/IO (`fabricbios.data.gpu_session`).
    GpuSession,
    /// Capability cache hit/miss (`grafos.runtime.capability_cache`).
    CapabilityCache,
}

impl DataplaneSpanKind {
    /// Returns the fixed span name this family is exported under.
    ///
    /// The strings are part of the Phase 219 contract and must not be
    /// changed without updating that contract.
    pub fn span_name(self) -> &'static str {
        match self {
            Self::Fbmu => "fabricbios.data.fbmu",
            Self::Fbbu => "fabricbios.data.fbbu",
            Self::GpuSession => "fabricbios.data.gpu_session",
            Self::CapabilityCache => "grafos.runtime.capability_cache",
        }
    }
}
76
77/// Map a producer-side [`OpType`] to the Phase 219 data-plane span
78/// family it belongs to. `TaskletSubmit` is a control-plane submission
79/// and has no data-plane span kind — returning `None` keeps the
80/// vocabulary closed (callers must handle that variant explicitly).
81///
82/// The match is exhaustive: a future [`OpType`] variant produces a
83/// compile error here, surfacing the missing mapping at build time
84/// rather than at trace export time.
85pub fn dataplane_span_kind_for_op_type(op_type: OpType) -> Option<DataplaneSpanKind> {
86    match op_type {
87        OpType::Read | OpType::Write => Some(DataplaneSpanKind::Fbmu),
88        OpType::ReadBlock | OpType::WriteBlock => Some(DataplaneSpanKind::Fbbu),
89        OpType::GpuSubmit => Some(DataplaneSpanKind::GpuSession),
90        OpType::TaskletSubmit => None,
91    }
92}
93
94/// Stable operation label for a producer-side [`OpType`]. Mirrors
95/// [`OpType`]'s [`fmt::Display`] impl but is `&'static str` so callers
96/// can set it as a span attribute without allocation. The match is
97/// exhaustive for the same reason as [`dataplane_span_kind_for_op_type`].
98///
99/// [`fmt::Display`]: core::fmt::Display
100pub fn dataplane_operation_label(op_type: OpType) -> &'static str {
101    match op_type {
102        OpType::Read => "read",
103        OpType::Write => "write",
104        OpType::ReadBlock => "read_block",
105        OpType::WriteBlock => "write_block",
106        OpType::GpuSubmit => "gpu_submit",
107        OpType::TaskletSubmit => "tasklet_submit",
108    }
109}
110
111/// Build a data-plane span using the Phase 219 canonical span names.
112/// The helper keeps the lease id in the span body (never a Prometheus
113/// label) and routes resource kind / operation / outcome through stable
114/// attribute keys so consumers see the same shape across families.
115pub fn build_dataplane_span(
116    trace_context: TraceContext,
117    kind: DataplaneSpanKind,
118    operation: &str,
119    outcome: &str,
120    resource_kind: ResourceKind,
121    lease_id: Option<u128>,
122    start_time_unix_us: u64,
123    end_time_unix_us: u64,
124) -> ResourceSpan {
125    let mut span = ResourceSpan::new(kind.span_name(), trace_context);
126    span.start_time_unix_us = start_time_unix_us;
127    span.end_time_unix_us = end_time_unix_us;
128    span.status = if outcome == "failed" || outcome == "rejected" {
129        SpanStatus::Error(outcome.to_string())
130    } else {
131        SpanStatus::Ok
132    };
133    span.set_attribute("operation", operation);
134    span.set_attribute("outcome", outcome);
135    span.set_attribute("resource_kind", resource_kind_label(resource_kind));
136    if let Some(lease_id) = lease_id {
137        span.add_lease_id(lease_id);
138    }
139    span
140}
141
142/// Emit a data-plane span through grafos-observe's shared span exporter.
143pub fn emit_dataplane_span(
144    trace_context: TraceContext,
145    kind: DataplaneSpanKind,
146    operation: &str,
147    outcome: &str,
148    resource_kind: ResourceKind,
149    lease_id: Option<u128>,
150    start_time_unix_us: u64,
151    end_time_unix_us: u64,
152) {
153    emit_span(build_dataplane_span(
154        trace_context,
155        kind,
156        operation,
157        outcome,
158        resource_kind,
159        lease_id,
160        start_time_unix_us,
161        end_time_unix_us,
162    ));
163}
164
165/// Build a data-plane span from an optional W3C traceparent.
166///
167/// Absent trace context returns `Ok(None)`; invalid trace context
168/// returns a typed error. The helper intentionally does not synthesize
169/// root contexts — data-plane ops without an inbound trace are surfaced
170/// as "no span" so callers can decide between linking and root-creation
171/// explicitly (v1.1 §3.3 invariant 2 / no silent best-effort loss).
172pub fn build_dataplane_span_from_traceparent(
173    traceparent: Option<&str>,
174    kind: DataplaneSpanKind,
175    operation: &str,
176    outcome: &str,
177    resource_kind: ResourceKind,
178    lease_id: Option<u128>,
179    start_time_unix_us: u64,
180    end_time_unix_us: u64,
181) -> Result<Option<ResourceSpan>, TraceContextError> {
182    Ok(optional_trace_context(traceparent)?.map(|trace_context| {
183        build_dataplane_span(
184            trace_context,
185            kind,
186            operation,
187            outcome,
188            resource_kind,
189            lease_id,
190            start_time_unix_us,
191            end_time_unix_us,
192        )
193    }))
194}
195
196/// Emit a data-plane span from an optional W3C traceparent.
197pub fn emit_dataplane_span_from_traceparent(
198    traceparent: Option<&str>,
199    kind: DataplaneSpanKind,
200    operation: &str,
201    outcome: &str,
202    resource_kind: ResourceKind,
203    lease_id: Option<u128>,
204    start_time_unix_us: u64,
205    end_time_unix_us: u64,
206) -> Result<(), TraceContextError> {
207    if let Some(span) = build_dataplane_span_from_traceparent(
208        traceparent,
209        kind,
210        operation,
211        outcome,
212        resource_kind,
213        lease_id,
214        start_time_unix_us,
215        end_time_unix_us,
216    )? {
217        emit_span(span);
218    }
219    Ok(())
220}
221
#[cfg(test)]
mod tests {
    use super::*;
    use crate::PHASE_219_SPAN_NAMES;
    use alloc::format;
    use alloc::vec;

    /// Root trace context built from a fixed 24-byte input, shared by
    /// the span-building tests.
    fn test_ctx() -> TraceContext {
        TraceContext::new_root(&[0x42; 24])
    }

    /// Value stored under `key` in the span's attribute list, if any.
    fn attribute<'a>(span: &'a ResourceSpan, key: &str) -> Option<&'a str> {
        span.attributes
            .iter()
            .find(|(name, _)| name == key)
            .map(|(_, value)| value.as_str())
    }

    #[test]
    fn dataplane_span_kind_maps_to_phase_219_contract_names() {
        let expected_names = [
            (DataplaneSpanKind::Fbmu, "fabricbios.data.fbmu"),
            (DataplaneSpanKind::Fbbu, "fabricbios.data.fbbu"),
            (DataplaneSpanKind::GpuSession, "fabricbios.data.gpu_session"),
            (
                DataplaneSpanKind::CapabilityCache,
                "grafos.runtime.capability_cache",
            ),
        ];

        // Every family must be part of the published contract...
        for (kind, _) in expected_names {
            assert!(
                PHASE_219_SPAN_NAMES.contains(&kind.span_name()),
                "data-plane span name {} must be in the Phase 219 contract",
                kind.span_name(),
            );
        }
        // ...and must carry exactly the pinned name.
        for (kind, expected) in expected_names {
            assert_eq!(kind.span_name(), expected);
        }
    }

    #[test]
    fn dataplane_span_records_lease_id_resource_kind_and_outcome() {
        let span = build_dataplane_span(
            test_ctx(),
            DataplaneSpanKind::Fbmu,
            "bind",
            "bound",
            ResourceKind::Mem,
            Some(0x1234),
            50,
            150,
        );

        assert_eq!(span.name, "fabricbios.data.fbmu");
        assert_eq!(span.duration_us(), 100);
        assert_eq!(span.lease_ids, vec![0x1234]);
        assert!(matches!(span.status, SpanStatus::Ok));
        assert_eq!(attribute(&span, "operation"), Some("bind"));
        assert_eq!(attribute(&span, "outcome"), Some("bound"));
        assert_eq!(attribute(&span, "resource_kind"), Some("mem"));
    }

    #[test]
    fn dataplane_span_marks_failed_and_rejected_outcomes_as_error() {
        let failed = build_dataplane_span(
            test_ctx(),
            DataplaneSpanKind::Fbbu,
            "read",
            "failed",
            ResourceKind::Block,
            Some(0x9),
            1,
            2,
        );
        assert!(matches!(failed.status, SpanStatus::Error(_)));

        let rejected = build_dataplane_span(
            test_ctx(),
            DataplaneSpanKind::CapabilityCache,
            "lookup",
            "rejected",
            ResourceKind::Net,
            None,
            1,
            2,
        );
        assert!(matches!(rejected.status, SpanStatus::Error(_)));
        assert!(
            rejected.lease_ids.is_empty(),
            "no lease id provided → span lease list stays empty"
        );
    }

    #[test]
    fn dataplane_span_uses_gpu_session_name_and_gpu_label() {
        let span = build_dataplane_span(
            test_ctx(),
            DataplaneSpanKind::GpuSession,
            "open",
            "ok",
            ResourceKind::Gpu,
            Some(0xab),
            10,
            30,
        );

        assert_eq!(span.name, "fabricbios.data.gpu_session");
        assert_eq!(attribute(&span, "resource_kind"), Some("gpu"));
    }

    #[test]
    fn dataplane_traceparent_helper_rejects_invalid_context() {
        let result = build_dataplane_span_from_traceparent(
            Some("not-a-traceparent"),
            DataplaneSpanKind::Fbmu,
            "bind",
            "bound",
            ResourceKind::Mem,
            Some(1),
            1,
            2,
        );

        let err = result.expect_err("invalid traceparent must not silently drop the span");
        assert!(matches!(err, TraceContextError::InvalidFormat));
    }

    #[test]
    fn dataplane_traceparent_helper_omits_absent_context() {
        let result = build_dataplane_span_from_traceparent(
            None,
            DataplaneSpanKind::CapabilityCache,
            "lookup",
            "hit",
            ResourceKind::Net,
            None,
            1,
            2,
        );

        let span = result.expect("absent traceparent is a valid no-span case");
        assert!(span.is_none());
    }

    #[test]
    fn dataplane_traceparent_helper_uses_supplied_context() {
        let ctx = test_ctx();
        let traceparent = ctx.to_w3c_string();

        let span = build_dataplane_span_from_traceparent(
            Some(&traceparent),
            DataplaneSpanKind::Fbbu,
            "write",
            "ok",
            ResourceKind::Block,
            Some(0x77),
            10,
            40,
        )
        .expect("valid traceparent")
        .expect("span should be built when traceparent is present");

        assert_eq!(span.name, "fabricbios.data.fbbu");
        assert_eq!(span.trace_context.trace_id, ctx.trace_id);
        assert_eq!(span.trace_context.span_id, ctx.span_id);
        assert_eq!(span.lease_ids, vec![0x77]);
        assert_eq!(attribute(&span, "resource_kind"), Some("block"));
    }

    #[test]
    fn dataplane_span_kind_for_op_type_maps_memory_ops_to_fbmu() {
        for op_type in [OpType::Read, OpType::Write] {
            assert_eq!(
                dataplane_span_kind_for_op_type(op_type),
                Some(DataplaneSpanKind::Fbmu)
            );
        }
    }

    #[test]
    fn dataplane_span_kind_for_op_type_maps_block_ops_to_fbbu() {
        for op_type in [OpType::ReadBlock, OpType::WriteBlock] {
            assert_eq!(
                dataplane_span_kind_for_op_type(op_type),
                Some(DataplaneSpanKind::Fbbu)
            );
        }
    }

    #[test]
    fn dataplane_span_kind_for_op_type_maps_gpu_submit_to_gpu_session() {
        let kind = dataplane_span_kind_for_op_type(OpType::GpuSubmit);
        assert_eq!(kind, Some(DataplaneSpanKind::GpuSession));
    }

    #[test]
    fn dataplane_span_kind_for_op_type_returns_none_for_tasklet_submit() {
        // TaskletSubmit is a control-plane submission and has no
        // data-plane span family in the Phase 219 contract. Callers are
        // expected to handle `None` explicitly instead of falling back
        // to a data-plane span name that does not apply.
        let kind = dataplane_span_kind_for_op_type(OpType::TaskletSubmit);
        assert_eq!(kind, None);
    }

    #[test]
    fn dataplane_operation_label_matches_op_type_display_strings() {
        let expected_labels = [
            (OpType::Read, "read"),
            (OpType::Write, "write"),
            (OpType::ReadBlock, "read_block"),
            (OpType::WriteBlock, "write_block"),
            (OpType::GpuSubmit, "gpu_submit"),
            (OpType::TaskletSubmit, "tasklet_submit"),
        ];

        for (op_type, expected) in expected_labels {
            assert_eq!(dataplane_operation_label(op_type), expected);
        }
    }

    #[test]
    fn dataplane_operation_label_agrees_with_op_type_display_impl() {
        // Pins the &'static str label to the OpType Display output. If
        // the two ever diverge for a listed variant, this test fails
        // when the suite runs.
        let all_op_types = [
            OpType::Read,
            OpType::Write,
            OpType::ReadBlock,
            OpType::WriteBlock,
            OpType::GpuSubmit,
            OpType::TaskletSubmit,
        ];

        for op_type in all_op_types {
            let displayed = format!("{}", op_type);
            assert_eq!(
                dataplane_operation_label(op_type),
                displayed,
                "&'static str label drifted from Display for {op_type:?}",
            );
        }
    }
}