shigoto-types 0.1.7

shigoto — typed primitives (Job, JobId, JobPhase, JobKindId, JobScope, JobSubject, TickReceipt, Snapshot).
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
//! Typed failure classification — the META primitive every long-running
//! pleme-io reconciler consumes.
//!
//! ## The problem this primitive solves
//!
//! A naive reconciler treats every failure as transient — it backs off
//! exponentially and retries forever. That works for "builder unreachable"
//! / "DNS blip" / "network stall" (conditions that fix themselves), but
//! breaks for "missing flake attribute" / "NixOS evaluation error" /
//! "schema mismatch" (conditions that NEVER fix themselves without the
//! operator changing the declaration).
//!
//! The implicit mental model — "obvious permanent failures should stop
//! retrying and surface in cluster status" — becomes the explicit typed
//! thing `FailureKind`.
//!
//! ## Where this primitive lives
//!
//! Per the pleme-io [Compounding Directive](../../blackmatter-pleme/docs/pleme-io-CLAUDE.md)
//! Operating Principle #1 ("solve problems once, in one place, at one
//! time"), this lives in `shigoto-types` — the typed-primitive root of
//! the work-graph crate family. Every consumer (kikai daemon, magma
//! apply engine, tatara-reconciler, pangea-operator) imports from here
//! and shares one classifier + one definition of "Declarative."
//!
//! `shigoto-retry` extends `RetryPolicy::decide` to consult
//! `FailureRecord.kind`: any Declarative failure returns `Deadletter`
//! immediately regardless of attempt budget — the META point.
//!
//! ## Conservative classification
//!
//! `classify` defaults to `Transient` on unknown error shapes. We'd
//! rather retry once or twice extra than wedge a reconciler waiting
//! for an operator to clear a state we should have considered
//! transient. Patterns added to the Declarative set are the documented
//! signatures we've observed in the fleet — extend as new classes
//! surface.

use std::fmt;

use serde::{Deserialize, Serialize};

/// Classification of a reconcile-loop failure.
///
/// Binary by design: every consumer asks one question — "should I keep
/// trying?" If `Transient`, yes (with whatever backoff policy). If
/// `Declarative`, no — the operator-supplied declaration is broken and
/// no amount of retrying will fix it without operator action. Routes
/// through `Deadletter` in `shigoto-retry::RetryDecision`.
///
/// `kind()` (variant → stable lowercase string) and `is_transient` /
/// `is_declarative` (variant predicates) are auto-generated via
/// `#[derive(Discriminant, IsVariant)]` from gen-platform.
///
/// The enum is `#[non_exhaustive]`: code outside this crate must keep a
/// wildcard arm when matching on it, which leaves room to add further
/// variants without a breaking change.
#[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Hash,
    Serialize,
    Deserialize,
    gen_platform::Discriminant,
    gen_platform::IsVariant,
)]
// Configures the derived discriminant accessor: method name `kind`,
// lowercase output.
#[discriminant(method = "kind", case = "lower")]
#[non_exhaustive]
pub enum FailureKind {
    /// Conditions may clear: builder unreachable, network blip, DHCP
    /// not yet allocated, kernel module loading, dependency not yet
    /// ready, rate limit hit, transient API 5xx, etc.
    Transient,

    /// Operator-supplied declaration is broken. Examples: missing
    /// flake attribute, NixOS module evaluation error, missing source
    /// file, type mismatch in option value, reference to a SOPS secret
    /// that doesn't exist, MAC address conflict with a sibling cluster,
    /// schema mismatch in a Terraform provider, etc.
    Declarative,
}

impl Default for FailureKind {
    /// Conservative default: `Transient`. Callers that don't classify
    /// get the "keep trying" behaviour by default — never accidentally
    /// wedge a reconciler at attempt #1 because someone forgot to
    /// classify their error.
    fn default() -> Self {
        Self::Transient
    }
}

impl fmt::Display for FailureKind {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Display keeps the title-cased form (legacy convention) while
        // .kind() (auto-generated) returns the lowercase wire form.
        f.write_str(match self {
            Self::Transient => "Transient",
            Self::Declarative => "Declarative",
        })
    }
}

/// Typed reconcile failure — the shape every long-running daemon
/// stores in its FSM to track consecutive identical failures.
///
/// Companion to `shigoto-retry::FailureRecord` (which carries the
/// attempt + timestamp + raw error string). `Failure` is the
/// classified-and-summarized projection a status surface displays.
///
/// All fields are public, so the bounds described below hold only for
/// values built through [`Failure::from_raw`]; hand-constructed values
/// are not checked.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Failure {
    /// Transient-vs-Declarative verdict for the error.
    pub kind: FailureKind,
    /// Truncated to first 256 chars to keep status output bounded.
    pub message: String,
    /// First few words / canonical signature used for "same error
    /// twice in a row" detection. Stripped of paths + nix store
    /// hashes so consecutive failures with different store-path
    /// digests still match.
    ///
    /// NOTE(review): the `signature` function in this module keeps the
    /// first informative line (capped at 80 chars) and does not appear
    /// to rewrite paths or store hashes — confirm whether that
    /// normalization happens elsewhere or is still to be implemented.
    pub signature: String,
}

impl Failure {
    /// Build a [`Failure`] from a raw error string.
    ///
    /// The three fields are derived independently from `raw`:
    /// `kind` via [`classify`], `message` as `raw` capped at 256
    /// characters, and `signature` via [`signature`].
    #[must_use]
    pub fn from_raw(raw: &str) -> Self {
        Self {
            kind: classify(raw),
            message: truncate(raw, 256),
            signature: signature(raw),
        }
    }
}

/// Map a raw error string onto a [`FailureKind`].
///
/// Returns `Declarative` as soon as `raw` contains any of the known
/// operator-declaration signatures listed below (Nix evaluation
/// errors, missing flake attributes, missing source files, NixOS
/// option misconfigurations, SOPS misses, schema mismatches, VZ
/// configuration rejections, typed-wrapper markers). Anything else —
/// including the empty string — is `Transient`, the conservative
/// default.
///
/// Matching is a plain case-sensitive substring search; no regexes and
/// no normalization of `raw` take place.
#[must_use]
pub fn classify(raw: &str) -> FailureKind {
    // Substrings whose presence marks an error as Declarative.
    const DECLARATIVE_PATTERNS: &[&str] = &[
        // --- Nix evaluation / flake-attribute failures ---
        "does not provide attribute",
        "does not exist",
        "evaluating the attribute",
        "infinite recursion encountered",
        "syntax error",
        "attribute set is missing the attribute",
        "value is null while a set was expected",
        "value is a function while a set was expected",
        "is not allowed to refer to a store path",
        "cannot coerce",
        "while evaluating definitions from",
        // --- NixOS module assertions + option type-checks ---
        // The assert-statement shapes originate in
        // `lib.customisation.nix` when a module-level `assert <expr>`
        // fails, typically because a downstream consumer mis-used a
        // typed option.
        "in the condition of the assert statement",
        "assertion failed",
        "The option `",
        "is missing the attribute `",
        // --- SOPS / secret resolution ---
        "missing secret",
        "could not decrypt",
        // --- Cargo / build-time failures inside an image build ---
        "could not find Cargo.toml",
        // --- Schema mismatches (Terraform / Crossplane / K8s admission) ---
        "schema validation failed",
        "unknown attribute",
        "field required",
        // --- Typed-failure wrapper markers ---
        // A consumer (kikai preflight, magma apply, …) that already
        // classified internally and bubbled the error up via anyhow
        // embeds one of these markers, so re-classification here
        // arrives at the same verdict. Producer side:
        // `kikai/src/up.rs::preflight`.
        "preflight failed (Declarative)",
        "[Declarative]",
        // --- Apple Virtualization.framework config rejections (kasou → VZ) ---
        // VZ reports these when the operator-supplied VM config is
        // structurally invalid (storage attachment shape, CPU/memory
        // bounds, network mode mismatch, etc.). Retry semantics match
        // a Nix eval error: the declaration is broken and looping at
        // 300s intervals cannot help. They surface through kasou's
        // `KasouError::OperationFailed`/`Framework` wrappers as
        // `"... Invalid virtual machine configuration ..."`.
        // 2026-05-20 incident: the kikai daemon retried 149 times
        // before SIGTERM because this class was treated as Transient.
        "Invalid virtual machine configuration",
        "storage device attachment is invalid",
    ];

    // First hit wins; the order of the table does not affect the result.
    for needle in DECLARATIVE_PATTERNS {
        if raw.contains(*needle) {
            return FailureKind::Declarative;
        }
    }
    FailureKind::Transient
}

/// Compute a stable signature of an error message.
///
/// The signature is the first informative line of `raw`, reduced as
/// follows:
///
/// 1. every line is trimmed of surrounding whitespace;
/// 2. all leading `error:` / `warning:` markers are removed from the
///    line, however many are stacked and with or without a trailing
///    space (`"error: error: x"` → `"x"`);
/// 3. the first line that is non-empty after that is kept;
/// 4. the result is capped at 80 characters (chars, not bytes), with
///    no ellipsis appended.
///
/// Returns an empty string when no line carries any content. Multi-line
/// nix errors of the form `error:\n   <body>` therefore yield `<body>`.
///
/// Daemons use signature equality to detect "same error twice in a
/// row" → declaration is broken, stop retrying.
///
/// Note: this function does not rewrite paths, store hashes, or line
/// numbers; two errors compare equal only if their first informative
/// lines are textually identical after the steps above.
#[must_use]
pub fn signature(raw: &str) -> String {
    /// Trim `line` and peel off every leading `error:` / `warning:`
    /// marker, returning whatever content follows.
    fn strip_markers(line: &str) -> &str {
        const MARKERS: [&str; 2] = ["error:", "warning:"];
        let mut rest = line.trim();
        // Each pass removes one whole marker, so the loop terminates.
        while let Some(tail) = MARKERS.iter().find_map(|m| rest.strip_prefix(*m)) {
            rest = tail.trim_start();
        }
        rest
    }

    // Marker-only lines collapse to "" and are skipped with the blanks.
    let core = raw
        .lines()
        .map(strip_markers)
        .find(|line| !line.is_empty())
        .unwrap_or("");

    // Character-based cap so multi-byte text is never split mid-char.
    core.chars().take(80).collect()
}

/// Return at most the first `max` characters of `s` as an owned string.
///
/// Counts Unicode scalar values rather than bytes, so the cut never
/// lands inside a multi-byte character. Nothing (no ellipsis or other
/// marker) is appended when the input is shortened.
fn truncate(s: &str, max: usize) -> String {
    // The byte offset of the character at index `max` is where the kept
    // prefix ends; if there is no such character, `s` already fits.
    match s.char_indices().nth(max) {
        Some((cut, _)) => s[..cut].to_string(),
        None => s.to_string(),
    }
}

/// Unit tests for classification, signature extraction, truncation
/// bounds, and serde round-tripping.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn classifies_missing_flake_attribute_as_declarative() {
        let err = "nix build failed: error: flake does not provide attribute 'packages.aarch64-linux.engenho-local-image'";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    #[test]
    fn classifies_missing_source_file_as_declarative() {
        let err = "error: path '/nix/store/abc-source/images/cluster-image.nix' does not exist";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    #[test]
    fn classifies_nixos_eval_error_as_declarative() {
        let err = "error: The option `blackmatter` does not exist. Definition values: …";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    #[test]
    fn classifies_terraform_schema_mismatch_as_declarative() {
        let err = "schema validation failed: unknown attribute 'foo' in resource 'bar'";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    #[test]
    fn classifies_builder_unreachable_as_transient() {
        let err = "ssh: connect to host rio port 22: Connection refused";
        assert_eq!(classify(err), FailureKind::Transient);
    }

    #[test]
    fn classifies_network_timeout_as_transient() {
        let err = "curl: (28) Operation timed out after 30000 milliseconds";
        assert_eq!(classify(err), FailureKind::Transient);
    }

    #[test]
    fn classifies_unknown_as_transient_by_default() {
        let err = "something went wrong, nobody knows what";
        assert_eq!(classify(err), FailureKind::Transient);
    }

    /// Regression: nix assert-statement failures from
    /// `lib/customisation.nix` are operator-declaration bugs.
    /// Observed in the wild when engenho-local's NixOS module
    /// evaluation hit a missing cross-module wiring assertion.
    #[test]
    fn classifies_nix_assert_statement_as_declarative() {
        let err = "error:\n       … in the condition of the assert statement\n         at /nix/store/.../lib/customisation.nix:433:9:";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    /// Regression: typed-wrapper marker survives anyhow re-wrap.
    /// When `kikai/src/up.rs::preflight` classifies an inner error
    /// as Declarative and wraps it via `anyhow!("preflight failed
    /// ({}): {}", f.kind, f.message)`, the wrapper marker must
    /// preserve the classification through daemon re-classification.
    #[test]
    fn classifies_typed_wrapper_marker_as_declarative() {
        let err = "preflight failed (Declarative): nix eval failed";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    /// Regression: VZ/kasou config rejections are declarative.
    /// 2026-05-20 incident on `cid`: kikai daemon retried 149 times
    /// over ~40 hours because the kasou-wrapped VZ error
    /// `"kasou start failed: VM operation failed: start failed:
    /// Invalid virtual machine configuration. The storage device
    /// attachment is invalid."` was classified Transient. Adding
    /// the VZ-side umbrella + the specific storage-attachment phrase
    /// to DECLARATIVE_PATTERNS so the daemon surfaces
    /// BlockedDeclarative after two identical-signature retries.
    #[test]
    fn classifies_vz_invalid_configuration_as_declarative() {
        let err = "kasou start failed: VM operation failed: start failed: \
                   Invalid virtual machine configuration. The storage device \
                   attachment is invalid.";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    #[test]
    fn classifies_vz_storage_attachment_rejection_as_declarative() {
        let err = "kasou: storage device attachment is invalid";
        assert_eq!(classify(err), FailureKind::Declarative);
    }

    /// Negative case: VZ runtime errors (guest stopped, host resource
    /// pressure, etc.) are NOT declarative. The patterns above must
    /// be specific enough not to catch every kasou error.
    #[test]
    fn classifies_vz_runtime_stop_as_transient() {
        let err = "kasou: Internal Virtualization error. The virtual \
                   machine stopped unexpectedly.";
        assert_eq!(classify(err), FailureKind::Transient);
    }

    /// Regression: multi-line nix error (`error:\n   body`) signature
    /// should surface the body, not the bare "error:" prefix.
    #[test]
    fn signature_walks_past_bare_error_prefix() {
        let err = "error:\n       … in the condition of the assert statement\n         at /nix/store/...:433:9:";
        let sig = signature(err);
        assert!(sig.contains("assert statement"), "got: {sig}");
        assert_ne!(sig, "error:");
    }

    #[test]
    fn signature_strips_error_prefix() {
        let raw = "error: does not provide attribute 'packages.aarch64-linux.engenho-local-image'";
        let sig = signature(raw);
        assert!(!sig.starts_with("error:"));
        assert!(sig.contains("does not provide"));
    }

    #[test]
    fn signature_is_stable_across_runs() {
        let raw = "error: flake does not provide attribute 'x'";
        assert_eq!(signature(raw), signature(raw));
    }

    /// Signatures are capped at exactly 80 characters; no ellipsis is
    /// appended, so there is no slack to allow for.
    #[test]
    fn signature_truncates_long_messages() {
        let raw = "error: ".to_string() + &"a".repeat(500);
        let sig = signature(&raw);
        assert_eq!(sig.chars().count(), 80);
    }

    #[test]
    fn failure_from_raw_classifies_and_summarizes() {
        let f = Failure::from_raw("error: does not provide attribute 'foo'");
        assert_eq!(f.kind, FailureKind::Declarative);
        assert!(f.signature.contains("does not provide"));
    }

    #[test]
    fn failure_serializes_via_serde() {
        let f = Failure::from_raw("error: connection refused");
        let json = serde_json::to_string(&f).expect("serialize");
        let back: Failure = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back, f);
    }

    /// `Failure::message` is capped at exactly 256 characters, matching
    /// the bound documented on the field.
    #[test]
    fn failure_truncates_long_message() {
        let f = Failure::from_raw(&"x".repeat(1000));
        assert_eq!(f.message.chars().count(), 256);
    }

    /// Property: classify is deterministic — same input → same output.
    #[test]
    fn classify_is_deterministic() {
        for input in [
            "does not provide attribute",
            "Connection refused",
            "anything else",
            "evaluating the attribute",
            "schema validation failed",
        ] {
            assert_eq!(classify(input), classify(input));
        }
    }

    #[test]
    fn failure_kind_displays() {
        assert_eq!(FailureKind::Transient.to_string(), "Transient");
        assert_eq!(FailureKind::Declarative.to_string(), "Declarative");
    }
}