dirge-agent 0.7.4

Minimalistic coding agent written in Rust, optimized for memory footprint and performance
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
use std::sync::LazyLock;
use std::time::Duration;

use regex::Regex;

/// B3-2: match an HTTP 5xx status anchored by a structural
/// HTTP-context marker. Avoids false-positives on bare 5xx-shaped
/// numbers in non-HTTP text (e.g. "processed 500 items").
///
/// The pattern has no case-insensitivity flag and every marker is
/// spelled in lowercase, so it must be run against an already
/// lowercased message (`classify_error` passes its lowercased copy).
///
/// Two alternatives are accepted:
/// 1. marker-then-status — one of `status`, `status_code`, `code`,
///    `error`, `http`, `response`, `request`, `returned`, `returns`,
///    then optional whitespace, at most one of `:` `=` `-`, optional
///    whitespace, a `5` followed by exactly two digits, and finally a
///    non-digit or end of input (so `5000` is rejected).
/// 2. status-then-reason — start of input or a non-digit, a `5`
///    followed by two digits, whitespace, then one of `service`,
///    `gateway`, `internal`, `bad`, `server`.
///
/// Patterns observed from real rig/reqwest errors (shown with their
/// original casing):
///   "503 Service Unavailable"        — leading status + reason
///   "Http status: 500"               — status: prefix
///   "status=502"                     — status= prefix
///   "error 504: ..."                 — error prefix
///   "(status_code=500)"              — status_code= prefix
///   "code: 500"                      — bare code: prefix
///   "received http 500"              — http prefix
///   "5xx server error response"      — already lowercase
///
/// NOTE(review): the last example contains the literal text "5xx",
/// which has no digits and is therefore not matched by this pattern;
/// presumably it is caught by the "server error" substring check in
/// `classify_error` — confirm.
/// NOTE(review): the markers are not word-boundary anchored, so a
/// marker embedded in a longer word (e.g. "barcode 500") also matches.
///
/// Compiled lazily on first use; the `expect` would panic at that
/// point if the literal ever stopped compiling.
static STATUS_5XX_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r"(?x)
        (?:
            # prefix-anchored: status / code / error / http /
            # response / request / returned, with optional
            # `:`/`=`/`-`/whitespace between marker and number.
            (?:status(?:_code)?|code|error|http|response|request|returned|returns)
            \s*[:=\-]?\s*
            5\d{2}
            (?:\D|$)
        )
        |
        (?:
            # leading status + HTTP reason phrase (5xx Service / 5xx
            # Gateway / 5xx Internal / 5xx Bad / 5xx Server).
            (?:^|\D)
            5\d{2}
            \s+
            (?:service|gateway|internal|bad|server)
        )
        ",
    )
    .expect("static regex compiles")
});

/// Coarse category assigned to a provider/transport error message by
/// `classify_error`.
///
/// `RecoveryPolicy::should_retry` treats only [`ErrorKind::Network`]
/// and [`ErrorKind::RateLimit`] as retryable; every other kind fails
/// immediately.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so the kind can
/// be used in `Eq`-bounded generics and as a map/set key (e.g. for
/// per-kind counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ErrorKind {
    /// The request did not fit the model's context window.
    ContextLength,
    /// The provider throttled the request (rate limit, HTTP 429,
    /// resource exhausted, overloaded). Retryable.
    RateLimit,
    /// Transport-level or server-side failure (HTTP 5xx, HTML error
    /// pages, connection / TLS / timeout / body-decode errors).
    /// Retryable.
    Network,
    /// Credentials were rejected (HTTP 401/403, invalid key) or the
    /// billing account is exhausted. Not retryable.
    Auth,
    /// Anything the classifier did not recognise. Not retryable.
    Other,
}

/// Retry budget and backoff schedule applied to transient provider
/// failures.
///
/// `should_retry` consults the budget, `backoff_duration` derives the
/// delay for a given attempt from the base, and `run_with_retry`
/// drives an operation under the policy. `Default` supplies the
/// production values.
#[derive(Debug, Clone)]
pub struct RecoveryPolicy {
    /// Number of retries allowed; `should_retry` answers `false` once
    /// the attempt counter reaches this value.
    max_retries: usize,
    /// Delay for the first retry. `backoff_duration` doubles it per
    /// attempt (the multiplier is capped at 64) and adds jitter on top.
    backoff_base: Duration,
}

impl Default for RecoveryPolicy {
    fn default() -> Self {
        Self {
            // Transient provider blips ("error sending request", 5xx, rate
            // limits) are common enough that 3 retries (~7s of backoff)
            // still surfaced hard failures to the user. 5 retries with the
            // exponential schedule below waits ~1+2+4+8+16 ≈ 31s before
            // giving up, which rides out the typical short outage without
            // stalling the agent indefinitely.
            max_retries: 5,
            backoff_base: Duration::from_secs(1),
        }
    }
}

impl RecoveryPolicy {
    /// Returns the retry budget, i.e. the attempt count at which
    /// `should_retry` starts answering `false`.
    pub fn max_retries(&self) -> usize {
        self.max_retries
    }

    /// Decides whether a failure of the given `kind` deserves another
    /// try after `attempts` retries have already been spent.
    ///
    /// Only `Network` and `RateLimit` failures are retryable, and only
    /// while `attempts` is still below the budget.
    pub fn should_retry(&self, attempts: usize, kind: ErrorKind) -> bool {
        let budget_left = attempts < self.max_retries;
        budget_left && matches!(kind, ErrorKind::Network | ErrorKind::RateLimit)
    }

    /// Computes the delay before retry number `attempts` (zero-based):
    /// `backoff_base * 2^attempts` plus jitter.
    ///
    /// The exponent is clamped to 6, so the multiplier never exceeds
    /// 64 (64s with a one-second base). All arithmetic saturates.
    pub fn backoff_duration(&self, attempts: usize) -> Duration {
        let multiplier = 1u64 << attempts.min(6);
        let base_ms = self.backoff_base.as_millis() as u64;
        let delay_ms = base_ms.saturating_mul(multiplier);
        // Additive jitter strictly below +25% of the delay, so that
        // concurrent agents do not retry in lockstep against a
        // rate-limited endpoint. The jitter is never negative, hence
        // the result is never shorter than the un-jittered delay.
        // `.max(1)` keeps the modulus non-zero for sub-4ms delays.
        let jitter_span = (delay_ms / 4).max(1);
        let jitter_ms = pseudo_random(attempts as u64) % jitter_span;
        Duration::from_millis(delay_ms.saturating_add(jitter_ms))
    }

    /// F14: merges `backoff_duration` with a `Retry-After` hint
    /// extracted from `error_msg`.
    ///
    /// When a hint is present the longer of the two delays wins
    /// (retrying earlier than the server asked just earns another
    /// 429), limited to five minutes so a misformatted header cannot
    /// stall the agent forever. Without a hint the computed backoff is
    /// returned unchanged.
    pub fn backoff_duration_for_msg(&self, attempts: usize, error_msg: &str) -> Duration {
        const CAP: Duration = Duration::from_secs(300);

        let computed = self.backoff_duration(attempts);
        retry_after_from_error_msg(error_msg)
            .map_or(computed, |server_wants| server_wants.max(computed).min(CAP))
    }

    /// Test-only constructor with an explicit budget and base delay.
    #[cfg(test)]
    pub(crate) fn with_backoff(max_retries: usize, backoff_base: Duration) -> Self {
        RecoveryPolicy {
            backoff_base,
            max_retries,
        }
    }
}

/// Drive an async operation under a [`RecoveryPolicy`]: transient
/// (network / rate-limit) failures are retried with the policy's
/// backoff, while auth / context-length / other failures are returned
/// to the caller immediately.
///
/// This is the single home for the attempt → classify → backoff →
/// sleep loop that `AnyModel::btw_query` and the summarizer each
/// hand-rolled (dirge-6cvc).
///
/// * `policy` — supplies the retry budget and the delay per attempt.
/// * `label` — names the operation in the retry log line.
/// * `attempt` — invoked fresh on every try.
///
/// The error type only needs `Display`: its rendered message is what
/// `classify_error` inspects and what the `Retry-After` extraction
/// reads. The error returned on failure is the one produced by the
/// last try.
///
/// NOTE: the backoff sleep here is not yet cancellation-aware; wiring an
/// abort signal through the (signal-less) call sites is tracked
/// separately. The streaming retry wrapper in `agent_loop::retry` is a
/// different shape (per-event commit tracking) and keeps its own loop.
pub async fn run_with_retry<T, E, F, Fut>(
    policy: &RecoveryPolicy,
    label: &str,
    mut attempt: F,
) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: std::fmt::Display,
{
    // Number of retries already spent (zero on the first try).
    let mut attempts: usize = 0;
    loop {
        let err = match attempt().await {
            Ok(value) => return Ok(value),
            Err(err) => err,
        };

        let msg = err.to_string();
        let kind = classify_error(&msg);
        if !policy.should_retry(attempts, kind) {
            // Non-transient failure, or the budget is exhausted.
            return Err(err);
        }

        let delay = policy.backoff_duration_for_msg(attempts, &msg);
        tracing::warn!(
            op = label,
            attempt = attempts + 1,
            max = policy.max_retries(),
            delay_ms = delay.as_millis() as u64,
            kind = ?kind,
            error = %msg,
            "retrying after transient failure",
        );
        tokio::time::sleep(delay).await;
        attempts += 1;
    }
}

/// Parse a `Retry-After` value out of an error message. Looks for
/// (in order):
/// 1. Anthropic-style `retry-after-ms: <N>` — milliseconds.
/// 2. Standard `Retry-After: <N>` — seconds.
/// 3. JSON body `"retry_after": <N>` — seconds.
/// 4. `Retry-After: <HTTP-date>` — handled by
///    `parse_http_date_retry_after`.
///
/// Label matching is ASCII case-insensitive and robust to the `:`
/// being absent (some providers emit `retry-after 30`). Every
/// occurrence of a label is examined, so a prose mention such as
/// "respect the retry-after header" does not hide a real header that
/// appears later in the same message.
///
/// Returns `None` if no recognized form is present. The returned
/// duration is not capped here beyond the numeric ceiling described
/// below; `RecoveryPolicy::backoff_duration_for_msg` applies the
/// five-minute limit.
pub(crate) fn retry_after_from_error_msg(msg: &str) -> Option<Duration> {
    // Ceiling for a parsed number: the largest 11-digit value. Anything
    // larger is clearly bogus (`Retry-After: 999999999999999999999`),
    // and saturating here keeps the result far away from `u64`
    // overflow regardless of how many digits the header carries.
    const MAX_PARSED: u64 = 99_999_999_999;

    /// Returns the unsigned integer that follows the first occurrence
    /// of `label` which is actually followed by digits.
    fn parse_after_label(msg: &str, label: &str) -> Option<u64> {
        let label_bytes = label.as_bytes();
        if label_bytes.is_empty() {
            // `windows(0)` would panic; an empty label matches nothing useful.
            return None;
        }
        // Case-insensitive scan over the ORIGINAL bytes rather than a
        // lowercased copy: `to_lowercase` can change byte length for
        // some unicode (e.g. Turkish `İ`), so offsets found in a
        // lowered copy could land mid-character in `msg` and panic on
        // slicing. The label is fixed ASCII, which makes a byte-wise
        // ASCII-insensitive comparison sound.
        msg.as_bytes()
            .windows(label_bytes.len())
            .enumerate()
            .filter(|(_, window)| window.eq_ignore_ascii_case(label_bytes))
            .find_map(|(start, _)| {
                // An ASCII match always ends on a char boundary; `get`
                // merely makes that assumption panic-free.
                let tail = msg.get(start + label_bytes.len()..)?;
                let tail = tail.trim_start_matches([':', ' ', '\t', '"']).trim_start();
                let digits_end = tail
                    .find(|c: char| !c.is_ascii_digit())
                    .unwrap_or(tail.len());
                let digits = &tail[..digits_end];
                if digits.is_empty() {
                    // Not a numeric value here; try the next occurrence.
                    return None;
                }
                // `digits` is non-empty ASCII digits, so the only way
                // parsing can fail is overflow — saturate in that case.
                Some(
                    digits
                        .parse::<u64>()
                        .map_or(MAX_PARSED, |n| n.min(MAX_PARSED)),
                )
            })
    }

    parse_after_label(msg, "retry-after-ms")
        .map(Duration::from_millis)
        .or_else(|| parse_after_label(msg, "retry-after").map(Duration::from_secs))
        .or_else(|| parse_after_label(msg, "retry_after").map(Duration::from_secs))
        // RFC 7231 HTTP-date form: `Retry-After: Wed, 21 Oct 2015 07:28:00 GMT`.
        // Tried last so the far more common numeric forms take their
        // fast path before a date parse is attempted.
        .or_else(|| parse_http_date_retry_after(msg))
}

/// Look for the first `retry-after` label in `msg` (ASCII
/// case-insensitive) and interpret the rest of that line as an HTTP
/// date.
///
/// The value is everything after the label and any leading `:`,
/// space, tab or `"` characters, up to the next line break or `"`,
/// trimmed. It is parsed first as an RFC 2822 date (which covers the
/// IMF-fixdate form such as `Wed, 21 Oct 2015 07:28:00 GMT`) and then
/// as an asctime-style date (`%a %b %e %H:%M:%S %Y`, taken as UTC).
///
/// Returns the time from now until that date, clamped to zero when the
/// date lies in the past so a stale header does not suppress retries.
/// Returns `None` when the label is absent, the value is empty, or
/// neither parse succeeds (numeric values are the business of
/// `retry_after_from_error_msg`).
///
/// NOTE(review): the obsolete RFC 850 form
/// (`Sunday, 06-Nov-94 08:49:37 GMT`) does not appear to be covered by
/// either parse attempt — confirm whether it needs support.
fn parse_http_date_retry_after(msg: &str) -> Option<Duration> {
    const LABEL: &str = "retry-after";

    // PROV-10: scan the original bytes with an ASCII-insensitive
    // comparison instead of searching a lowercased copy and indexing
    // back into `msg`; full-unicode lowercasing can change byte
    // lengths, which would shift offsets and could panic on slicing.
    let label_start = msg
        .as_bytes()
        .windows(LABEL.len())
        .position(|window| window.eq_ignore_ascii_case(LABEL.as_bytes()))?;

    // The matched label is ASCII, so this offset is a char boundary;
    // `get` keeps the slice panic-free regardless.
    let rest = msg.get(label_start + LABEL.len()..)?;
    let rest = rest.trim_start_matches([':', ' ', '\t', '"']);

    // The header value ends at the line break or at a closing quote.
    let value_end = rest.find(['\n', '\r', '"']).unwrap_or(rest.len());
    let value = rest[..value_end].trim();
    if value.is_empty() {
        return None;
    }

    let target = match chrono::DateTime::parse_from_rfc2822(value) {
        Ok(date) => date,
        Err(_) => chrono::NaiveDateTime::parse_from_str(value, "%a %b %e %H:%M:%S %Y")
            .ok()?
            .and_utc()
            .fixed_offset(),
    };

    let now = chrono::Utc::now().fixed_offset();
    let remaining_secs = (target - now).num_seconds();
    Some(Duration::from_secs(remaining_secs.max(0) as u64))
}

/// Cheap, non-cryptographic pseudo-random `u64` used for retry jitter.
///
/// The value is derived from three inputs: the sub-second nanoseconds
/// of the wall clock (0 if the clock reads before the Unix epoch), the
/// caller's `salt`, and a process-wide call counter. The sum is then
/// scrambled with the splitmix64 finalizer for decent dispersion.
fn pseudo_random(salt: u64) -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    // Audit L16: two callers landing in the same `subsec_nanos()` slot
    // with the same `salt` used to receive identical jitter, defeating
    // the anti-thundering-herd purpose. The process-local counter
    // makes every call within a process contribute a distinct term
    // even when wall clock and salt collide.
    static CALL_COUNTER: AtomicU64 = AtomicU64::new(0);
    const COUNTER_STRIDE: u64 = 0xA240_2A1F_1CE4_E5B9;
    const GOLDEN_GAMMA: u64 = 0x9E37_79B9_7F4A_7C15;

    let ticket = CALL_COUNTER.fetch_add(1, Ordering::Relaxed);
    let clock_nanos = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => u64::from(since_epoch.subsec_nanos()),
        Err(_) => 0,
    };

    let seed = clock_nanos
        .wrapping_add(salt)
        .wrapping_add(ticket.wrapping_mul(COUNTER_STRIDE))
        .wrapping_add(GOLDEN_GAMMA);

    // splitmix64 finalizer.
    let stage1 = (seed ^ (seed >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
    let stage2 = (stage1 ^ (stage1 >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
    stage2 ^ (stage2 >> 31)
}

/// Classify a provider/transport error message into an [`ErrorKind`]
/// by case-insensitive substring matching.
///
/// The checks run in a fixed order and the first hit wins:
/// auth → rate limit → HTTP 5xx → context length → network → other.
/// The order matters: billing-exhausted 429s must be seen as `Auth`
/// before the generic 429 check, and a 5xx status outranks any
/// context-length wording in the same message.
pub fn classify_error(msg: &str) -> ErrorKind {
    // Non-retryable credential problems.
    const AUTH_MARKERS: &[&str] = &[
        // HTTP status codes in error context.
        " 401 ",
        " 403 ",
        "error 401",
        "error 403",
        "unauthorized",
        "invalid api key",
        "authentication failed",
        // PROV-8: OpenAI's `insufficient_quota` and the broader
        // billing-exhausted signals arrive wrapped in a 429 but are
        // permanent (the billing account is empty/suspended). Routing
        // them to Auth keeps the retry budget from being burned on a
        // request that will never succeed.
        "insufficient_quota",
        "billing_not_active",
        "billing_hard_limit_reached",
    ];

    // Transient throttling / capacity signals.
    const RATE_LIMIT_MARKERS: &[&str] = &[
        "rate limit",
        "too many requests",
        " 429 ",
        "error 429",
        // PROV-7: Gemini emits 429s whose body
        // `{"error":{"status":"RESOURCE_EXHAUSTED",…}}` often arrives
        // stringified without " 429 " or "rate limit" wording.
        "resource_exhausted",
        "resource has been exhausted",
        // Anthropic's `overloaded_error`: structurally a rate-limit
        // response without the usual wording.
        "overloaded",
    ];

    // Substrings observed in production from at least one provider
    // (Anthropic, OpenAI, Google, GLM, DeepSeek, Mistral, OpenRouter
    // passthroughs). Kept narrow so they do not collide with unrelated
    // errors that merely mention "tokens" or "long".
    const CONTEXT_LENGTH_MARKERS: &[&str] = &[
        "context_length_exceeded",
        "maximum context length",
        "reduce the length of the messages",
        "request too large",
        "prompt is too long",
        "input is too long",
        "input token count exceeds",
        "tokens exceed",
        "exceeds the model's context",
        // PROV-6: Anthropic `max_tokens is too large` (input + max_tokens > window);
        // Cohere/Mistral-via-OpenRouter `too many tokens`; DeepSeek
        // `Range of input length`; OpenRouter `messages.length too large`.
        "max_tokens is too large",
        "too many tokens",
        "range of input length",
        "messages.length too large",
    ];

    const NETWORK_MARKERS: &[&str] = &[
        // HTML responses from intermediaries (Cloudflare 502/503,
        // nginx error pages, captive-portal interception) never parse
        // as the JSON envelope rig/reqwest expect. HTML markers and
        // the gateway status phrases are each sufficient on their own.
        "<!doctype html",
        "<html",
        "bad gateway",
        "service unavailable",
        "gateway timeout",
        "cloudflare",
        // Specific phrases only — a bare "connection" would produce
        // false positives.
        "connection refused",
        "connection reset",
        "broken pipe",
        "dns error",
        "tls",
        "ssl",
        "timed out",
        "request timeout",
        "server error",
        // reqwest connect/send failures: the request never got a
        // response. rig wraps these as "Http client error: error
        // sending request for url (…)".
        "error sending request",
        "connect error",
        "tcp connect",
        // Mid-stream decode failures: bytes arrived but did not
        // deserialize into the expected envelope. Almost always
        // transient, so retried like any other network error.
        "error decoding response body",
        "invalid response body",
        "decode error",
    ];

    let lower = msg.to_lowercase();
    let mentions_any =
        |needles: &[&str]| -> bool { needles.iter().any(|needle| lower.contains(*needle)) };

    if mentions_any(AUTH_MARKERS) || lower.starts_with("401 ") || lower.starts_with("403 ") {
        return ErrorKind::Auth;
    }

    if mentions_any(RATE_LIMIT_MARKERS) || lower.starts_with("429 ") {
        return ErrorKind::RateLimit;
    }

    // B3-2: HTTP 5xx server errors in any of the shapes recognised by
    // `STATUS_5XX_RE` ("503 Service Unavailable", "Http status: 500",
    // "status=502", "error 504: ...", "(status_code=500)", …). The
    // pattern is lowercase-only, hence the lowered copy.
    if STATUS_5XX_RE.is_match(&lower) {
        return ErrorKind::Network;
    }

    if mentions_any(CONTEXT_LENGTH_MARKERS) {
        return ErrorKind::ContextLength;
    }

    if mentions_any(NETWORK_MARKERS) {
        return ErrorKind::Network;
    }

    ErrorKind::Other
}

/// Turn a raw error message into a short user-facing explanation that
/// names *what* failed and *what to try next*. Intended for surfacing
/// errors in the chat instead of dumping a chain like
/// `CompletionError: ProviderError: Http client error: …` at the user.
///
/// The output has three lines: a headline (followed by an
/// "(after N attempt(s))" note when `attempts` is greater than 1), a
/// hint line, and a cause line carrying the original `msg` verbatim so
/// the user (and any bug report) still has the underlying details.
///
/// Transitional after phase 4.5h-6 cutover: no production caller
/// at the moment. The bridge could pretty-format Error events
/// using this when h-7 testing surfaces real provider error
/// shapes; until then keep the helper (and its tests) alive.
// No production caller yet (see above), hence the lint suppression.
#[allow(dead_code)]
pub fn user_facing_error(msg: &str, attempts: usize) -> String {
    let (headline, hint) = match classify_error(msg) {
        ErrorKind::Auth => (
            "authentication failed talking to the LLM provider",
            "check your API key env var (e.g. OPENROUTER_API_KEY) and provider config",
        ),
        ErrorKind::RateLimit => (
            "provider rate-limited the request",
            "wait a moment and retry, or switch to a different model via /model",
        ),
        ErrorKind::ContextLength => (
            "conversation exceeds the model's context window",
            "run /compress to summarize older turns and try again",
        ),
        ErrorKind::Network => {
            // A truncated/undecodable body gets its own wording; every
            // other network failure shares the generic one.
            let body_decode_failed = msg
                .to_lowercase()
                .contains("error decoding response body");
            if body_decode_failed {
                (
                    "lost the response stream from the provider (truncated or malformed body)",
                    "usually transient — retry. If it persists the provider may be having issues or returning non-JSON (HTML error pages, plaintext)",
                )
            } else {
                (
                    "network error reaching the LLM provider",
                    "check connectivity / firewall / proxy; the request will retry automatically",
                )
            }
        }
        ErrorKind::Other => (
            "the LLM provider returned an error we didn't recognize",
            "see the cause below; consider /model to try a different provider",
        ),
    };

    // Only mention the attempt count once something was actually retried.
    let attempts_note = match attempts {
        0 | 1 => String::new(),
        n => format!(" (after {} attempt(s))", n),
    };

    format!(
        "{}{}\n  ↳ hint: {}\n  ↳ cause: {}",
        headline, attempts_note, hint, msg
    )
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Asserts that every message in `msgs` is classified as `expected`.
    fn assert_classified(expected: ErrorKind, msgs: &[&str]) {
        for msg in msgs.iter().copied() {
            assert_eq!(
                classify_error(msg),
                expected,
                "unexpected classification for {msg:?}"
            );
        }
    }

    #[test]
    fn default_budget_retries_transient_failures_up_to_five_times() {
        let policy = RecoveryPolicy::default();
        assert_eq!(policy.max_retries(), 5);
        // Transient (network) failures retry while the budget lasts...
        for attempt in [0, 4] {
            assert!(policy.should_retry(attempt, ErrorKind::Network));
        }
        // ...and stop once it is spent.
        assert!(!policy.should_retry(5, ErrorKind::Network));
        // A non-retryable kind is rejected even with the full budget left.
        assert!(!policy.should_retry(0, ErrorKind::Auth));
    }

    // dirge-6cvc: coverage for the shared retry helper — first-call
    // success, immediate bail on a non-retryable error, retry-then-succeed
    // on a transient error, and budget exhaustion.
    #[tokio::test]
    async fn run_with_retry_returns_first_success() {
        let policy = RecoveryPolicy::default();
        let invocations = AtomicUsize::new(0);
        let outcome: Result<u32, String> = run_with_retry(&policy, "t", || {
            invocations.fetch_add(1, Ordering::SeqCst);
            async { Ok(7) }
        })
        .await;
        assert_eq!(outcome, Ok(7));
        assert_eq!(invocations.load(Ordering::SeqCst), 1, "no retry on success");
    }

    #[tokio::test]
    async fn run_with_retry_bails_immediately_on_non_retryable() {
        let policy = RecoveryPolicy::default();
        let invocations = AtomicUsize::new(0);
        let outcome: Result<u32, String> = run_with_retry(&policy, "t", || {
            invocations.fetch_add(1, Ordering::SeqCst);
            async { Err("invalid api key".to_string()) }
        })
        .await;
        assert!(outcome.is_err());
        let seen = invocations.load(Ordering::SeqCst);
        assert_eq!(seen, 1, "auth error must not be retried");
    }

    #[tokio::test]
    async fn run_with_retry_retries_transient_then_succeeds() {
        // Millisecond backoff keeps the test from sleeping for seconds.
        let policy = RecoveryPolicy::with_backoff(3, Duration::from_millis(1));
        let invocations = AtomicUsize::new(0);
        let outcome: Result<u32, String> = run_with_retry(&policy, "t", || {
            let prior_calls = invocations.fetch_add(1, Ordering::SeqCst);
            async move {
                // The first two calls fail with a retryable error.
                if prior_calls >= 2 {
                    Ok(42)
                } else {
                    Err("rate limit exceeded".to_string())
                }
            }
        })
        .await;
        assert_eq!(outcome, Ok(42));
        assert_eq!(
            invocations.load(Ordering::SeqCst),
            3,
            "two retries then success"
        );
    }

    #[tokio::test]
    async fn run_with_retry_exhausts_then_returns_last_error() {
        let policy = RecoveryPolicy::with_backoff(2, Duration::from_millis(1));
        let invocations = AtomicUsize::new(0);
        let outcome: Result<u32, String> = run_with_retry(&policy, "t", || {
            invocations.fetch_add(1, Ordering::SeqCst);
            async { Err("rate limit exceeded".to_string()) }
        })
        .await;
        assert!(outcome.is_err());
        // One initial attempt plus the two budgeted retries.
        assert_eq!(invocations.load(Ordering::SeqCst), 3);
    }

    // dirge-5ul5: when reqwest cannot establish the connection (or it
    // drops before any response arrives) the error reads "error sending
    // request for url …", wrapped by rig as "Http client error". Such
    // failures are transient: they must classify as Network and be
    // retried, never fall through to Other.
    #[test]
    fn classify_connect_send_failures_as_network() {
        let samples = [
            "ProviderError: Http client error: error sending request for url (https://api.deepseek.com/v1/chat/completions)",
            "error sending request for url (https://api.openai.com/v1/chat/completions)",
            "reqwest::Error { kind: Connect, ... }: tcp connect error",
            "Http client error: connect error",
        ];
        for msg in samples {
            assert_eq!(
                classify_error(msg),
                ErrorKind::Network,
                "connect/send failure must be retryable: {msg}"
            );
        }

        let kind = classify_error("error sending request for url (x)");
        assert!(
            RecoveryPolicy::default().should_retry(0, kind),
            "the DeepSeek connect failure must be retried"
        );
    }

    #[test]
    fn test_classify_context_length() {
        assert_classified(
            ErrorKind::ContextLength,
            &[
                "context_length_exceeded: prompt too long",
                "reduce the length of the messages",
                "request too large for model",
            ],
        );
    }

    /// Audit H1: `classify_error` originally knew only 4 substrings and
    /// missed common provider phrasings. Every sample here mirrors an
    /// error string a provider can really emit.
    #[test]
    fn test_classify_context_length_provider_variants() {
        assert_classified(
            ErrorKind::ContextLength,
            &[
                // Anthropic: input + max_tokens exceeds the context window.
                "prompt is too long: 250000 tokens > 200000 maximum",
                // OpenAI o-series and the gpt-4o family.
                "This model's maximum context length is 128000 tokens. However, your messages resulted in 130000 tokens.",
                // Generic "input too long" wording shared by several providers.
                "input is too long for the requested model",
                // Google Gemini 1.x token-limit message.
                "The input token count exceeds the maximum number of tokens allowed",
                // GLM / DeepSeek / Mistral variants of "tokens exceed".
                "Total tokens exceed model's context window",
                // OpenAI, when the chat history exceeds the context.
                "the messages array exceeds the model's context length",
            ],
        );
    }

    /// Audit H5: Cloudflare / nginx 502/503 pages and captive-portal
    /// interceptions come back as HTML rather than JSON. They used to
    /// fall through to `Other` (no retry); they must classify as
    /// `Network`.
    #[test]
    fn test_classify_html_proxy_response_as_network() {
        assert_classified(
            ErrorKind::Network,
            &[
                // Cloudflare 502 page snippet.
                "<!DOCTYPE html><html><head><title>502 Bad Gateway</title>",
                // nginx error page.
                "<html><body><h1>503 Service Unavailable</h1></body></html>",
                // Captive portal: a login page served for the API URL.
                "ProviderError: <html><head><meta http-equiv=\"refresh\"",
            ],
        );
    }

    /// Audit H2: per RFC 7231 a `Retry-After` value may be an HTTP-date.
    /// The parser must accept that form and yield the number of seconds
    /// from now (clamped to 0 when the date is already in the past).
    #[test]
    fn retry_after_http_date_parses() {
        // A date roughly 30s ahead, rendered as an RFC 7231 IMF-fixdate.
        let header = (chrono::Utc::now() + chrono::Duration::seconds(30))
            .format("%a, %d %b %Y %H:%M:%S GMT")
            .to_string();
        let msg = format!("429 Too Many Requests\nRetry-After: {}", header);

        let secs = retry_after_from_error_msg(&msg)
            .expect("HTTP-date should parse")
            .as_secs();
        // Allow a few seconds of slack around the expected ~30s.
        assert!(
            (25..=35).contains(&secs),
            "expected ~30s, got {}s (header={})",
            secs,
            header
        );
    }

    /// A date in the past must clamp to zero instead of wrapping. A
    /// misconfigured server sometimes sends
    /// `Retry-After: Thu, 01 Jan 1970 00:00:00 GMT`; the right response is
    /// an immediate retry, not a panic or a skipped retry.
    #[test]
    fn retry_after_http_date_in_past_clamps_to_zero() {
        let wait = retry_after_from_error_msg("Retry-After: Thu, 01 Jan 1970 00:00:00 GMT")
            .expect("past HTTP-date should parse");
        assert_eq!(wait, Duration::ZERO);
    }

    #[test]
    fn test_classify_network() {
        assert_classified(
            ErrorKind::Network,
            &[
                "connection refused",
                "connection reset by peer",
                "request timed out",
                "503 service unavailable",
                // Mid-stream reqwest decode failure as rig reports it:
                // retried like any other transient network blip instead
                // of surfacing as Other.
                "CompletionError: ProviderError: Http client error: error decoding response body",
                "decode error: EOF",
                // B3-2: 5xx variants beyond the previous strict set,
                // starting with a plain 500 (formerly classified Other).
                "500 Internal Server Error",
                // Prefix-anchored forms.
                "Http status: 500",
                "status=502",
                "status_code=503",
                "code: 504",
                "CompletionError: error 500: backend hiccup",
                "received http 502 from upstream",
            ],
        );
    }

    /// The formatted message carries a headline, a hint and a cause, and
    /// the cause repeats the raw message verbatim so no debugging
    /// context is lost.
    #[test]
    fn user_facing_error_includes_cause() {
        let raw = "CompletionError: ProviderError: Http client error: error decoding response body";
        let pretty = user_facing_error(raw, 1);
        for needle in ["lost the response stream", "hint:", "cause:", raw] {
            assert!(pretty.contains(needle));
        }
    }

    /// An auth failure gets its own headline and points at the API key.
    #[test]
    fn user_facing_error_classifies_auth() {
        let pretty = user_facing_error("401 unauthorized", 1);
        for needle in ["authentication failed", "API key"] {
            assert!(pretty.contains(needle));
        }
    }

    /// A context-length failure steers the user to /compress.
    #[test]
    fn user_facing_error_classifies_context_length() {
        assert!(user_facing_error("maximum context length exceeded", 1).contains("/compress"));
    }

    #[test]
    fn test_classify_rate_limit() {
        assert_classified(
            ErrorKind::RateLimit,
            &["rate limit exceeded", "429 too many requests"],
        );
    }

    /// Anthropic answers with `{"type": "overloaded_error", ...}` when it
    /// is at capacity. That is transient and retryable like a rate limit,
    /// yet contains none of the "rate limit" / "too many" / "429"
    /// patterns. Unless handled explicitly it would land in `Other`,
    /// dirge would not retry, and the user would see a one-shot failure
    /// for a transient backend problem.
    #[test]
    fn classify_anthropic_overloaded_error_as_retryable() {
        assert_classified(
            ErrorKind::RateLimit,
            &[
                "overloaded_error: Anthropic API is overloaded",
                // The bare token must suffice: the structured error is
                // stringified differently across rig versions.
                "Provider overloaded; please retry later",
            ],
        );
    }

    #[test]
    fn test_classify_auth() {
        assert_classified(ErrorKind::Auth, &["401 unauthorized", "invalid api key"]);
    }

    #[test]
    fn test_classify_other() {
        assert_classified(
            ErrorKind::Other,
            &[
                "something else",
                "file not found",
                // A bare "connection" is not a network error.
                "database connection closed",
                // Neither is a bare "reset".
                "form reset successful",
                // "500" outside an HTTP context is just a number.
                "processed 500 items",
            ],
        );
    }

    #[test]
    fn test_retry_policy() {
        let policy = RecoveryPolicy::default();

        let cases = [
            // Network errors retry until the budget (5) is spent.
            (0, ErrorKind::Network, true),
            (2, ErrorKind::Network, true),
            (4, ErrorKind::Network, true),
            (5, ErrorKind::Network, false),
            // Rate limits are retryable.
            (0, ErrorKind::RateLimit, true),
            // Context length is not: it needs compaction, not a retry.
            (0, ErrorKind::ContextLength, false),
            // Auth and unrecognized errors are never retried.
            (0, ErrorKind::Auth, false),
            (0, ErrorKind::Other, false),
        ];
        for (attempt, kind, retryable) in cases {
            assert_eq!(policy.should_retry(attempt, kind), retryable);
        }
    }

    #[test]
    fn test_backoff_duration() {
        let policy = RecoveryPolicy::default();
        // Each step must be at least the doubled base: 1s, 2s, 4s.
        for (attempt, floor_secs) in [(0, 1), (1, 2), (2, 4)] {
            assert!(policy.backoff_duration(attempt) >= Duration::from_secs(floor_secs));
        }
    }

    #[test]
    fn test_backoff_overflow_guard() {
        // A large attempt count is expected to be clamped to 6 before the
        // exponent is applied: 1s * 2^6 = 64s, plus at most 25% jitter,
        // which stays under 81s.
        let wait = RecoveryPolicy::default().backoff_duration(20);
        let allowed = Duration::from_secs(64)..Duration::from_secs(81);
        assert!(allowed.contains(&wait));
    }

    #[test]
    fn test_backoff_jitter_present() {
        let policy = RecoveryPolicy::default();
        // With jitter wired in, repeated calls for the same attempt count
        // should differ most of the time. Sample a small batch and
        // require at least two distinct values.
        let distinct: std::collections::HashSet<_> = (0..8)
            .map(|_| {
                let wait = policy.backoff_duration(3);
                std::thread::sleep(Duration::from_millis(1));
                wait
            })
            .collect();
        assert!(
            distinct.len() > 1,
            "expected jittered backoff to vary across calls"
        );
    }

    /// F14: the Anthropic-style `retry-after-ms` value is in milliseconds.
    #[test]
    fn retry_after_parses_anthropic_ms() {
        let parsed = retry_after_from_error_msg("rate limited: retry-after-ms: 5000");
        assert_eq!(parsed, Some(Duration::from_millis(5000)));
    }

    /// The standard HTTP `Retry-After: <seconds>` value is in seconds.
    #[test]
    fn retry_after_parses_standard_seconds() {
        let parsed = retry_after_from_error_msg("HTTP 429 Too Many Requests\nRetry-After: 30");
        assert_eq!(parsed, Some(Duration::from_secs(30)));
    }

    /// The JSON body form `"retry_after": 12` is in seconds.
    #[test]
    fn retry_after_parses_json_body() {
        let parsed = retry_after_from_error_msg(r#"{"error":"rate_limit","retry_after":12}"#);
        assert_eq!(parsed, Some(Duration::from_secs(12)));
    }

    /// Some proxies log the value without a colon (`retry-after 30`).
    #[test]
    fn retry_after_parses_no_colon() {
        let parsed = retry_after_from_error_msg("got 429, retry-after 7 next time");
        assert_eq!(parsed, Some(Duration::from_secs(7)));
    }

    /// A message with no retry-after hint yields `None`.
    #[test]
    fn retry_after_returns_none_when_absent() {
        let parsed = retry_after_from_error_msg("generic network error: connection reset");
        assert_eq!(parsed, None);
    }

    /// Regression: multi-byte UTF-8 ahead of the label used to be able to
    /// panic. The old parser located the label in a lowercased copy and
    /// then sliced the original at that byte offset. Lowercasing can
    /// change the byte length (Turkish `İ` is 2 bytes, its lowercase form
    /// is 3), so the offsets could disagree and the slice could start in
    /// the middle of a character. The search now runs over byte windows
    /// of the original string with a case-insensitive ASCII comparison.
    #[test]
    fn retry_after_handles_unicode_before_label() {
        // A Turkish capital I precedes the label, so the lowercased copy
        // has a different byte length than the original.
        let parsed = retry_after_from_error_msg("İoError: Retry-After: 8");
        assert_eq!(parsed, Some(Duration::from_secs(8)));
    }

    /// The label itself matches case-insensitively: `RETRY-AFTER-MS` and
    /// `Retry-After-Ms` parse the same way as `retry-after-ms`.
    #[test]
    fn retry_after_label_match_is_case_insensitive() {
        for msg in ["rate limited: RETRY-AFTER-MS: 750", "Retry-After-Ms: 750"] {
            assert_eq!(
                retry_after_from_error_msg(msg),
                Some(Duration::from_millis(750)),
            );
        }
    }

    /// A pathologically long digit run (`Retry-After: 9999…`) must neither
    /// overflow nor turn into a 100-year wait. The parser is expected to
    /// bound the run (11 digits) before parsing, and
    /// `backoff_duration_for_msg` applies the upper cap.
    #[test]
    fn retry_after_caps_pathological_digit_run() {
        let msg = "Retry-After: 99999999999999999999999";
        // The raw parse must yield SOMETHING (not None, not a panic); its
        // exact value is deliberately not pinned.
        assert!(
            retry_after_from_error_msg(msg).is_some(),
            "must parse, not return None"
        );
        // The effective backoff is what must respect the 5-minute cap.
        let wait = RecoveryPolicy::default().backoff_duration_for_msg(0, msg);
        assert!(
            wait <= Duration::from_secs(300),
            "backoff must cap at 5min; got {:?}",
            wait,
        );
    }

    /// `backoff_duration_for_msg` takes the longer of the computed
    /// exponential backoff and the server's retry-after, capped at
    /// 5 minutes.
    #[test]
    fn backoff_duration_for_msg_prefers_longer_value() {
        let policy = RecoveryPolicy::default();

        // attempts=0 computes ~1s, so the server's 10s wins.
        let server_wins = policy.backoff_duration_for_msg(0, "Retry-After: 10");
        assert!((Duration::from_secs(10)..Duration::from_secs(11)).contains(&server_wins));

        // The server's 50ms is below the computed 2^3 = 8s, so the
        // computed value wins.
        let computed_wins = policy.backoff_duration_for_msg(3, "retry-after-ms: 50");
        assert!(computed_wins >= Duration::from_secs(8));
    }

    /// A bogus, oversized retry-after is capped at 5 minutes.
    #[test]
    fn backoff_duration_for_msg_caps_at_5_minutes() {
        let wait = RecoveryPolicy::default().backoff_duration_for_msg(0, "Retry-After: 9999");
        assert!(wait <= Duration::from_secs(300));
    }
}