Skip to main content

ati/core/
sentry_scope.rs

1//! Sentry scope helpers for proxy-side upstream error classification.
2//!
3//! Adds structured tags + a class-based dynamic event title so each error
4//! class becomes a distinct Sentry issue bucket instead of every failure
5//! collapsing into one generic "upstream server error" bucket. Also routes
6//! log level by status class (info/warn/error).
7//!
8//! Title strategy: Sentry's tracing bridge maps `tracing::error!` messages
9//! to event titles, and Sentry groups events by title when no fingerprint
10//! is set. We dispatch the report into 5 buckets — `auth_error`,
11//! `bad_input`, `rate_limited`, `server_error`, `transport_error` — each
12//! with its own literal-string `tracing::error!`. Provider/operation/
13//! status live as tags so each bucket is filterable per-provider without
14//! splitting the buckets per-provider.
15//!
16//! See issue #81 for the original context; the redesign closes the
17//! "Sentry errors really fucking suck" report — generic titles, missing
18//! source chains, lying tags.
19
20use crate::core::jwt::TokenClaims;
21
22/// Coarse classification of an upstream failure. Each class maps to a
23/// distinct Sentry bucket via a unique literal `tracing::error!` /
24/// `tracing::warn!` message — that's how grouping survives across
25/// providers without collapsing every error into one mega-bucket.
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
27pub enum UpstreamErrorClass {
28    /// 401 / 403 / 407 — credentials missing, revoked, or insufficient.
29    AuthError,
30    /// 400 / 404 (non-no-records) / 422 — request shape was wrong.
31    BadInput,
32    /// 402 — out of credit; not actionable code-side. Always Warning.
33    Quota,
34    /// 429 — rate limited. Often actionable as a retry; Warning level.
35    RateLimited,
36    /// 5xx — upstream service broke.
37    ServerError,
38    /// 0 — no HTTP status (DNS, TCP, TLS, MCP transport error, etc.).
39    TransportError,
40}
41
42impl UpstreamErrorClass {
43    pub fn classify(upstream_status: u16) -> Self {
44        match upstream_status {
45            401 | 403 | 407 => Self::AuthError,
46            400 | 404 | 422 => Self::BadInput,
47            402 => Self::Quota,
48            429 => Self::RateLimited,
49            500..=599 => Self::ServerError,
50            _ => Self::TransportError,
51        }
52    }
53
54    /// Short, stable label used in Sentry tags + event extras.
55    pub fn as_tag(self) -> &'static str {
56        match self {
57            Self::AuthError => "auth_error",
58            Self::BadInput => "bad_input",
59            Self::Quota => "quota",
60            Self::RateLimited => "rate_limited",
61            Self::ServerError => "server_error",
62            Self::TransportError => "transport_error",
63        }
64    }
65
66    /// True for classes that should NOT page on-call. The Sentry
67    /// `before_send` hook drops these silently to keep the issue list
68    /// useful — they still appear as breadcrumbs inside the next real
69    /// event so context isn't lost.
70    pub fn is_quota_class(self) -> bool {
71        matches!(self, Self::Quota | Self::RateLimited)
72    }
73}
74
75/// Split a proxy tool reference into `(provider, operation_id)`.
76///
77/// Three input shapes are common in the wild:
78///   - `"finnhub:price_target"` — explicit `provider:op` (recent OpenAPI tools).
79///   - `"pdl_person_enrichment"` — flat, no separator, prefixed with
80///     provider name (PDL, datalastic, several others).
81///   - `"web_search"` — flat, no separator, no prefix (`parallel` MCP-ish).
82///
83/// We always know the resolved `provider_name` at the call site because
84/// the registry told us. Use that as the source of truth, then derive
85/// the operation by stripping a `{provider}:` or `{provider}_` prefix
86/// when present. Never fabricate "unknown" — if we can't split cleanly,
87/// the operation tag IS the tool_name verbatim (still useful for grouping
88/// and search). The old "unknown" sentinel made every flat-named tool
89/// land in one mega-bucket.
90pub fn provider_and_op(provider_name: &str, tool_name: &str) -> (String, String) {
91    // Explicit `provider:op` wins regardless of provider_name (handles the
92    // colon-namespaced OpenAPI naming).
93    if let Some((p, op)) = tool_name.split_once(crate::core::manifest::TOOL_SEP) {
94        if !p.is_empty() && !op.is_empty() {
95            return (p.to_string(), op.to_string());
96        }
97    }
98    // Flat name — try to strip the provider as a colon-or-underscore prefix.
99    // Underscore is the convention PDL uses (`pdl_person_enrichment` →
100    // operation = `person_enrichment`).
101    if let Some(rest) = tool_name
102        .strip_prefix(&format!("{provider_name}:"))
103        .or_else(|| tool_name.strip_prefix(&format!("{provider_name}_")))
104    {
105        if !rest.is_empty() {
106            return (provider_name.to_string(), rest.to_string());
107        }
108    }
109    // No prefix — the tool_name IS the operation under this provider.
110    (provider_name.to_string(), tool_name.to_string())
111}
112
113/// Back-compat shim for call sites that still don't know the resolved
114/// provider name. New code should call `provider_and_op` directly.
115///
116/// Falls back to the old colon-only split for `provider:op` style names.
117/// For flat tool names with no separator, returns the tool name as the
118/// operation (NOT "unknown" — never lie about what failed) and an empty
119/// provider, which the caller is expected to overwrite.
120pub fn split_tool_name(tool_name: &str) -> (String, String) {
121    if let Some((p, op)) = tool_name.split_once(crate::core::manifest::TOOL_SEP) {
122        if !p.is_empty() && !op.is_empty() {
123            return (p.to_string(), op.to_string());
124        }
125    }
126    (String::new(), tool_name.to_string())
127}
128
129/// Scrub obvious PII patterns (UUIDs, emails, IPv4s, long hex tokens) from a
130/// user-facing message and truncate to `max_len` chars. Keeps the short form
131/// safe to send to Sentry as a tag-adjacent extra.
132pub fn scrub_and_truncate(s: &str, max_len: usize) -> String {
133    let scrubbed = scrub(s);
134    if scrubbed.chars().count() <= max_len {
135        scrubbed
136    } else {
137        let mut out: String = scrubbed.chars().take(max_len.saturating_sub(1)).collect();
138        out.push('…');
139        out
140    }
141}
142
143fn scrub(s: &str) -> String {
144    let bytes = s.as_bytes();
145    let mut out = String::with_capacity(s.len());
146    let mut i = 0;
147    while i < bytes.len() {
148        // All matchers below only match ASCII bytes (UUID/email/IPv4/hex all
149        // have an ASCII-only charset), so a match attempt at byte index `i` is
150        // safe even if `i` is the start of a multi-byte UTF-8 char — the first
151        // byte of any multi-byte sequence is `>= 0x80` and none of the matchers
152        // accept it, so they bail cleanly.
153        if let Some(end) = match_uuid(bytes, i) {
154            out.push_str("***");
155            i = end;
156        } else if let Some(end) = match_email(bytes, i) {
157            out.push_str("***");
158            i = end;
159        } else if let Some(end) = match_ipv4(bytes, i) {
160            out.push_str("***");
161            i = end;
162        } else if let Some(end) = match_long_hex(bytes, i) {
163            out.push_str("***");
164            i = end;
165        } else {
166            // Decode one UTF-8 char at `i` and advance by its byte length,
167            // preserving multi-byte chars correctly.
168            let ch_len = utf8_char_len(bytes[i]);
169            let end = (i + ch_len).min(bytes.len());
170            // SAFETY: the caller passes a &str, so bytes[i..end] is a valid
171            // UTF-8 slice starting at a char boundary.
172            out.push_str(std::str::from_utf8(&bytes[i..end]).unwrap_or(""));
173            i = end;
174        }
175    }
176    out
177}
178
179/// Length in bytes of the UTF-8 char starting with `lead`. Returns 1 for
180/// ASCII, orphaned continuation bytes, or unknown lead bytes so the scrubber
181/// always makes forward progress without panicking.
182fn utf8_char_len(lead: u8) -> usize {
183    match lead {
184        0..=0x7F => 1,
185        0xC0..=0xDF => 2,
186        0xE0..=0xEF => 3,
187        0xF0..=0xFF => 4,
188        _ => 1, // 0x80..=0xBF: orphan continuation byte
189    }
190}
191
192fn is_hex(b: u8) -> bool {
193    b.is_ascii_hexdigit()
194}
195
196/// Match `[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`.
197fn match_uuid(b: &[u8], start: usize) -> Option<usize> {
198    let spans = [8usize, 4, 4, 4, 12];
199    let mut i = start;
200    for (idx, span) in spans.iter().enumerate() {
201        if i + span > b.len() {
202            return None;
203        }
204        for k in 0..*span {
205            if !is_hex(b[i + k]) {
206                return None;
207            }
208        }
209        i += span;
210        if idx < spans.len() - 1 {
211            if i >= b.len() || b[i] != b'-' {
212                return None;
213            }
214            i += 1;
215        }
216    }
217    Some(i)
218}
219
220/// Match a hex token of at least 24 chars (API keys, token IDs). Requires
221/// the run to contain at least one digit and at least one letter to avoid
222/// scrubbing long runs of a single char or plain English words.
223fn match_long_hex(b: &[u8], start: usize) -> Option<usize> {
224    // Tokens should be bounded by non-hex (word boundary-ish) on the left.
225    if start > 0 && is_hex(b[start - 1]) {
226        return None;
227    }
228    let mut i = start;
229    let mut has_digit = false;
230    let mut has_alpha = false;
231    while i < b.len() && is_hex(b[i]) {
232        if b[i].is_ascii_digit() {
233            has_digit = true;
234        } else {
235            has_alpha = true;
236        }
237        i += 1;
238    }
239    if i - start >= 24 && has_digit && has_alpha {
240        Some(i)
241    } else {
242        None
243    }
244}
245
246fn match_email(b: &[u8], start: usize) -> Option<usize> {
247    let mut i = start;
248    let local_start = i;
249    while i < b.len() && is_email_local(b[i]) {
250        i += 1;
251    }
252    if i == local_start || i >= b.len() || b[i] != b'@' {
253        return None;
254    }
255    i += 1; // skip @
256    let domain_start = i;
257    while i < b.len() && is_email_domain(b[i]) {
258        i += 1;
259    }
260    if i == domain_start {
261        return None;
262    }
263    // Require at least one dot in the domain.
264    if !b[domain_start..i].contains(&b'.') {
265        return None;
266    }
267    Some(i)
268}
269
270fn is_email_local(b: u8) -> bool {
271    b.is_ascii_alphanumeric() || matches!(b, b'.' | b'_' | b'-' | b'+')
272}
273
274fn is_email_domain(b: u8) -> bool {
275    b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-')
276}
277
278fn match_ipv4(b: &[u8], start: usize) -> Option<usize> {
279    // Don't match inside a longer dotted-numeric run (e.g. "library 1.2.3.4"
280    // in a version string — left boundary should not be a digit or a dot).
281    if start > 0 && (b[start - 1].is_ascii_digit() || b[start - 1] == b'.') {
282        return None;
283    }
284    let mut i = start;
285    for octet in 0..4 {
286        let octet_start = i;
287        while i < b.len() && b[i].is_ascii_digit() {
288            i += 1;
289            if i - octet_start > 3 {
290                return None;
291            }
292        }
293        if i == octet_start {
294            return None;
295        }
296        // Validate octet value ≤ 255 to reject "999.999.999.999".
297        let octet_str = std::str::from_utf8(&b[octet_start..i]).unwrap_or("");
298        let octet_val: u16 = octet_str.parse().unwrap_or(u16::MAX);
299        if octet_val > 255 {
300            return None;
301        }
302        if octet < 3 {
303            if i >= b.len() || b[i] != b'.' {
304                return None;
305            }
306            i += 1;
307        }
308    }
309    // Don't match if followed by another digit or dot (continuation of longer run).
310    if i < b.len() && (b[i].is_ascii_digit() || b[i] == b'.') {
311        return None;
312    }
313    Some(i)
314}
315
316/// Best-effort parse of common upstream error JSON shapes:
317///   `{"error": {"type": "X", "message": "Y"}}`   (PDL, Stripe, Anthropic)
318///   `{"type": "X", "message": "Y"}`              (flat)
319///   `{"error": "message string"}`                (xAI, finnhub flat)
320///   `{"message": "Y"}`                           (generic)
321///
322/// Returns `(error_type, error_message)` where each is Some when extractable.
323pub fn parse_upstream_error(body: &str) -> (Option<String>, Option<String>) {
324    // Cheap early-out for non-JSON bodies (HTML error pages from load
325    // balancers, plaintext "Bad Gateway", empty strings). Avoids allocating
326    // for serde_json::from_str on every proxy error.
327    let trimmed = body.trim_start();
328    if !trimmed.starts_with('{') && !trimmed.starts_with('[') {
329        return (None, None);
330    }
331    let v: serde_json::Value = match serde_json::from_str(body) {
332        Ok(v) => v,
333        Err(_) => return (None, None),
334    };
335
336    let (error_type, error_message) = match v {
337        serde_json::Value::Object(ref map) => {
338            let err_field = map.get("error");
339            let error_type = err_field
340                .and_then(|e| e.get("type"))
341                .and_then(|t| t.as_str())
342                .map(str::to_string)
343                .or_else(|| map.get("type").and_then(|t| t.as_str()).map(str::to_string))
344                .or_else(|| {
345                    map.get("error_type")
346                        .and_then(|t| t.as_str())
347                        .map(str::to_string)
348                });
349
350            let error_message = err_field
351                .and_then(|e| e.get("message"))
352                .and_then(|m| m.as_str())
353                .map(str::to_string)
354                .or_else(|| {
355                    // `error` is a string itself (xAI-style).
356                    err_field.and_then(|e| e.as_str()).map(str::to_string)
357                })
358                .or_else(|| {
359                    map.get("message")
360                        .and_then(|m| m.as_str())
361                        .map(str::to_string)
362                });
363
364            (error_type, error_message)
365        }
366        _ => (None, None),
367    };
368
369    (error_type, error_message)
370}
371
372/// True when a 404 body looks like a legit "no records" response the caller
373/// should treat as an empty result, not an error.
374///
375/// Requires the message to match one of the known "No X (were )?found"
376/// phrases. The `error.type == "not_found"` hint **narrows** rather than
377/// short-circuits: a response with `type: not_found` and no recognizable
378/// message is treated as a real error so we don't silently convert genuine
379/// 404s (removed endpoints, mistyped routes) to empty results.
380pub fn is_no_records_body(error_type: Option<&str>, error_message: Option<&str>) -> bool {
381    let msg = match error_message {
382        Some(m) => m.trim(),
383        None => return false,
384    };
385    let lower = msg.to_ascii_lowercase();
386    let lower = lower.trim_start_matches("no ");
387    let keywords = [
388        "records were found",
389        "companies were found",
390        "persons were found",
391        "results were found",
392        "matches were found",
393        "records found",
394        "companies found",
395        "persons found",
396        "results found",
397        "matches found",
398    ];
399    let message_matches = keywords.iter().any(|k| lower.starts_with(k));
400    if message_matches {
401        return true;
402    }
403    // Accept `type: not_found` only when accompanied by a short, generic
404    // "not found" message (no specific resource-name in the text). This keeps
405    // PDL's `{type: not_found, message: "No records were found..."}` flowing
406    // through (already matched above) while still catching providers that
407    // send `{type: not_found, message: "not found"}` without being specific
408    // about what was missing.
409    if matches!(error_type, Some("not_found")) {
410        return lower == "not found" || lower.is_empty();
411    }
412    false
413}
414
415/// Attach structured tags + class-based dynamic title to the current Sentry
416/// scope and emit a tracing event at the appropriate level for the upstream
417/// status class.
418///
419/// Buckets (event title literal):
420///   - `auth_error`      → 401, 403, 407 (Error)
421///   - `bad_input`       → 400, 404 (non-no-records), 422 (Error)
422///   - `quota`           → 402 (Warning; dropped by `before_send`)
423///   - `rate_limited`    → 429 (Warning; dropped by `before_send`)
424///   - `server_error`    → 5xx (Error)
425///   - `transport_error` → 0 / unknown (Error)
426///
427/// Each title is a distinct literal so `sentry-tracing` creates a separate
428/// Sentry issue per class. The (provider, operation_id, status) live as
429/// tags so each bucket stays per-provider searchable without splitting
430/// into one bucket per provider.
431///
432/// When the `sentry` feature is off, emits the tracing event only.
433pub fn report_upstream_error(
434    provider: &str,
435    operation_id: &str,
436    upstream_status: u16,
437    proxy_status: u16,
438    error_type: Option<&str>,
439    error_message: Option<&str>,
440) {
441    let msg_short = error_message
442        .map(|m| scrub_and_truncate(m, 140))
443        .unwrap_or_default();
444    let class = UpstreamErrorClass::classify(upstream_status);
445
446    // `sentry::with_scope` pushes a temporary scope for the duration of the
447    // closure, then pops it — so tags never leak across requests running on
448    // the same tokio worker thread. The tracing macros inside the closure
449    // are picked up by `sentry_tracing::layer()` and emitted with these
450    // tags attached. `class` becomes its own tag so operators can filter
451    // a single bucket by error shape without grouping each subclass into
452    // its own issue.
453    with_upstream_scope(
454        provider,
455        operation_id,
456        upstream_status,
457        proxy_status,
458        error_type,
459        &msg_short,
460        class,
461        || {
462            emit_classified(
463                class,
464                provider,
465                operation_id,
466                upstream_status,
467                proxy_status,
468                error_type,
469                &msg_short,
470            )
471        },
472    );
473}
474
475/// Each arm carries its own LITERAL `tracing::*!` message because the
476/// `sentry-tracing` bridge maps the literal to the Sentry event title,
477/// and Sentry groups events by title. Five literals → five distinct
478/// Sentry issue buckets. All the variable detail lives in fields.
479fn emit_classified(
480    class: UpstreamErrorClass,
481    provider: &str,
482    operation_id: &str,
483    upstream_status: u16,
484    proxy_status: u16,
485    error_type: Option<&str>,
486    msg_short: &str,
487) {
488    let error_type = error_type.unwrap_or("");
489    match class {
490        UpstreamErrorClass::AuthError => tracing::error!(
491            provider,
492            operation_id,
493            upstream_status,
494            proxy_status,
495            class = class.as_tag(),
496            error_type,
497            msg = %msg_short,
498            "upstream auth_error"
499        ),
500        UpstreamErrorClass::BadInput => tracing::error!(
501            provider,
502            operation_id,
503            upstream_status,
504            proxy_status,
505            class = class.as_tag(),
506            error_type,
507            msg = %msg_short,
508            "upstream bad_input"
509        ),
510        UpstreamErrorClass::Quota => tracing::warn!(
511            provider,
512            operation_id,
513            upstream_status,
514            proxy_status,
515            class = class.as_tag(),
516            error_type,
517            msg = %msg_short,
518            "upstream quota"
519        ),
520        UpstreamErrorClass::RateLimited => tracing::warn!(
521            provider,
522            operation_id,
523            upstream_status,
524            proxy_status,
525            class = class.as_tag(),
526            error_type,
527            msg = %msg_short,
528            "upstream rate_limited"
529        ),
530        UpstreamErrorClass::ServerError => tracing::error!(
531            provider,
532            operation_id,
533            upstream_status,
534            proxy_status,
535            class = class.as_tag(),
536            error_type,
537            msg = %msg_short,
538            "upstream server_error"
539        ),
540        UpstreamErrorClass::TransportError => tracing::error!(
541            provider,
542            operation_id,
543            upstream_status,
544            proxy_status,
545            class = class.as_tag(),
546            error_type,
547            msg = %msg_short,
548            "upstream transport_error"
549        ),
550    }
551}
552
553/// Attach the structured Rust error (with its full `source()` chain) to
554/// the current Sentry scope and capture it as an exception event. This
555/// is what gives the issue page a real "Exception" block with the
556/// reqwest / `HttpError` / `McpError` source chain — without this, all
557/// Sentry sees is the tracing-bridge's flat title + tags.
558///
559/// Pair with `report_upstream_error`: that one creates the
560/// titled/grouped issue, this one attaches the structured context.
561/// When the `sentry` feature is off, this is a no-op.
562#[allow(unused_variables)]
563pub fn capture_error_with_scope(
564    err: &(dyn std::error::Error + 'static),
565    provider: &str,
566    operation_id: &str,
567    upstream_status: u16,
568    proxy_status: u16,
569    error_type: Option<&str>,
570    error_message: Option<&str>,
571) {
572    #[cfg(feature = "sentry")]
573    {
574        let msg_short = error_message
575            .map(|m| scrub_and_truncate(m, 140))
576            .unwrap_or_default();
577        let class = UpstreamErrorClass::classify(upstream_status);
578        // Skip quota / rate-limit classes — they're already dropped by
579        // before_send and we don't want to spend the quota encoding the
580        // backtrace just to throw it away.
581        if class.is_quota_class() {
582            return;
583        }
584        // Build the exception event manually so we can pin its `message`
585        // to the same class literal the tracing-bridge event uses.
586        //
587        // Why this matters: `report_upstream_error` fires a tracing event
588        // whose title is the class literal (e.g. `"upstream bad_input"`),
589        // and Sentry groups it by that title under the shared fingerprint.
590        // If we let `sentry::capture_error` build the second event with
591        // its default behavior, Sentry sets the new event's title from
592        // `exception[0].value` (the underlying error's Display, like
593        // `"HttpError: ApiError { status: 400, … }"`). Sentry's "latest
594        // event wins for title" rule then drifts the operator-visible
595        // bucket title away from the clean class literal — the headline
596        // improvement the PR is built around (Greptile P2 on PR #137).
597        //
598        // Fix: set `Event.message` to the class literal up front. The
599        // exception chain is still attached, the fingerprint still
600        // groups, and the title stays stable.
601        let class_literal = match class {
602            UpstreamErrorClass::AuthError => "upstream auth_error",
603            UpstreamErrorClass::BadInput => "upstream bad_input",
604            UpstreamErrorClass::Quota => "upstream quota",
605            UpstreamErrorClass::RateLimited => "upstream rate_limited",
606            UpstreamErrorClass::ServerError => "upstream server_error",
607            UpstreamErrorClass::TransportError => "upstream transport_error",
608        };
609        let mut event = sentry::event_from_error(err);
610        event.message = Some(class_literal.to_string());
611        with_upstream_scope(
612            provider,
613            operation_id,
614            upstream_status,
615            proxy_status,
616            error_type,
617            &msg_short,
618            class,
619            || {
620                sentry::capture_event(event);
621            },
622        );
623    }
624}
625
626/// Attach JWT identity (sub, sandbox_id, job_id) to the current Sentry
627/// scope so every event raised during this request is searchable by
628/// those facets. Set once at the start of `/call`, `/mcp`, and `/help`
629/// handlers.
630///
631/// **Tags MUST be cleared, not just conditionally set.** `configure_scope`
632/// mutates the thread-local hub scope, and tokio reuses worker threads
633/// across requests. If request A had `sandbox_id = "alice"` and request
634/// B on the same worker thread has no `sandbox_id` in its JWT, a naive
635/// `if let Some(id) = ... { set_tag(...) }` leaves "alice" stamped on
636/// every event raised during request B — a multi-tenant data bleed
637/// flagged by Greptile P1 on PR #137. The fix: unconditionally
638/// `remove_tag` for every optional field at the top of the helper, then
639/// set the ones we actually have.
640///
641/// `sub` is the only required JWT claim (auth middleware would reject
642/// the request otherwise), so we always set it. `user` is similarly
643/// always present.
644#[allow(unused_variables)]
645pub fn set_jwt_sentry_scope(claims: &TokenClaims) {
646    #[cfg(feature = "sentry")]
647    sentry::configure_scope(|scope| {
648        // Clear optional tags from any prior request that ran on this
649        // worker thread before stamping the new request's identity.
650        // remove_tag is a no-op if the tag isn't present, so it's safe
651        // to call on cold workers too.
652        scope.remove_tag("sandbox_id");
653        scope.remove_tag("job_id");
654
655        scope.set_tag("sub", claims.sub.as_str());
656        if let Some(ref id) = claims.sandbox_id {
657            scope.set_tag("sandbox_id", id.as_str());
658        }
659        if let Some(ref id) = claims.job_id {
660            scope.set_tag("job_id", id.as_str());
661        }
662        // Sentry's `user.id` is the standard slot for the auth subject —
663        // populating it lets the issue page show "users affected" counts
664        // and gives the search UI a first-class facet. set_user(Some(..))
665        // overwrites any prior value so no `set_user(None)` clear is
666        // needed (unlike the optional tags above which would silently
667        // bleed through).
668        scope.set_user(Some(sentry::User {
669            id: Some(claims.sub.clone()),
670            ..Default::default()
671        }));
672    });
673}
674
675// 8 parameters by design: each is a distinct Sentry tag value resolved
676// at the call site. Bundling into a struct would just move the noise
677// without removing it, and the helper is private. Same allow on the
678// no-sentry stub below for signature parity.
679#[cfg(feature = "sentry")]
680#[allow(clippy::too_many_arguments)]
681fn with_upstream_scope<F: FnOnce()>(
682    provider: &str,
683    operation_id: &str,
684    upstream_status: u16,
685    proxy_status: u16,
686    error_type: Option<&str>,
687    msg_short: &str,
688    class: UpstreamErrorClass,
689    body: F,
690) {
691    let upstream_s = upstream_status.to_string();
692    let proxy_s = proxy_status.to_string();
693    let class_tag = class.as_tag();
694    sentry::with_scope(
695        |scope| {
696            scope.set_tag("provider", provider);
697            scope.set_tag("operation_id", operation_id);
698            scope.set_tag("upstream_status", &upstream_s);
699            scope.set_tag("proxy_status", &proxy_s);
700            scope.set_tag("upstream_error_class", class_tag);
701            if let Some(t) = error_type {
702                scope.set_tag("upstream_error_type", t);
703            }
704            if !msg_short.is_empty() {
705                scope.set_extra(
706                    "upstream_error_message",
707                    serde_json::Value::String(msg_short.to_string()),
708                );
709            }
710            // Fingerprint keys on the error CLASS, not the upstream status,
711            // so a provider that returns 502 vs 504 doesn't fork into two
712            // buckets. Combined with the per-class literal title, this
713            // gives one issue per (provider, operation, class) — fine-
714            // grained enough to alert on a specific tool going bad but
715            // not so fine that every transient blip is a new issue.
716            scope.set_fingerprint(Some(
717                [
718                    "ati.proxy.upstream_error",
719                    provider,
720                    operation_id,
721                    class_tag,
722                ]
723                .as_slice(),
724            ));
725        },
726        body,
727    );
728}
729
730#[cfg(not(feature = "sentry"))]
731#[allow(clippy::too_many_arguments)]
732fn with_upstream_scope<F: FnOnce()>(
733    _provider: &str,
734    _operation_id: &str,
735    _upstream_status: u16,
736    _proxy_status: u16,
737    _error_type: Option<&str>,
738    _msg_short: &str,
739    _class: UpstreamErrorClass,
740    body: F,
741) {
742    body();
743}
744
745#[cfg(test)]
746mod tests {
747    use super::*;
748
749    // --- provider_and_op: the path call sites should actually use ---
750
751    #[test]
752    fn provider_and_op_explicit_colon() {
753        assert_eq!(
754            provider_and_op("finnhub", "finnhub:price_target"),
755            ("finnhub".into(), "price_target".into())
756        );
757    }
758
759    #[test]
760    fn provider_and_op_strips_underscore_prefix() {
761        // The bug from the user's screenshot: PDL ships flat tool names
762        // like `pdl_person_enrichment`. The fix peels the provider as a
763        // `{provider}_` prefix when present.
764        assert_eq!(
765            provider_and_op("pdl", "pdl_person_enrichment"),
766            ("pdl".into(), "person_enrichment".into())
767        );
768    }
769
770    #[test]
771    fn provider_and_op_strips_colon_prefix() {
772        assert_eq!(
773            provider_and_op("github", "github:search_repositories"),
774            ("github".into(), "search_repositories".into())
775        );
776    }
777
778    #[test]
779    fn provider_and_op_no_prefix_keeps_tool_name() {
780        // No prefix to strip — operation is the tool name verbatim. This
781        // is the correct behavior; the OLD code returned "unknown" here,
782        // which clumped every prefixless-tool failure into one mega-
783        // bucket and forced the on-call to read the body to find out
784        // which tool failed.
785        assert_eq!(
786            provider_and_op("parallel", "web_search"),
787            ("parallel".into(), "web_search".into())
788        );
789    }
790
791    #[test]
792    fn provider_and_op_empty_op_after_strip_keeps_tool_name() {
793        // Edge case: `pdl_` — stripping leaves an empty operation, so we
794        // keep the tool_name as-is to avoid generating empty tags.
795        assert_eq!(
796            provider_and_op("pdl", "pdl_"),
797            ("pdl".into(), "pdl_".into())
798        );
799    }
800
801    // --- split_tool_name back-compat shim (no longer fabricates "unknown") ---
802
803    #[test]
804    fn split_tool_name_colon_form() {
805        assert_eq!(
806            split_tool_name("finnhub:price_target"),
807            ("finnhub".into(), "price_target".into())
808        );
809    }
810
811    #[test]
812    fn split_tool_name_flat_returns_tool_name_as_op() {
813        // The OLD behavior was `("bare_tool", "unknown")` — that lied
814        // about what operation failed. New behavior keeps the tool name
815        // as the op; the caller is expected to overwrite the empty
816        // provider with the registry-resolved name.
817        assert_eq!(
818            split_tool_name("bare_tool"),
819            ("".into(), "bare_tool".into())
820        );
821    }
822
823    // --- UpstreamErrorClass: classification ---
824
825    #[test]
826    fn classify_400_is_bad_input() {
827        assert_eq!(
828            UpstreamErrorClass::classify(400),
829            UpstreamErrorClass::BadInput
830        );
831        assert_eq!(
832            UpstreamErrorClass::classify(404),
833            UpstreamErrorClass::BadInput
834        );
835        assert_eq!(
836            UpstreamErrorClass::classify(422),
837            UpstreamErrorClass::BadInput
838        );
839    }
840
841    #[test]
842    fn classify_401_is_auth_error() {
843        assert_eq!(
844            UpstreamErrorClass::classify(401),
845            UpstreamErrorClass::AuthError
846        );
847        assert_eq!(
848            UpstreamErrorClass::classify(403),
849            UpstreamErrorClass::AuthError
850        );
851        assert_eq!(
852            UpstreamErrorClass::classify(407),
853            UpstreamErrorClass::AuthError
854        );
855    }
856
857    #[test]
858    fn classify_402_is_quota() {
859        assert_eq!(UpstreamErrorClass::classify(402), UpstreamErrorClass::Quota);
860        assert!(UpstreamErrorClass::Quota.is_quota_class());
861    }
862
863    #[test]
864    fn classify_429_is_rate_limited() {
865        assert_eq!(
866            UpstreamErrorClass::classify(429),
867            UpstreamErrorClass::RateLimited
868        );
869        assert!(UpstreamErrorClass::RateLimited.is_quota_class());
870    }
871
872    #[test]
873    fn classify_5xx_is_server_error() {
874        assert_eq!(
875            UpstreamErrorClass::classify(500),
876            UpstreamErrorClass::ServerError
877        );
878        assert_eq!(
879            UpstreamErrorClass::classify(503),
880            UpstreamErrorClass::ServerError
881        );
882        assert_eq!(
883            UpstreamErrorClass::classify(599),
884            UpstreamErrorClass::ServerError
885        );
886        assert!(!UpstreamErrorClass::ServerError.is_quota_class());
887    }
888
889    #[test]
890    fn classify_zero_is_transport_error() {
891        // The proxy passes `upstream_status = 0` for MCP/CLI/transport
892        // failures with no HTTP exchange. Those need their own bucket.
893        assert_eq!(
894            UpstreamErrorClass::classify(0),
895            UpstreamErrorClass::TransportError
896        );
897    }
898
899    #[test]
900    fn class_tags_are_stable_strings() {
901        assert_eq!(UpstreamErrorClass::AuthError.as_tag(), "auth_error");
902        assert_eq!(UpstreamErrorClass::BadInput.as_tag(), "bad_input");
903        assert_eq!(UpstreamErrorClass::Quota.as_tag(), "quota");
904        assert_eq!(UpstreamErrorClass::RateLimited.as_tag(), "rate_limited");
905        assert_eq!(UpstreamErrorClass::ServerError.as_tag(), "server_error");
906        assert_eq!(
907            UpstreamErrorClass::TransportError.as_tag(),
908            "transport_error"
909        );
910    }
911
912    #[test]
913    fn parse_nested_pdl_body() {
914        let body = r#"{"status":404,"error":{"type":"not_found","message":"No records were found matching your request"}}"#;
915        let (t, m) = parse_upstream_error(body);
916        assert_eq!(t.as_deref(), Some("not_found"));
917        assert_eq!(
918            m.as_deref(),
919            Some("No records were found matching your request")
920        );
921    }
922
923    #[test]
924    fn parse_flat_xai_style_body() {
925        let body = r#"{"error":"Insufficient credits","message":"Your current balance is $0.01"}"#;
926        let (t, m) = parse_upstream_error(body);
927        assert!(t.is_none());
928        assert_eq!(m.as_deref(), Some("Insufficient credits"));
929    }
930
931    #[test]
932    fn parse_non_json_body() {
933        let (t, m) = parse_upstream_error("not json at all");
934        assert!(t.is_none());
935        assert!(m.is_none());
936    }
937
938    #[test]
939    fn no_records_type_alone_does_not_match() {
940        // `type: not_found` without a message is ambiguous — could be a
941        // genuine missing-resource 404 (removed endpoint, mistyped route).
942        // Don't silently convert it to an empty result.
943        assert!(!is_no_records_body(Some("not_found"), None));
944        assert!(!is_no_records_body(
945            Some("not_found"),
946            Some("User account 42 was deleted")
947        ));
948    }
949
950    #[test]
951    fn no_records_type_with_generic_not_found_message_matches() {
952        // `type: not_found` + generic message → treat as empty result.
953        assert!(is_no_records_body(Some("not_found"), Some("not found")));
954        assert!(is_no_records_body(Some("not_found"), Some("")));
955    }
956
957    #[test]
958    fn no_records_message_matches() {
959        assert!(is_no_records_body(
960            None,
961            Some("No records were found matching your request")
962        ));
963        assert!(is_no_records_body(
964            None,
965            Some("No companies were found matching your request")
966        ));
967        assert!(is_no_records_body(None, Some("no results found")));
968    }
969
970    #[test]
971    fn no_records_rejects_real_errors() {
972        assert!(!is_no_records_body(Some("invalid_request"), None));
973        assert!(!is_no_records_body(None, Some("Insufficient credits")));
974        assert!(!is_no_records_body(None, Some("Forbidden")));
975        assert!(!is_no_records_body(None, None));
976    }
977
978    #[test]
979    fn scrub_uuid() {
980        let s = "request id 550e8400-e29b-41d4-a716-446655440000 failed";
981        assert_eq!(scrub(s), "request id *** failed");
982    }
983
984    #[test]
985    fn scrub_email() {
986        assert_eq!(scrub("contact miguel@parcha.ai now"), "contact *** now");
987    }
988
989    #[test]
990    fn scrub_ipv4() {
991        assert_eq!(scrub("from 192.168.1.1 blocked"), "from *** blocked");
992    }
993
994    #[test]
995    fn scrub_ipv4_rejects_version_strings() {
996        // Version strings with more than 4 groups or trailing digits should
997        // not be scrubbed — left/right boundary guards prevent false-positives.
998        assert_eq!(
999            scrub("library 1.2.3.4.5 raised an error"),
1000            "library 1.2.3.4.5 raised an error"
1001        );
1002        assert_eq!(scrub("version 10.11.12.13.0"), "version 10.11.12.13.0");
1003    }
1004
1005    #[test]
1006    fn scrub_ipv4_rejects_out_of_range_octets() {
1007        // 999.999.999.999 isn't a valid IPv4 address.
1008        assert_eq!(
1009            scrub("bogus 999.999.999.999 ip"),
1010            "bogus 999.999.999.999 ip"
1011        );
1012    }
1013
1014    #[test]
1015    fn scrub_long_hex_token() {
1016        // 40-char hex (GitHub token length)
1017        let tok = "abcdef0123456789abcdef0123456789abcdef01";
1018        assert_eq!(scrub(&format!("token {tok} bad")), "token *** bad");
1019    }
1020
1021    #[test]
1022    fn scrub_preserves_short_hex() {
1023        // Don't scrub short hex sequences (e.g. "abc123" is not a token).
1024        assert_eq!(scrub("hex abc123 fine"), "hex abc123 fine");
1025    }
1026
1027    #[test]
1028    fn scrub_preserves_multibyte_utf8() {
1029        // Regression: byte-as-char casting corrupted non-ASCII.
1030        assert_eq!(scrub("café résumé 日本語"), "café résumé 日本語");
1031    }
1032
1033    #[test]
1034    fn scrub_mixed_utf8_and_secrets() {
1035        let input = "café contact miguel@parcha.ai résumé";
1036        assert_eq!(scrub(input), "café contact *** résumé");
1037    }
1038
1039    #[test]
1040    fn parse_non_json_html_body_early_outs() {
1041        // Load-balancer HTML error pages are common on 502/503. Should not
1042        // attempt JSON parsing at all.
1043        let (t, m) = parse_upstream_error("<html><body>502 Bad Gateway</body></html>");
1044        assert!(t.is_none());
1045        assert!(m.is_none());
1046    }
1047
1048    #[test]
1049    fn parse_empty_body_returns_none() {
1050        let (t, m) = parse_upstream_error("");
1051        assert!(t.is_none());
1052        assert!(m.is_none());
1053    }
1054
1055    #[test]
1056    fn truncate_long_message() {
1057        let s = "a".repeat(500);
1058        let out = scrub_and_truncate(&s, 20);
1059        assert_eq!(out.chars().count(), 20);
1060        assert!(out.ends_with('…'));
1061    }
1062
1063    #[test]
1064    fn truncate_short_message_untouched() {
1065        assert_eq!(scrub_and_truncate("short", 100), "short");
1066    }
1067
1068    // --- Greptile P2 on #137: title pinning on the capture_error path ---
1069
1070    /// Mirrors what `capture_error_with_scope` does internally to build
1071    /// the event it captures. Test surface for the title-pinning fix:
1072    /// without setting `message`, the issue title would drift to the
1073    /// raw error Display and overwrite the clean class literal that
1074    /// `report_upstream_error` is built around.
1075    #[cfg(feature = "sentry")]
1076    fn build_captured_event_for_test(
1077        err: &(dyn std::error::Error + 'static),
1078        class: UpstreamErrorClass,
1079    ) -> sentry::protocol::Event<'static> {
1080        let class_literal = match class {
1081            UpstreamErrorClass::AuthError => "upstream auth_error",
1082            UpstreamErrorClass::BadInput => "upstream bad_input",
1083            UpstreamErrorClass::Quota => "upstream quota",
1084            UpstreamErrorClass::RateLimited => "upstream rate_limited",
1085            UpstreamErrorClass::ServerError => "upstream server_error",
1086            UpstreamErrorClass::TransportError => "upstream transport_error",
1087        };
1088        let mut event = sentry::event_from_error(err);
1089        event.message = Some(class_literal.to_string());
1090        event
1091    }
1092
1093    #[cfg(feature = "sentry")]
1094    #[test]
1095    fn captured_event_message_pins_class_literal_not_error_display() {
1096        // The user-visible bug Greptile flagged: `sentry::event_from_error`
1097        // leaves `Event.message = None`, and Sentry's UI falls back to
1098        // the underlying error's Display for the issue title. Two events
1099        // share the same fingerprint, but the second one's Display
1100        // clobbers the first's clean class literal in the title bar.
1101        //
1102        // Fix shape: pin `Event.message` to the class literal BEFORE
1103        // capturing the event. The exception block is unchanged, so the
1104        // chain still ships; only the title is pinned.
1105        let inner = std::io::Error::other("operation timed out");
1106        let event = build_captured_event_for_test(&inner, UpstreamErrorClass::BadInput);
1107        assert_eq!(event.message.as_deref(), Some("upstream bad_input"));
1108        // Sanity: the exception chain is still there (the whole point of
1109        // calling `capture_error` in the first place).
1110        assert!(
1111            !event.exception.is_empty(),
1112            "expected the exception block to be populated by event_from_error"
1113        );
1114        // And the inner error's Display is preserved in the exception
1115        // value, just not bleeding into the title field.
1116        let exc = event
1117            .exception
1118            .values
1119            .first()
1120            .expect("at least one exception");
1121        assert_eq!(exc.value.as_deref(), Some("operation timed out"));
1122    }
1123
1124    // --- Greptile P1 on #137: set_jwt_sentry_scope tag-clear regression ---
1125    //
1126    // Async runtime workers are reused across requests, and
1127    // `sentry::configure_scope` mutates the per-thread hub permanently.
1128    // Without an explicit `remove_tag`, a prior request's `sandbox_id` /
1129    // `job_id` would bleed into the next request on the same worker
1130    // thread — a multi-tenant data leak. These tests use sentry's test
1131    // transport to inspect actual emitted events.
1132    #[cfg(feature = "sentry")]
1133    #[test]
1134    fn set_jwt_sentry_scope_clears_stale_sandbox_id_between_requests() {
1135        let events = sentry::test::with_captured_events(|| {
1136            // Request A: full identity.
1137            let claims_a = TokenClaims {
1138                iss: None,
1139                sub: "agent_alice".into(),
1140                aud: "ati-proxy".into(),
1141                iat: 0,
1142                exp: 0,
1143                jti: None,
1144                scope: String::new(),
1145                ati: None,
1146                sandbox_id: Some("sandbox_alpha".into()),
1147                job_id: Some("job_42".into()),
1148            };
1149            set_jwt_sentry_scope(&claims_a);
1150            sentry::capture_message("during request A", sentry::Level::Error);
1151
1152            // Request B on the same worker thread: claims have no
1153            // sandbox_id and no job_id. The stale values from request
1154            // A MUST NOT bleed onto request B's events.
1155            let claims_b = TokenClaims {
1156                iss: None,
1157                sub: "agent_bob".into(),
1158                aud: "ati-proxy".into(),
1159                iat: 0,
1160                exp: 0,
1161                jti: None,
1162                scope: String::new(),
1163                ati: None,
1164                sandbox_id: None,
1165                job_id: None,
1166            };
1167            set_jwt_sentry_scope(&claims_b);
1168            sentry::capture_message("during request B", sentry::Level::Error);
1169        });
1170
1171        assert_eq!(events.len(), 2, "expected two captured events");
1172        let a = events
1173            .iter()
1174            .find(|e| e.message.as_deref() == Some("during request A"))
1175            .expect("event A");
1176        let b = events
1177            .iter()
1178            .find(|e| e.message.as_deref() == Some("during request B"))
1179            .expect("event B");
1180
1181        // Request A's event has the full identity.
1182        assert_eq!(a.tags.get("sub").map(String::as_str), Some("agent_alice"));
1183        assert_eq!(
1184            a.tags.get("sandbox_id").map(String::as_str),
1185            Some("sandbox_alpha")
1186        );
1187        assert_eq!(a.tags.get("job_id").map(String::as_str), Some("job_42"));
1188
1189        // Request B's event has ONLY the sub it carried. No bleed.
1190        assert_eq!(b.tags.get("sub").map(String::as_str), Some("agent_bob"));
1191        assert!(
1192            !b.tags.contains_key("sandbox_id"),
1193            "sandbox_id from prior request bled through: {:?}",
1194            b.tags.get("sandbox_id")
1195        );
1196        assert!(
1197            !b.tags.contains_key("job_id"),
1198            "job_id from prior request bled through: {:?}",
1199            b.tags.get("job_id")
1200        );
1201    }
1202
1203    #[cfg(feature = "sentry")]
1204    #[test]
1205    fn set_jwt_sentry_scope_replaces_user_between_requests() {
1206        // `set_user(Some(..))` always overwrites the prior value, so a
1207        // dedicated `set_user(None)` clear isn't needed. This test pins
1208        // that invariant so future refactors don't silently break it.
1209        fn claims_for(sub: &str) -> TokenClaims {
1210            TokenClaims {
1211                iss: None,
1212                sub: sub.into(),
1213                aud: "ati-proxy".into(),
1214                iat: 0,
1215                exp: 0,
1216                jti: None,
1217                scope: String::new(),
1218                ati: None,
1219                sandbox_id: None,
1220                job_id: None,
1221            }
1222        }
1223        let events = sentry::test::with_captured_events(|| {
1224            set_jwt_sentry_scope(&claims_for("agent_alice"));
1225            sentry::capture_message("A", sentry::Level::Error);
1226
1227            set_jwt_sentry_scope(&claims_for("agent_bob"));
1228            sentry::capture_message("B", sentry::Level::Error);
1229        });
1230
1231        let a = events
1232            .iter()
1233            .find(|e| e.message.as_deref() == Some("A"))
1234            .unwrap();
1235        let b = events
1236            .iter()
1237            .find(|e| e.message.as_deref() == Some("B"))
1238            .unwrap();
1239        assert_eq!(
1240            a.user.as_ref().and_then(|u| u.id.as_deref()),
1241            Some("agent_alice")
1242        );
1243        assert_eq!(
1244            b.user.as_ref().and_then(|u| u.id.as_deref()),
1245            Some("agent_bob"),
1246            "Request B's user.id should be its own sub, not Alice's stale value"
1247        );
1248    }
1249
1250    #[cfg(feature = "sentry")]
1251    #[test]
1252    fn captured_event_message_pins_every_class() {
1253        // Lock in the class → literal mapping so future renames of the
1254        // tracing-event literals don't silently desync with the
1255        // capture-error path.
1256        let err = std::io::Error::other("x");
1257        let cases = [
1258            (UpstreamErrorClass::AuthError, "upstream auth_error"),
1259            (UpstreamErrorClass::BadInput, "upstream bad_input"),
1260            (UpstreamErrorClass::Quota, "upstream quota"),
1261            (UpstreamErrorClass::RateLimited, "upstream rate_limited"),
1262            (UpstreamErrorClass::ServerError, "upstream server_error"),
1263            (
1264                UpstreamErrorClass::TransportError,
1265                "upstream transport_error",
1266            ),
1267        ];
1268        for (class, expected_title) in cases {
1269            let event = build_captured_event_for_test(&err, class);
1270            assert_eq!(
1271                event.message.as_deref(),
1272                Some(expected_title),
1273                "class {class:?} should pin title to {expected_title:?}"
1274            );
1275        }
1276    }
1277}