Skip to main content

zeph_core/quality/
parser.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Best-effort JSON parser with one retry for use in the self-check pipeline.
5//!
6//! LLMs frequently wrap JSON output in markdown fences or prepend prose. This module
7//! strips those artifacts before deserializing.
8
9use std::time::Duration;
10
11use serde::de::DeserializeOwned;
12use thiserror::Error;
13use zeph_llm::any::AnyProvider;
14use zeph_llm::provider::{LlmProvider, Message, MessageMetadata, Role};
15
16#[non_exhaustive]
17/// Errors from the parser.
18#[derive(Debug, Error)]
19pub enum ParseError {
20    #[error("no opening brace found in output")]
21    NoBraceSpan,
22    #[error("JSON parse failed: {0}")]
23    Json(#[from] serde_json::Error),
24}
25
26#[non_exhaustive]
27/// Errors from `chat_json` (wraps [`ParseError`] and provider/timeout errors).
28#[derive(Debug, Error)]
29pub enum ChatJsonError {
30    #[error("LLM error: {0}")]
31    Llm(#[from] zeph_llm::LlmError),
32    #[error("timed out after {0}ms")]
33    Timeout(u64),
34    #[error("failed to parse JSON after 2 attempts; last raw (truncated): {0}")]
35    Parse(String),
36}
37
/// Remove a surrounding markdown code fence from LLM output.
///
/// The input is trimmed first. If it does not start with a triple backtick it
/// is returned as-is (trimmed). Otherwise the opening fence line (including any
/// info string such as `json`) is dropped, everything from the *last* triple
/// backtick onwards is discarded, and the remainder is trimmed. A missing
/// closing fence is tolerated.
fn strip_fences(raw: &str) -> &str {
    let text = raw.trim();
    let Some(fenced) = text.strip_prefix("```") else {
        return text;
    };

    // Skip the rest of the opening fence line; without a newline there is no
    // separate info line to drop, so keep everything after the backticks.
    let body = match fenced.split_once('\n') {
        Some((_info, remainder)) => remainder,
        None => fenced,
    };

    // Cut at the last fence so trailing prose after the block is ignored.
    let body = match body.rfind("```") {
        Some(fence_at) => &body[..fence_at],
        None => body,
    };
    body.trim()
}
54
/// Find the first balanced `{...}` or `[...]` span in `s`.
///
/// The scan starts at the first `{` or `[` and tracks nesting depth for that
/// bracket kind only. Bytes inside JSON string literals (between unescaped
/// double quotes) are ignored, so a brace or bracket that appears in a string
/// value cannot open or close the span. Backslash escapes inside strings are
/// honoured, which keeps `\"` from ending the literal early.
///
/// Returns `None` when `s` contains no opener or when the opener is never
/// balanced (including the case of an unterminated string literal).
fn find_first_brace_span(s: &str) -> Option<&str> {
    let open = s.find(['{', '['])?;
    let bytes = s.as_bytes();
    let opener = bytes[open];
    let closer = if opener == b'{' { b'}' } else { b']' };

    let mut depth = 0usize;
    let mut in_string = false;
    let mut escaped = false;

    // All bytes of interest are ASCII, so byte-wise scanning is UTF-8 safe:
    // continuation bytes of multi-byte characters never compare equal to them.
    for (i, &b) in bytes.iter().enumerate().skip(open) {
        if in_string {
            if escaped {
                // The byte following a backslash is consumed verbatim.
                escaped = false;
            } else if b == b'\\' {
                escaped = true;
            } else if b == b'"' {
                in_string = false;
            }
            continue;
        }

        if b == b'"' {
            in_string = true;
        } else if b == opener {
            depth += 1;
        } else if b == closer {
            // `depth` is at least 1 here: the scan begins on the opener and
            // returns as soon as the depth drops back to zero.
            depth -= 1;
            if depth == 0 {
                return Some(&s[open..=i]);
            }
        }
    }
    None
}
77
78/// Parse JSON from a raw LLM string, stripping fences and finding the first brace span.
79///
80/// # Errors
81///
82/// Returns [`ParseError`] if no brace span is found or JSON deserialization fails.
83pub fn parse_json<T: DeserializeOwned>(raw: &str) -> Result<T, ParseError> {
84    let stripped = strip_fences(raw);
85    let span = find_first_brace_span(stripped).ok_or(ParseError::NoBraceSpan)?;
86    Ok(serde_json::from_str(span)?)
87}
88
89/// Build a two-message `[system, user]` slice for a provider call.
90fn build_messages(system: &str, user: &str) -> Vec<Message> {
91    vec![
92        Message {
93            role: Role::System,
94            content: system.to_owned(),
95            parts: vec![],
96            metadata: MessageMetadata::default(),
97        },
98        Message {
99            role: Role::User,
100            content: user.to_owned(),
101            parts: vec![],
102            metadata: MessageMetadata::default(),
103        },
104    ]
105}
106
/// Rough token estimate for `s`, assuming about four bytes per token.
///
/// Computed as the UTF-8 byte length of `s` divided by four, rounded up, so
/// the empty string yields `0` and any non-empty string yields at least `1`.
#[must_use]
pub fn approx_tokens(s: &str) -> u64 {
    let byte_len = s.len() as u64;
    byte_len.div_ceil(4)
}
112
/// Express `d` in whole milliseconds as a `u64`.
///
/// Durations whose millisecond count does not fit in a `u64` saturate to
/// `u64::MAX` instead of failing.
fn timeout_ms(d: Duration) -> u64 {
    match u64::try_from(d.as_millis()) {
        Ok(millis) => millis,
        Err(_) => u64::MAX,
    }
}
117
118/// Call the provider and parse the JSON result, retrying once on parse failure.
119///
120/// Returns `(value, approx_tokens, attempt_number)` on success.
121///
122/// # Errors
123///
124/// Returns [`ChatJsonError`] if both attempts fail, the provider errors, or timeout is hit.
125pub async fn chat_json<T: DeserializeOwned>(
126    provider: &AnyProvider,
127    system: &str,
128    user: &str,
129    per_call_timeout: Duration,
130) -> Result<(T, u64, u32), ChatJsonError> {
131    let msgs = build_messages(system, user);
132
133    // Attempt 1
134    let first = tokio::time::timeout(per_call_timeout, provider.chat(&msgs)).await;
135    match first {
136        Ok(Ok(raw)) => {
137            if let Ok(v) = parse_json::<T>(&raw) {
138                return Ok((v, approx_tokens(&raw), 1));
139            }
140            // Attempt 2: corrective nudge
141            let retry_user = format!(
142                "{user}\n\nPrevious output was not valid JSON. \
143                 Re-output strict JSON only, no prose, no fences."
144            );
145            let retry_msgs = build_messages(system, &retry_user);
146            let second = tokio::time::timeout(per_call_timeout, provider.chat(&retry_msgs)).await;
147            match second {
148                Ok(Ok(raw2)) => parse_json::<T>(&raw2)
149                    .map(|v| (v, approx_tokens(&raw2), 2))
150                    .map_err(|_| {
151                        let truncated = if raw2.len() > 4096 {
152                            let end = raw2.floor_char_boundary(4096);
153                            format!("{}…", &raw2[..end])
154                        } else {
155                            raw2.clone()
156                        };
157                        ChatJsonError::Parse(truncated)
158                    }),
159                Ok(Err(e)) => Err(ChatJsonError::Llm(e)),
160                Err(_) => Err(ChatJsonError::Timeout(timeout_ms(per_call_timeout))),
161            }
162        }
163        Ok(Err(e)) => Err(ChatJsonError::Llm(e)),
164        Err(_) => Err(ChatJsonError::Timeout(timeout_ms(per_call_timeout))),
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` into a generic JSON value, panicking if parsing fails.
    fn parsed(raw: &str) -> serde_json::Value {
        parse_json(raw).expect("input should contain parseable JSON")
    }

    #[test]
    fn strips_json_markdown_fences() {
        let value = parsed("```json\n{\"a\":1}\n```");
        assert_eq!(value["a"], 1);
    }

    #[test]
    fn strips_plain_fences() {
        let value = parsed("```\n{\"a\":2}\n```");
        assert_eq!(value["a"], 2);
    }

    #[test]
    fn finds_brace_span_in_prose() {
        let value = parsed("Here is the JSON: {\"x\":42} as requested.");
        assert_eq!(value["x"], 42);
    }

    #[test]
    fn returns_error_on_no_brace() {
        let outcome: Result<serde_json::Value, ParseError> = parse_json("no json here");
        assert!(matches!(outcome, Err(ParseError::NoBraceSpan)));
    }

    #[test]
    fn handles_nested_braces() {
        let value = parsed(r#"{"outer":{"inner":1}}"#);
        assert_eq!(value["outer"]["inner"], 1);
    }
}