llm_json_repair/lib.rs
1//! # llm-json-repair
2//!
3//! Clean and parse JSON emitted by LLMs.
4//!
5//! Models still emit invalid JSON. They wrap it in markdown code fences,
6//! leave trailing commas, write prose around it, or use smart quotes. This
7//! crate does three local repair passes before you reach for a retry:
8//!
9//! 1. Strip markdown code fences (triple-backtick or triple-tilde).
10//! 2. Extract the first balanced `{...}` or `[...]` from surrounding prose.
11//! 3. Remove trailing commas before `}` or `]`.
12//!
13//! ```
14//! let raw = r#"Sure, here you go:
15//! ```json
16//! { "answer": "Paris", "confidence": 0.95, }
17//! ```"#;
18//! let cleaned = llm_json_repair::repair(raw);
19//! assert_eq!(cleaned, r#"{ "answer": "Paris", "confidence": 0.95 }"#);
20//! ```
21//!
22//! With the default `serde` feature you can parse straight to a typed value:
23//!
24//! ```
25//! # #[cfg(feature = "serde")]
26//! # {
27//! use serde::Deserialize;
28//!
29//! #[derive(Deserialize)]
30//! struct Answer { value: String }
31//!
32//! let raw = "```\n{\"value\": \"42\",}\n```";
33//! let parsed: Answer = llm_json_repair::parse(raw).unwrap();
34//! assert_eq!(parsed.value, "42");
35//! # }
36//! ```
37
38#![deny(missing_docs)]
39
40mod balanced;
41mod commas;
42mod fences;
43
44pub use balanced::extract_balanced;
45pub use commas::strip_trailing_commas;
46pub use fences::strip_fences;
47
48use thiserror::Error;
49
50/// Errors returned by [`parse`].
51#[derive(Debug, Error)]
52pub enum RepairError {
53 /// The input could not be parsed even after repair.
54 #[cfg(feature = "serde")]
55 #[error("could not parse JSON after repair: {source}")]
56 Parse {
57 /// Underlying serde_json error from the final parse attempt.
58 #[source]
59 source: serde_json::Error,
60 /// The repaired text we tried to parse.
61 repaired: String,
62 },
63
64 /// No balanced `{…}` or `[…]` found in the input.
65 #[error("no JSON object or array found in input")]
66 NoJsonFound,
67}
68
69/// Apply all three repair passes and return the cleaned string.
70///
71/// This always returns *something* — even if the input contains no JSON, you
72/// get the trimmed, fence-stripped text back. Use [`parse`] when you want
73/// validation against a real JSON parser.
74pub fn repair(input: &str) -> String {
75 let unfenced = strip_fences(input);
76 let extracted = extract_balanced(&unfenced)
77 .unwrap_or(unfenced.trim())
78 .to_string();
79 strip_trailing_commas(&extracted)
80}
81
/// Repair `input` and deserialize the result into `T`.
///
/// Runs the same three passes as [`repair`], then hands the cleaned text to
/// `serde_json::from_str`. Unlike [`repair`], this stops early when the input
/// contains no JSON at all instead of falling back to the trimmed text.
///
/// # Errors
///
/// * [`RepairError::NoJsonFound`] when [`extract_balanced`] finds no balanced
///   `{...}` or `[...]` in the fence-stripped input.
/// * [`RepairError::Parse`] when the repaired text is still rejected by
///   `serde_json`; the error carries the repaired text for diagnostics.
#[cfg(feature = "serde")]
pub fn parse<T>(input: &str) -> Result<T, RepairError>
where
    T: serde::de::DeserializeOwned,
{
    let unfenced = strip_fences(input);
    // The extracted span borrows from `unfenced`; pass it on without copying.
    let extracted = extract_balanced(&unfenced).ok_or(RepairError::NoJsonFound)?;
    let cleaned = strip_trailing_commas(extracted);
    // `T` is owned, so the borrow of `cleaned` ends when `from_str` returns
    // and the string can be moved into the error on failure.
    serde_json::from_str::<T>(&cleaned).map_err(|source| RepairError::Parse {
        source,
        repaired: cleaned,
    })
}
102
/// Parse `input` as JSON exactly as given, skipping every repair pass.
///
/// This forwards to `serde_json::from_str` and reports failures as
/// [`RepairError::Parse`], whose `repaired` field then holds a copy of the
/// untouched input.
///
/// Handy for recording whether repair was needed at all:
///
/// ```ignore
/// let parsed = parse_strict(raw).or_else(|_| parse(raw))?;
/// ```
#[cfg(feature = "serde")]
pub fn parse_strict<T>(input: &str) -> Result<T, RepairError>
where
    T: serde::de::DeserializeOwned,
{
    match serde_json::from_str::<T>(input) {
        Ok(value) => Ok(value),
        // The input is copied only on the failure path.
        Err(source) => Err(RepairError::Parse {
            source,
            repaired: input.to_string(),
        }),
    }
}