use crate::de::{DuplicateKeyPolicy, ParserConfig, from_str_with_config};
use crate::error::Error;
use crate::value::Value;
/// Outcome of a lenient parse: the best value that could be produced plus
/// every error collected along the way.
///
/// Marked `#[non_exhaustive]`, so code outside this crate cannot build it with
/// a struct literal or match it exhaustively.
#[derive(Debug)]
#[non_exhaustive]
pub struct ParseResult {
    /// The parsed or recovered value.
    ///
    /// `parse_lenient_with` stores `Value::Null` when the input contains no
    /// documents, the single document's value when there is exactly one, and a
    /// `Value::Sequence` with one entry per document otherwise.
    pub value: Value,
    /// Errors gathered across all parsing and recovery attempts, in the order
    /// they were encountered.
    pub errors: Vec<Error>,
    /// Set by `parse_lenient_with` to `true` exactly when `errors` is empty.
    pub is_complete: bool,
}
/// Knobs controlling how hard the lenient parser tries to recover from errors.
#[derive(Debug, Clone)]
pub struct LenientConfig {
    /// Error budget shared by all documents of one input (default: 100).
    ///
    /// A recovery pass is only attempted while fewer errors than this have
    /// been collected for the current document; once the shared budget reaches
    /// zero, remaining documents are reported as `Value::Null` without being
    /// parsed.
    pub max_errors: usize,
    /// When `true` (the default) and the base policy is not already
    /// `DuplicateKeyPolicy::Last`, a failed strict parse is retried with the
    /// duplicate-key policy switched to `Last`.
    pub recover_duplicate_keys: bool,
    /// When `true` (the default), a final pass re-parses progressively shorter
    /// prefixes of the document, cut at line boundaries.
    pub line_truncation: bool,
    /// Parser configuration used for the strict first pass and as the starting
    /// point for the recovery passes. Its `max_documents` is forwarded to the
    /// document splitter.
    pub base_config: ParserConfig,
    /// Upper bound on the work done by the line-truncation pass
    /// (default: `1024 * 1024`). Each attempted prefix is charged its length
    /// in bytes against this budget.
    pub truncation_event_budget: usize,
}
impl Default for LenientConfig {
    /// Builds the default configuration: both recovery passes enabled, an
    /// error budget of 100, a truncation budget of one mebibyte, and the
    /// parser's own default `ParserConfig` as the base.
    fn default() -> Self {
        const DEFAULT_MAX_ERRORS: usize = 100;
        const DEFAULT_TRUNCATION_BUDGET: usize = 1024 * 1024;

        Self {
            base_config: ParserConfig::default(),
            max_errors: DEFAULT_MAX_ERRORS,
            truncation_event_budget: DEFAULT_TRUNCATION_BUDGET,
            recover_duplicate_keys: true,
            line_truncation: true,
        }
    }
}
/// Leniently parses `input` with [`LenientConfig::default`].
///
/// Convenience wrapper around [`parse_lenient_with`]; it never fails, and any
/// problems are reported through [`ParseResult::errors`].
#[must_use]
pub fn parse_lenient(input: &str) -> ParseResult {
    let defaults = LenientConfig::default();
    parse_lenient_with(input, &defaults)
}
/// Leniently parses `input` under `config`, collecting errors instead of
/// returning them.
///
/// The prefix reported by `doc_boundary::strip_bom` is skipped, the rest is
/// split into documents, and each document is recovered independently:
///
/// * no documents: `Value::Null`, no errors, `is_complete == true`;
/// * one document: that document's recovered value;
/// * several documents: a `Value::Sequence` with one entry per document, so
///   indices always line up with the input.
///
/// `config.max_errors` is a budget shared by all documents. Once it reaches
/// zero, the remaining documents are not parsed and are reported as
/// `Value::Null`.
///
/// `is_complete` is `true` exactly when no error was collected.
///
/// NOTE(review): with `max_errors == 0` no parsing happens at all, so the
/// result is `Null` (or a sequence of `Null`s) with `is_complete == true`,
/// even for valid input. Confirm whether that is intended.
#[must_use]
pub fn parse_lenient_with(input: &str, config: &LenientConfig) -> ParseResult {
    let body = &input[crate::doc_boundary::strip_bom(input.as_bytes())..];
    let docs = split_documents(body, &config.base_config);

    match docs.as_slice() {
        [] => ParseResult {
            value: Value::Null,
            errors: Vec::new(),
            is_complete: true,
        },
        [only] => {
            let (value, errors) = recover_one(only, config, config.max_errors);
            ParseResult {
                is_complete: errors.is_empty(),
                value,
                errors,
            }
        }
        many => {
            let mut remaining = config.max_errors;
            let mut exhausted = false;
            let mut errors: Vec<Error> = Vec::new();

            let values: Vec<Value> = many
                .iter()
                .map(|doc| {
                    // Keep a placeholder so later documents retain their index.
                    if exhausted {
                        return Value::Null;
                    }
                    let (value, doc_errors) = recover_one(doc, config, remaining);
                    remaining = remaining.saturating_sub(doc_errors.len());
                    errors.extend(doc_errors);
                    exhausted = remaining == 0;
                    value
                })
                .collect();

            ParseResult {
                is_complete: errors.is_empty(),
                value: Value::Sequence(values),
                errors,
            }
        }
    }
}
/// Recovers a single document, returning the best value obtained together with
/// the errors collected while getting there.
///
/// Up to three passes run, each only if the previous one failed and fewer than
/// `budget` errors have been collected so far:
///
/// 1. strict parse with `config.base_config`;
/// 2. if `recover_duplicate_keys` is set and the base policy is not already
///    `DuplicateKeyPolicy::Last`, a retry with the policy forced to `Last`;
/// 3. if `line_truncation` is set, a retry on progressively shorter
///    line-aligned prefixes, using the pass-2 config when one was built.
///
/// A `budget` of zero returns `(Value::Null, [])` without parsing. When every
/// pass fails the value is `Value::Null`.
///
/// NOTE(review): with `budget == 1` the strict error already fills the budget,
/// so neither recovery pass runs. Confirm that is intended.
fn recover_one(input: &str, config: &LenientConfig, budget: usize) -> (Value, Vec<Error>) {
    if budget == 0 {
        return (Value::Null, Vec::new());
    }

    // Pass 1: strict parse. Its error seeds the list on failure.
    let mut errors = match from_str_with_config::<Value>(input, &config.base_config) {
        Ok(value) => return (value, Vec::new()),
        Err(strict_err) => vec![strict_err],
    };

    // Pass 2: same input, but let the last duplicate key win.
    let wants_dedup_retry = config.recover_duplicate_keys
        && config.base_config.duplicate_key_policy != DuplicateKeyPolicy::Last
        && errors.len() < budget;
    let dedup_cfg: Option<ParserConfig> = if wants_dedup_retry {
        let mut relaxed = config.base_config.clone();
        relaxed.duplicate_key_policy = DuplicateKeyPolicy::Last;
        Some(relaxed)
    } else {
        None
    };
    if let Some(relaxed) = &dedup_cfg {
        match from_str_with_config::<Value>(input, relaxed) {
            Ok(value) => return (value, errors),
            Err(e) => errors.push(e),
        }
    }

    // Pass 3: drop trailing lines until something parses.
    if !config.line_truncation || errors.len() >= budget {
        return (Value::Null, errors);
    }
    let pass3_cfg = dedup_cfg.as_ref().unwrap_or(&config.base_config);
    match try_line_truncation(input, pass3_cfg, config.truncation_event_budget) {
        TruncationOutcome::Recovered(value) => (value, errors),
        TruncationOutcome::Exhausted(last_err) => {
            // The budget check above still holds here, so any error fits.
            errors.extend(last_err);
            (Value::Null, errors)
        }
    }
}
/// Result of the line-truncation recovery pass.
enum TruncationOutcome {
    /// Some prefix of the input parsed successfully; carries its value.
    Recovered(Value),
    /// No attempted prefix parsed. Carries the error from the most recently
    /// attempted prefix, or `None` when no prefix was attempted at all.
    Exhausted(Option<Error>),
}
/// Tries to salvage a document by parsing ever-shorter prefixes of `input`,
/// each ending at a line boundary, longest first.
///
/// * `input` – the document text that failed to parse as a whole.
/// * `config` – parser configuration used for every attempt.
/// * `event_budget` – total work allowance. Each attempted prefix is charged
///   its length in bytes, and the pass stops at the first prefix that no
///   longer fits in what is left.
///
/// Returns [`TruncationOutcome::Recovered`] with the first prefix that parses,
/// or [`TruncationOutcome::Exhausted`] with the error of the last attempted
/// prefix (`None` if nothing was attempted). Prefixes that are empty or
/// whitespace-only are skipped and cost nothing.
fn try_line_truncation(
    input: &str,
    config: &ParserConfig,
    event_budget: usize,
) -> TruncationOutcome {
    // A boundary is the offset just past a newline, so every prefix keeps its
    // own line terminator (including the `\r` of a CRLF pair). Recording the
    // newline's own index would make the "already ends at input.len()" check
    // below unreachable and parse newline-terminated input twice.
    let mut boundaries: Vec<usize> = input.match_indices('\n').map(|(i, _)| i + 1).collect();
    if boundaries.last().copied() != Some(input.len()) {
        // No trailing newline: the unterminated last line is a candidate too.
        boundaries.push(input.len());
    }

    let mut budget_remaining = event_budget;
    let mut last_err: Option<Error> = None;
    for &cut in boundaries.iter().rev() {
        // `cut` is 0, `input.len()`, or directly after an ASCII `\n`, so it
        // always falls on a char boundary.
        let candidate = &input[..cut];
        if candidate.trim().is_empty() {
            continue;
        }
        let cost = candidate.len();
        if cost > budget_remaining {
            break;
        }
        budget_remaining -= cost;
        match from_str_with_config::<Value>(candidate, config) {
            Ok(v) => return TruncationOutcome::Recovered(v),
            Err(e) => last_err = Some(e),
        }
    }
    TruncationOutcome::Exhausted(last_err)
}
/// Splits `input` into document slices by delegating to
/// `doc_boundary::split_documents`, forwarding `config.max_documents`
/// (presumably the cap on how many documents are produced; confirm against
/// `doc_boundary`).
fn split_documents<'a>(input: &'a str, config: &ParserConfig) -> Vec<&'a str> {
    let limit = config.max_documents;
    crate::doc_boundary::split_documents(input, limit)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-formed input parses to a mapping with no errors.
    #[test]
    fn valid_input_is_complete() {
        let r = parse_lenient("a: 1\nb: 2\n");
        assert!(r.is_complete);
        assert!(r.errors.is_empty());
        let m = r.value.as_mapping().unwrap();
        assert!(m.contains_key("a"));
        assert!(m.contains_key("b"));
    }

    /// Empty input yields `Null` and counts as complete.
    #[test]
    fn empty_input_is_complete() {
        let r = parse_lenient("");
        assert!(r.is_complete);
        assert!(r.errors.is_empty());
        assert!(matches!(r.value, Value::Null));
    }

    /// Under a strict duplicate-key policy the second pass recovers the
    /// mapping (last value wins) and keeps the strict error.
    #[test]
    fn duplicate_key_is_recovered() {
        let cfg = LenientConfig {
            base_config: ParserConfig::default().duplicate_key_policy(DuplicateKeyPolicy::Error),
            ..LenientConfig::default()
        };
        let r = parse_lenient_with("a: 1\na: 2\n", &cfg);
        assert!(!r.is_complete);
        assert_eq!(r.errors.len(), 1);
        let m = r.value.as_mapping().unwrap();
        let v = m.get("a").unwrap();
        assert_eq!(v.as_i64(), Some(2));
    }

    /// Input that no pass can salvage is reported as incomplete with errors.
    #[test]
    fn unrecoverable_input_yields_null_with_errors() {
        let r = parse_lenient("[\n");
        assert!(!r.is_complete);
        assert!(!r.errors.is_empty());
    }

    /// A broken trailing line is reported as an error; when a mapping is
    /// recovered it must still contain the leading key.
    #[test]
    fn line_truncation_recovers_trailing_garbage() {
        let r = parse_lenient("a: 1\nb: 2\nc: [unclosed\n");
        assert!(!r.is_complete);
        assert!(!r.errors.is_empty());
        if let Value::Mapping(m) = &r.value {
            assert!(m.contains_key("a"));
        }
    }

    /// A broken middle document does not prevent its neighbours from parsing.
    #[test]
    fn multi_doc_recovers_each_independently() {
        let yaml = "---\na: 1\n---\nb: [unclosed\n---\nc: 3\n";
        let r = parse_lenient(yaml);
        assert!(!r.is_complete);
        let seq = match &r.value {
            Value::Sequence(s) => s,
            _ => panic!("expected sequence for multi-doc input"),
        };
        assert_eq!(seq.len(), 3);
        assert!(matches!(&seq[0], Value::Mapping(_)));
        assert!(matches!(&seq[2], Value::Mapping(_)));
    }

    /// `max_errors` bounds the number of collected errors across documents.
    #[test]
    fn max_errors_caps_collection() {
        let cfg = LenientConfig {
            max_errors: 1,
            ..LenientConfig::default()
        };
        let yaml = "---\na: [bad\n---\nb: [bad\n---\nc: [bad\n";
        let r = parse_lenient_with(yaml, &cfg);
        assert!(r.errors.len() <= 1);
    }

    /// Input without markers is a single document.
    #[test]
    fn split_documents_handles_single() {
        let d = split_documents("a: 1\n", &ParserConfig::default());
        assert_eq!(d.len(), 1);
    }

    /// Empty and whitespace-only input contain no documents.
    #[test]
    fn split_documents_handles_empty() {
        let cfg = ParserConfig::default();
        assert!(split_documents("", &cfg).is_empty());
        assert!(split_documents(" \n", &cfg).is_empty());
    }

    /// With both recovery passes off, only the strict error is reported.
    #[test]
    fn recover_disabled_passes_just_collect_errors() {
        let cfg = LenientConfig {
            recover_duplicate_keys: false,
            line_truncation: false,
            ..LenientConfig::default()
        };
        let r = parse_lenient_with("[unclosed", &cfg);
        assert!(!r.is_complete);
        assert_eq!(r.errors.len(), 1);
        assert!(matches!(r.value, Value::Null));
    }

    /// Disabling line truncation leaves a broken document as `Null`.
    #[test]
    fn line_truncation_disabled_skips_third_pass() {
        let cfg = LenientConfig {
            line_truncation: false,
            ..LenientConfig::default()
        };
        let r = parse_lenient_with("a: 1\nb: [bad\n", &cfg);
        assert!(!r.is_complete);
        assert!(matches!(r.value, Value::Null));
    }

    /// `LenientConfig` supports `Debug` formatting and `Clone`.
    #[test]
    fn config_is_debug_and_clone() {
        let cfg = LenientConfig::default();
        let _printed = format!("{cfg:?}");
        let cloned = cfg.clone();
        assert_eq!(cloned.max_errors, cfg.max_errors);
    }

    /// `ParseResult` supports `Debug` formatting.
    #[test]
    fn parse_result_is_debug() {
        let r = parse_lenient("a: 1\n");
        let _printed = format!("{r:?}");
    }

    /// Content before the first `---` marker counts as its own document.
    #[test]
    fn split_documents_handles_implicit_first_doc() {
        let d = split_documents("name: pre\n---\nname: post\n", &ParserConfig::default());
        assert_eq!(d.len(), 2);
    }

    /// `---` in the middle of a line is not a document boundary.
    #[test]
    fn split_documents_ignores_mid_line_dashes() {
        let d = split_documents("a: ---\nb: 2\n", &ParserConfig::default());
        assert_eq!(d.len(), 1);
    }

    /// CRLF line endings parse without triggering recovery.
    #[test]
    fn crlf_input_recovers_cleanly() {
        let r = parse_lenient("a: 1\r\nb: 2\r\n");
        assert!(r.is_complete);
        if let Value::Mapping(m) = &r.value {
            assert!(m.contains_key("a"));
            assert!(m.contains_key("b"));
        } else {
            panic!("expected mapping for CRLF input, got {:?}", r.value);
        }
    }

    /// A leading byte-order mark is skipped before parsing.
    #[test]
    fn bom_prefix_is_stripped() {
        let r = parse_lenient("\u{FEFF}a: 1\nb: 2\n");
        assert!(r.is_complete);
        if let Value::Mapping(m) = &r.value {
            assert!(m.contains_key("a"));
        } else {
            panic!("BOM-prefixed input should parse cleanly");
        }
    }

    /// Thousands of bare markers must not produce an unbounded sequence.
    /// The bound is only checked when the result is a sequence; any other
    /// shape is accepted.
    #[test]
    fn marker_spam_is_bounded() {
        let yaml = "---\n".repeat(10_000);
        let r = parse_lenient(&yaml);
        if let Value::Sequence(s) = &r.value {
            assert!(s.len() <= 1000);
        }
    }

    /// Truncation also works when the broken last line has no newline.
    #[test]
    fn truncation_handles_no_trailing_newline() {
        let r = parse_lenient("a: 1\nb: [bad");
        if let Value::Mapping(m) = &r.value {
            assert_eq!(m.get("a").and_then(|v| v.as_i64()), Some(1));
        }
    }

    /// Documents skipped after the error budget runs out still occupy a slot,
    /// so sequence indices match document positions.
    #[test]
    fn budget_exhaustion_preserves_indices() {
        let cfg = LenientConfig {
            max_errors: 1,
            ..LenientConfig::default()
        };
        let yaml = "---\na: [bad\n---\nb: [bad\n---\nc: [bad\n";
        let r = parse_lenient_with(yaml, &cfg);
        if let Value::Sequence(s) = &r.value {
            assert_eq!(s.len(), 3);
        } else {
            panic!("expected sequence with all 3 indices preserved");
        }
    }

    /// Smoke test: a tiny truncation budget on a large broken input must
    /// return without panicking.
    #[test]
    fn truncation_budget_caps_retries() {
        let cfg = LenientConfig {
            truncation_event_budget: 64,
            ..LenientConfig::default()
        };
        let mut yaml = String::from("a: 1\n");
        for _ in 0..10_000 {
            yaml.push_str("[bad\n");
        }
        let _r = parse_lenient_with(&yaml, &cfg);
    }
}