use serde_json::Value as JsonValue;
use super::errors::{
CanonicalizeError, CanonicalizeResult, MAX_DEPTH, MAX_KEYS_PER_MAPPING, MAX_SAFE_INTEGER,
MAX_STRING_LENGTH, MAX_TOTAL_SIZE, MIN_SAFE_INTEGER,
};
/// Parses `content` as YAML under the strict rules of this module and returns
/// the equivalent JSON value.
///
/// The steps run in this order:
/// 1. reject input whose byte length exceeds `MAX_TOTAL_SIZE`;
/// 2. run the line-based `pre_scan_yaml` checks;
/// 3. deserialize with `serde_yaml`;
/// 4. convert the YAML tree to JSON with `yaml_to_json`, starting at depth 0.
///
/// # Errors
///
/// Returns `CanonicalizeError::InputTooLarge` for oversized input,
/// `CanonicalizeError::ParseError` (carrying the `serde_yaml` message) when
/// deserialization fails, and otherwise whatever error the pre-scan or the
/// conversion step reports.
pub fn parse_yaml_strict(content: &str) -> CanonicalizeResult<JsonValue> {
    let size = content.len();
    if size > MAX_TOTAL_SIZE {
        return Err(CanonicalizeError::InputTooLarge { size });
    }

    // Cheap textual rejection before handing the text to the real parser.
    pre_scan_yaml(content)?;

    let parsed = match serde_yaml::from_str::<serde_yaml::Value>(content) {
        Ok(value) => value,
        Err(err) => {
            return Err(CanonicalizeError::ParseError {
                message: err.to_string(),
            })
        }
    };

    yaml_to_json(&parsed, 0)
}
/// Line-oriented pre-scan that rejects YAML constructs this module forbids
/// before the text is handed to the real parser.
///
/// This is a textual heuristic, not a YAML parser: every non-blank,
/// non-comment line is inspected on its own after trimming.
///
/// # Errors
///
/// * `CanonicalizeError::MultiDocumentFound` — the line is `---`, starts with
///   `--- `, or is `...`.
/// * `CanonicalizeError::AnchorFound` / `AliasFound` — reported by
///   `find_anchor_or_alias_outside_quotes` (with a 1-based line number).
/// * `CanonicalizeError::TagFound` — the line contains `!!` or `!<` and
///   `is_inside_quotes` does not consider either of them quoted.
/// * `CanonicalizeError::ParseError` — the extracted key is the merge key `<<`.
/// * `CanonicalizeError::DuplicateKey` — the same key text is seen twice in the
///   same tracked indentation scope.
fn pre_scan_yaml(content: &str) -> CanonicalizeResult<()> {
    // Stack of (indent, keys seen so far in that scope). The bottom entry is the
    // scope at indent 0 and is never popped.
    let mut key_stack: Vec<(usize, std::collections::HashSet<String>)> =
        vec![(0, std::collections::HashSet::new())];
    for (line_num, line) in content.lines().enumerate() {
        let trimmed = line.trim();
        if trimmed.is_empty() || trimmed.starts_with('#') {
            continue;
        }
        // Width (in bytes) of the leading whitespace.
        let indent = line.len() - line.trim_start().len();
        if trimmed == "---" || trimmed.starts_with("--- ") || trimmed == "..." {
            return Err(CanonicalizeError::MultiDocumentFound);
        }
        if let Some(err) = find_anchor_or_alias_outside_quotes(trimmed, line_num + 1) {
            return Err(err);
        }
        // Prefer the first `!!`; fall back to the first `!<` when there is no `!!`.
        let tag_start = trimmed.find("!!").or_else(|| trimmed.find("!<"));
        if let Some(tag_start) = tag_start {
            if !is_inside_quotes(trimmed, "!!") && !is_inside_quotes(trimmed, "!<") {
                let tag_end = trimmed[tag_start..]
                    .find(|c: char| c.is_whitespace() || c == ':')
                    .map(|p| tag_start + p)
                    .unwrap_or_else(|| {
                        // No terminator: report at most 20 bytes of the tag. The cut
                        // must land on a char boundary, otherwise slicing would panic
                        // on multi-byte input. `tag_start` itself is a boundary, so
                        // the loop always terminates.
                        let mut end = trimmed.len().min(tag_start + 20);
                        while !trimmed.is_char_boundary(end) {
                            end -= 1;
                        }
                        end
                    });
                return Err(CanonicalizeError::TagFound {
                    tag: trimmed[tag_start..tag_end].to_string(),
                });
            }
        }
        let is_list_item = trimmed.starts_with('-');
        // For `- key: value` lines the key is looked for after the dash.
        let key_source = if is_list_item {
            trimmed
                .strip_prefix('-')
                .map(|s| s.trim_start())
                .unwrap_or("")
        } else {
            trimmed
        };
        if is_list_item {
            // A list item closes every scope at or below its indent and opens a
            // fresh scope one column deeper than the dash.
            while key_stack.len() > 1
                && key_stack.last().map(|(i, _)| *i >= indent).unwrap_or(false)
            {
                key_stack.pop();
            }
            key_stack.push((indent + 1, std::collections::HashSet::new()));
        }
        if let Some(key) = extract_yaml_key(key_source) {
            if key == "<<" {
                return Err(CanonicalizeError::ParseError {
                    message: "reason=merge_key_not_allowed: merge keys (<<) not allowed"
                        .to_string(),
                });
            }
            if !is_list_item {
                // Leave scopes that are deeper than this line ...
                while key_stack.len() > 1
                    && key_stack.last().map(|(i, _)| *i > indent).unwrap_or(false)
                {
                    key_stack.pop();
                }
                // ... and open a new one when this line is deeper than the current top.
                if key_stack.last().map(|(i, _)| *i < indent).unwrap_or(true) {
                    key_stack.push((indent, std::collections::HashSet::new()));
                }
            }
            if let Some((_, keys)) = key_stack.last_mut() {
                // `insert` returns false when the key was already present.
                if !keys.insert(key.clone()) {
                    return Err(CanonicalizeError::DuplicateKey { key });
                }
            }
        }
    }
    Ok(())
}
/// Scans one line for a YAML anchor (`&name`) or alias (`*name`) that is not
/// inside a single- or double-quoted span.
///
/// A `&` or `*` counts only when the character right after it is alphanumeric
/// or `_`. Quote state is tracked per line; inside a double-quoted span a
/// backslash escapes the following character, so `\"` does not end the span.
///
/// Returns `Some(CanonicalizeError::AnchorFound)` or
/// `Some(CanonicalizeError::AliasFound)` with `position` set to
/// `"line {line_num}"` for the first hit, or `None` when nothing is found.
fn find_anchor_or_alias_outside_quotes(line: &str, line_num: usize) -> Option<CanonicalizeError> {
    let mut in_single = false;
    let mut in_double = false;
    let mut chars = line.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            // Skip the escaped character so an escaped quote cannot flip the
            // quote state (which would both hide real anchors after the scalar
            // and flag `&`/`*` that are still inside it).
            '\\' if in_double => {
                chars.next();
            }
            '\'' if !in_double => in_single = !in_single,
            '"' if !in_single => in_double = !in_double,
            '&' | '*' if !in_single && !in_double => {
                if let Some(&next) = chars.peek() {
                    if next.is_alphanumeric() || next == '_' {
                        let position = format!("line {}", line_num);
                        return Some(match ch {
                            '&' => CanonicalizeError::AnchorFound { position },
                            _ => CanonicalizeError::AliasFound { position },
                        });
                    }
                }
            }
            _ => {}
        }
    }
    None
}
/// Returns the byte offset of the first `"` in `s` that is not preceded by an
/// escaping backslash, or `None` when there is no such quote.
///
/// A backslash always consumes the character that follows it, whatever that
/// character is.
fn find_closing_double_quote(s: &str) -> Option<usize> {
    let mut skip_next = false;
    for (offset, ch) in s.char_indices() {
        if skip_next {
            // This character was escaped by the preceding backslash.
            skip_next = false;
            continue;
        }
        if ch == '\\' {
            skip_next = true;
        } else if ch == '"' {
            return Some(offset);
        }
    }
    None
}
/// Heuristically reports whether the first occurrence of `pattern` in `line`
/// sits inside a quoted span.
///
/// Only the text before that first occurrence is examined: the occurrence is
/// treated as quoted when the number of `"` (not counting `\"`) or the number
/// of `'` (not counting `\'`) in that prefix is odd. Returns `false` when
/// `pattern` does not occur at all.
fn is_inside_quotes(line: &str, pattern: &str) -> bool {
    let prefix = match line.find(pattern) {
        Some(idx) => &line[..idx],
        None => return false,
    };
    // Every escaped quote also contains the bare quote character, so the
    // subtraction cannot underflow.
    let has_odd_unescaped = |quote: char, escaped: &str| {
        let unescaped = prefix.matches(quote).count() - prefix.matches(escaped).count();
        unescaped % 2 != 0
    };
    has_odd_unescaped('"', "\\\"") || has_odd_unescaped('\'', "\\'")
}
/// Heuristically extracts the mapping key from a single YAML line.
///
/// Returns `None` when the trimmed line starts with `-`, is one of the bare
/// block-scalar indicators `|`, `>`, `|-`, `>-`, or has no usable key.
///
/// * Double-quoted key: the text between the opening quote and the closing
///   quote found by `find_closing_double_quote`, returned verbatim (escape
///   sequences are not decoded), provided a `:` follows after optional
///   whitespace.
/// * Single-quoted key: the text up to the next `'`, under the same `:` rule.
/// * Plain key: the trimmed text before the first `:` that is not nested
///   inside `[...]` / `{...}`, accepted only when it is non-empty and contains
///   no space character.
fn extract_yaml_key(line: &str) -> Option<String> {
    let text = line.trim();
    if text.starts_with('-') || matches!(text, "|" | ">" | "|-" | ">-") {
        return None;
    }

    // Shared tail for both quoted forms: `body` is the text after the opening
    // quote, `close` is the byte offset of the closing quote within it.
    fn quoted_key(body: &str, close: usize) -> Option<String> {
        let (key, tail) = body.split_at(close);
        // `tail` still starts with the one-byte closing quote; skip it.
        if tail[1..].trim_start().starts_with(':') {
            Some(key.to_string())
        } else {
            None
        }
    }

    if let Some(body) = text.strip_prefix('"') {
        let close = find_closing_double_quote(body)?;
        return quoted_key(body, close);
    }
    if let Some(body) = text.strip_prefix('\'') {
        let close = body.find('\'')?;
        return quoted_key(body, close);
    }

    // Plain key: locate the first colon outside any flow brackets.
    let mut nesting: usize = 0;
    let colon_at = text.char_indices().find_map(|(idx, ch)| {
        match ch {
            '[' | '{' => nesting += 1,
            ']' | '}' => nesting = nesting.saturating_sub(1),
            ':' if nesting == 0 => return Some(idx),
            _ => {}
        }
        None
    })?;

    let candidate = text[..colon_at].trim();
    if candidate.is_empty() || candidate.contains(' ') {
        None
    } else {
        Some(candidate.to_string())
    }
}
fn yaml_to_json(yaml: &serde_yaml::Value, depth: usize) -> CanonicalizeResult<JsonValue> {
if depth > MAX_DEPTH {
return Err(CanonicalizeError::MaxDepthExceeded { depth });
}
match yaml {
serde_yaml::Value::Null => Ok(JsonValue::Null),
serde_yaml::Value::Bool(b) => Ok(JsonValue::Bool(*b)),
serde_yaml::Value::Number(n) => {
if n.is_f64() {
return Err(CanonicalizeError::FloatNotAllowed {
value: n.to_string(),
});
}
if let Some(i) = n.as_i64() {
if !(MIN_SAFE_INTEGER..=MAX_SAFE_INTEGER).contains(&i) {
return Err(CanonicalizeError::IntegerOutOfRange { value: i });
}
Ok(JsonValue::Number(serde_json::Number::from(i)))
} else if let Some(u) = n.as_u64() {
if u > MAX_SAFE_INTEGER as u64 {
return Err(CanonicalizeError::IntegerOutOfRange { value: u as i64 });
}
Ok(JsonValue::Number(serde_json::Number::from(u)))
} else {
Err(CanonicalizeError::FloatNotAllowed {
value: n.to_string(),
})
}
}
serde_yaml::Value::String(s) => {
if s.len() > MAX_STRING_LENGTH {
return Err(CanonicalizeError::StringTooLong { length: s.len() });
}
Ok(JsonValue::String(s.clone()))
}
serde_yaml::Value::Sequence(seq) => {
let items: CanonicalizeResult<Vec<JsonValue>> = seq
.iter()
.map(|item| yaml_to_json(item, depth + 1))
.collect();
Ok(JsonValue::Array(items?))
}
serde_yaml::Value::Mapping(map) => {
if map.len() > MAX_KEYS_PER_MAPPING {
return Err(CanonicalizeError::MaxKeysExceeded { count: map.len() });
}
let mut json_map = serde_json::Map::new();
let mut seen_keys = std::collections::HashSet::new();
for (key, value) in map {
let key_str = match key {
serde_yaml::Value::String(s) => s.clone(),
_ => {
return Err(CanonicalizeError::ParseError {
message: format!("non-string key: {:?}", key),
})
}
};
if key_str == "<<" {
return Err(CanonicalizeError::ParseError {
message: "reason=merge_key_not_allowed: merge keys (<<) not allowed"
.to_string(),
});
}
if !seen_keys.insert(key_str.clone()) {
return Err(CanonicalizeError::DuplicateKey { key: key_str });
}
let json_value = yaml_to_json(value, depth + 1)?;
json_map.insert(key_str, json_value);
}
Ok(JsonValue::Object(json_map))
}
serde_yaml::Value::Tagged(tagged) => Err(CanonicalizeError::TagFound {
tag: format!("{:?}", tagged.tag),
}),
}
}