rulemorph 0.3.0

YAML-based declarative data transformation engine for CSV/JSON to JSON
Documentation
use serde_json::{Map, Number as JsonNumber, Value as JsonValue};
use serde_yaml::Value as YamlValue;

use crate::error::{TransformError, TransformErrorKind};
use crate::model::RuleFile;
use crate::serde_guard::parse_yaml_value_strict_with_limits;

use super::{NormalizationOptions, enforce_json_limits, select_records_from_document};

/// Parses `input` as YAML and returns the records selected from it as JSON values.
///
/// The steps run in this order, each aborting on the first failure:
/// 1. the textual alias count is checked against `options.max_yaml_aliases`;
/// 2. the text is parsed with the depth, expanded-node, array-length and
///    text-size limits taken from `options`;
/// 3. the parsed tree is converted to JSON and passed to `enforce_json_limits`;
/// 4. records are selected via `select_records_from_document`, using the
///    optional `rule.input.yaml.records_path`.
///
/// # Errors
///
/// A parser failure is reported as `TransformErrorKind::InvalidInput` with the
/// parser's message appended. Errors from the other steps are propagated as-is.
pub fn normalize_yaml_records(
    rule: &RuleFile,
    input: &str,
    options: &NormalizationOptions,
) -> Result<Vec<JsonValue>, TransformError> {
    // Cheap textual pre-check before any parsing work is done.
    enforce_yaml_alias_limit(input, options)?;

    let parsed = parse_yaml_value_strict_with_limits(
        input,
        options.max_depth,
        options.max_yaml_expanded_nodes,
        options.max_array_len,
        options.max_text_bytes,
    );
    let yaml = match parsed {
        Ok(value) => value,
        Err(err) => {
            return Err(TransformError::new(
                TransformErrorKind::InvalidInput,
                format!("failed to parse YAML input: {}", err),
            ));
        }
    };

    // The conversion keeps its own running node total, starting from zero.
    let mut visited_nodes = 0usize;
    let document = yaml_to_json(&yaml, options, 0, &mut visited_nodes)?;
    enforce_json_limits(&document, options)?;

    let records_path = rule
        .input
        .yaml
        .as_ref()
        .and_then(|settings| settings.records_path.as_deref());
    let records = select_records_from_document(
        &document,
        records_path,
        "input.yaml.records_path",
        options,
    )?;
    Ok(records)
}

/// Rejects `input` when its textual alias count exceeds `options.max_yaml_aliases`.
///
/// # Errors
///
/// Returns `TransformErrorKind::InvalidInput` when the count returned by
/// `count_yaml_alias_tokens` is greater than the configured maximum.
fn enforce_yaml_alias_limit(
    input: &str,
    options: &NormalizationOptions,
) -> Result<(), TransformError> {
    if count_yaml_alias_tokens(input) <= options.max_yaml_aliases {
        Ok(())
    } else {
        Err(TransformError::new(
            TransformErrorKind::InvalidInput,
            "input exceeds max_yaml_aliases",
        ))
    }
}

/// Counts alias-looking tokens (`*name`) in raw YAML text, line by line.
///
/// Lines that belong to the content of a block scalar (`|` / `>`) are skipped,
/// because an asterisk there is literal text. Every other line is handed to
/// `count_yaml_alias_tokens_in_line`. The total saturates instead of overflowing.
///
/// The extent of a block scalar is derived from indentation:
/// * blank lines after a header always belong to the scalar;
/// * the first non-blank line after the header must be indented deeper than the
///   header line, and its indentation becomes the content indentation;
/// * the scalar ends at the first non-blank line indented less than that.
///
/// Tracking the content indentation matters for input such as
///
/// ```yaml
/// - key: |
///     text
///   other: *anchor
/// ```
///
/// where `other` is indented deeper than the header line but is not part of the
/// scalar; its alias must be counted.
///
/// This is a textual heuristic. With an explicit indentation indicator the
/// detected content indentation can be larger than the real one, in which case
/// some scalar content is scanned as ordinary YAML and the count errs on the
/// high side. Quoted scalars that span several lines are not tracked.
fn count_yaml_alias_tokens(input: &str) -> usize {
    #[derive(Clone, Copy)]
    enum BlockScalar {
        /// Not inside a block scalar.
        Outside,
        /// A header was seen; the first content line has not been seen yet.
        AwaitingContent { header_indent: usize },
        /// Inside the content, which is indented by at least `content_indent`.
        InContent { content_indent: usize },
    }

    let mut state = BlockScalar::Outside;
    let mut count = 0usize;

    for line in input.lines() {
        let indent = line.chars().take_while(|value| *value == ' ').count();
        let is_blank = line.trim().is_empty();

        let is_scalar_content = match state {
            BlockScalar::Outside => false,
            BlockScalar::AwaitingContent { header_indent } => {
                if is_blank {
                    true
                } else if indent > header_indent {
                    // The first non-blank line fixes the content indentation.
                    state = BlockScalar::InContent {
                        content_indent: indent,
                    };
                    true
                } else {
                    false
                }
            }
            BlockScalar::InContent { content_indent } => is_blank || indent >= content_indent,
        };
        if is_scalar_content {
            continue;
        }

        // An ordinary YAML line: it may open a new block scalar, and any
        // aliases on it (including on a header line) are counted.
        state = if starts_block_scalar(line) {
            BlockScalar::AwaitingContent {
                header_indent: indent,
            }
        } else {
            BlockScalar::Outside
        };
        count = count.saturating_add(count_yaml_alias_tokens_in_line(line));
    }

    count
}

/// Reports whether `line` ends with a YAML block scalar header (`|` or `>`).
///
/// A `|` / `>` counts as a header only when all of the following hold:
/// * it is outside a quoted scalar and not inside a comment;
/// * it starts a token (it is the first character or follows a space/tab);
/// * it sits where a node value may begin: at the start of the line, after
///   `key:`, after `-` / `?` entry indicators, or after `---`, optionally
///   followed by node properties (`&anchor`, `!tag`);
/// * it is followed only by chomping/indentation indicators (`+`, `-`,
///   digits) and then nothing, blanks, or a `#` comment.
///
/// A `|` / `>` that fails these checks (for example inside a plain scalar such
/// as `a > b: |` or `[x, y >`) is ignored and scanning continues, so a later
/// real header on the same line is still found.
///
/// Quotes are tracked within the line only; a quoted scalar that started on an
/// earlier line is not recognised.
fn starts_block_scalar(line: &str) -> bool {
    // Returns the index just past the quoted scalar opened at `open`, or
    // `bytes.len()` when the quote is not closed on this line.
    fn end_of_quoted(bytes: &[u8], open: usize) -> usize {
        let quote = bytes[open];
        let mut index = open + 1;
        while index < bytes.len() {
            match bytes[index] {
                // Backslash escapes exist only in double-quoted scalars.
                b'\\' if quote == b'"' => index += 2,
                found if found == quote => {
                    // `''` is an escaped quote inside a single-quoted scalar.
                    if quote == b'\'' && bytes.get(index + 1) == Some(&b'\'') {
                        index += 2;
                    } else {
                        return index + 1;
                    }
                }
                _ => index += 1,
            }
        }
        bytes.len()
    }

    // True when the text before the indicator allows a node value to start.
    fn is_value_position(prefix: &str) -> bool {
        let mut rest = prefix
            .split_whitespace()
            .rev()
            // Node properties may sit between the indicator and the scalar.
            .skip_while(|token| token.starts_with('&') || token.starts_with('!'))
            // Sequence / explicit-key indicators may be stacked: `- - |`.
            .skip_while(|token| matches!(*token, "-" | "?"));
        match rest.next() {
            None => true,
            Some("---") => rest.next().is_none(),
            Some(token) => token.ends_with(':'),
        }
    }

    // True when the text after `|` / `>` is a valid remainder of a header.
    fn is_header_tail(tail: &str) -> bool {
        let split = tail.find(char::is_whitespace).unwrap_or(tail.len());
        let (indicators, trailer) = tail.split_at(split);
        let trailer = trailer.trim_start();
        indicators
            .chars()
            .all(|value| matches!(value, '+' | '-' | '0'..='9'))
            && (trailer.is_empty() || trailer.starts_with('#'))
    }

    // Every character of interest is ASCII, so scanning bytes is safe: the
    // bytes of a multi-byte UTF-8 character never equal an ASCII byte, and
    // slicing at an ASCII byte is always on a character boundary.
    let bytes = line.as_bytes();
    let mut index = 0usize;
    while index < bytes.len() {
        let previous = index.checked_sub(1).map(|before| bytes[before]);
        let after_blank = matches!(previous, None | Some(b' ' | b'\t'));
        match bytes[index] {
            // A quote opens a quoted scalar only at the start of a token; an
            // apostrophe inside a plain scalar (`it's`) is ordinary text.
            b'\'' | b'"'
                if after_blank || matches!(previous, Some(b'[' | b'{' | b',' | b':')) =>
            {
                index = end_of_quoted(bytes, index);
                continue;
            }
            // `#` starts a comment only at line start or after a blank.
            b'#' if after_blank => return false,
            b'|' | b'>'
                if after_blank
                    && is_value_position(&line[..index])
                    && is_header_tail(&line[index + 1..]) =>
            {
                return true;
            }
            _ => {}
        }
        index += 1;
    }
    false
}

/// Counts alias-looking tokens (`*name`) on a single line of YAML text.
///
/// Asterisks inside quoted scalars and inside a trailing comment are ignored.
/// An asterisk is counted when `is_alias_token_boundary` accepts its position.
///
/// Lexical rules applied while scanning:
/// * a quote opens a quoted scalar only at the start of a token (line start,
///   after a blank, or after `[`, `{`, `,`, `:`), so an apostrophe inside a
///   plain scalar (`it's`) does not hide the rest of the line;
/// * inside single quotes `''` is an escaped quote; inside double quotes a
///   backslash escapes the next byte;
/// * `#` starts a comment only at line start or after a blank, so a `#` inside
///   a plain scalar (`a#b`) does not end the scan.
///
/// Quote state is local to the line; quoted scalars spanning several lines are
/// not tracked. The count saturates instead of overflowing.
fn count_yaml_alias_tokens_in_line(line: &str) -> usize {
    let bytes = line.as_bytes();
    let mut count = 0usize;
    let mut in_single = false;
    let mut in_double = false;
    let mut index = 0usize;

    while index < bytes.len() {
        let previous = index.checked_sub(1).map(|before| bytes[before]);
        let after_blank = matches!(previous, None | Some(b' ' | b'\t'));
        let can_open_quote =
            after_blank || matches!(previous, Some(b'[' | b'{' | b',' | b':'));

        match bytes[index] {
            b'\'' if in_single => {
                if bytes.get(index + 1) == Some(&b'\'') {
                    // Escaped quote: stay inside the scalar.
                    index += 2;
                } else {
                    in_single = false;
                    index += 1;
                }
            }
            b'"' if in_double => {
                in_double = false;
                index += 1;
            }
            b'\\' if in_double => {
                // Skip the escaped byte, without running past the end.
                index = (index + 2).min(bytes.len());
            }
            b'\'' if !in_double && can_open_quote => {
                in_single = true;
                index += 1;
            }
            b'"' if !in_single && can_open_quote => {
                in_double = true;
                index += 1;
            }
            b'#' if !in_single && !in_double && after_blank => break,
            b'*' if !in_single && !in_double => {
                if is_alias_token_boundary(bytes, index) {
                    count = count.saturating_add(1);
                }
                index += 1;
            }
            _ => index += 1,
        }
    }

    count
}

fn is_alias_token_boundary(bytes: &[u8], index: usize) -> bool {
    let previous = index
        .checked_sub(1)
        .and_then(|previous| bytes.get(previous))
        .copied();
    let next = bytes.get(index + 1).copied();
    previous.is_none_or(|value| {
        value.is_ascii_whitespace() || matches!(value, b'[' | b'{' | b',' | b':' | b'-')
    }) && next.is_some_and(is_yaml_anchor_char)
}

/// Returns `true` for the bytes accepted in an anchor name here: ASCII letters,
/// ASCII digits, `_` and `-`.
fn is_yaml_anchor_char(value: u8) -> bool {
    matches!(
        value,
        b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'
    )
}

/// Recursively converts a parsed YAML value into a JSON value while enforcing
/// the limits in `options`.
///
/// * `depth` is the nesting level of `value`; children are converted with
///   `depth + 1`.
/// * `node_count` is a running total shared across the whole conversion; it is
///   incremented (saturating) once per visited node.
///
/// # Errors
///
/// Returns `TransformErrorKind::InvalidInput` when
/// * `depth` exceeds `options.max_depth`;
/// * the running node total exceeds `options.max_yaml_expanded_nodes`;
/// * a sequence is longer than `options.max_array_len`;
/// * a mapping key is not a string;
/// * the value is tagged;
/// * a number cannot be converted by `yaml_number_to_json`.
fn yaml_to_json(
    value: &YamlValue,
    options: &NormalizationOptions,
    depth: usize,
    node_count: &mut usize,
) -> Result<JsonValue, TransformError> {
    // Every rejection in this function uses the same error kind.
    let reject =
        |message: &'static str| TransformError::new(TransformErrorKind::InvalidInput, message);

    if depth > options.max_depth {
        return Err(reject("input exceeds max_depth"));
    }

    *node_count = node_count.saturating_add(1);
    if *node_count > options.max_yaml_expanded_nodes {
        return Err(reject("input exceeds max_yaml_expanded_nodes"));
    }

    let converted = match value {
        YamlValue::Null => JsonValue::Null,
        YamlValue::Bool(flag) => JsonValue::Bool(*flag),
        YamlValue::Number(number) => yaml_number_to_json(number)?,
        YamlValue::String(text) => JsonValue::String(text.clone()),
        YamlValue::Sequence(items) => {
            // Check the length before allocating or recursing.
            if items.len() > options.max_array_len {
                return Err(reject("input exceeds max_array_len"));
            }
            let mut elements = Vec::with_capacity(items.len());
            for item in items {
                elements.push(yaml_to_json(item, options, depth + 1, node_count)?);
            }
            JsonValue::Array(elements)
        }
        YamlValue::Mapping(entries) => {
            let mut object = Map::new();
            for (key, entry) in entries {
                // JSON object keys are strings; any other key type is refused.
                let YamlValue::String(name) = key else {
                    return Err(reject("YAML mapping keys must be strings"));
                };
                object.insert(
                    name.clone(),
                    yaml_to_json(entry, options, depth + 1, node_count)?,
                );
            }
            JsonValue::Object(object)
        }
        YamlValue::Tagged(_) => return Err(reject("YAML custom tags are not supported")),
    };

    Ok(converted)
}

/// Converts a YAML number into a JSON number.
///
/// The representations are tried in order: `i64`, then `u64`, then `f64`. The
/// `f64` path succeeds only when `JsonNumber::from_f64` accepts the value.
///
/// # Errors
///
/// Returns `TransformErrorKind::InvalidInput` when none of the three
/// representations yields a JSON number.
fn yaml_number_to_json(value: &serde_yaml::Number) -> Result<JsonValue, TransformError> {
    value
        .as_i64()
        .map(JsonNumber::from)
        .or_else(|| value.as_u64().map(JsonNumber::from))
        .or_else(|| value.as_f64().and_then(JsonNumber::from_f64))
        .map(JsonValue::Number)
        .ok_or_else(|| {
            TransformError::new(
                TransformErrorKind::InvalidInput,
                "YAML number is not JSON-compatible",
            )
        })
}