use anyhow::{Result, anyhow};
use crate::PriorityConfig;
use crate::order::NodeKind;
use crate::utils::tree_arena::{JsonTreeArena, JsonTreeNode};
use crate::ingest::sampling::{ArraySamplerKind, choose_indices};
use yaml_rust2::Yaml;
pub fn build_yaml_tree_arena_from_bytes(
bytes: &[u8],
config: &PriorityConfig,
) -> Result<JsonTreeArena> {
let s = std::str::from_utf8(bytes)
.map_err(|_| anyhow!("input is not valid UTF-8 text"))?;
let docs = yaml_rust2::YamlLoader::load_from_str(s)?;
let mut b = YamlArenaBuilder::new(
config.array_max_items,
config.array_sampler.into(),
);
let root_id = if docs.len() <= 1 {
match docs.first() {
Some(doc) => b.build(doc),
None => b.build(&Yaml::Array(vec![])),
}
} else {
let mut children: Vec<usize> = Vec::with_capacity(docs.len());
for d in &docs {
children.push(b.build(d));
}
b.push_array(&children, docs.len(), (0..docs.len()).collect())
};
let mut arena = b.finish();
arena.root_id = root_id;
Ok(arena)
}
struct YamlArenaBuilder {
arena: JsonTreeArena,
array_cap: usize,
sampler: ArraySamplerKind,
}
impl YamlArenaBuilder {
fn new(array_cap: usize, sampler: ArraySamplerKind) -> Self {
Self {
arena: JsonTreeArena::default(),
array_cap,
sampler,
}
}
fn finish(self) -> JsonTreeArena {
self.arena
}
fn push_default(&mut self) -> usize {
let id = self.arena.nodes.len();
self.arena.nodes.push(JsonTreeNode::default());
id
}
fn push_object_root(
&mut self,
keys: Vec<String>,
children: Vec<usize>,
) -> usize {
let id = self.push_default();
let count = keys.len().min(children.len());
self.finish_object(id, count, children, keys);
id
}
fn push_array(
&mut self,
children: &[usize],
total_len: usize,
indices: Vec<usize>,
) -> usize {
let id = self.push_default();
let kept = indices.len().min(self.array_cap).min(children.len());
let mut kept_children = Vec::with_capacity(kept);
for &i in indices.iter().take(kept) {
if let Some(cid) = children.get(i) {
kept_children.push(*cid);
}
}
self.finish_array(
id,
kept,
total_len,
kept_children,
indices.into_iter().take(kept).collect(),
);
id
}
fn finish_array(
&mut self,
id: usize,
kept: usize,
total: usize,
local_children: Vec<usize>,
local_indices: Vec<usize>,
) {
let children_start = self.arena.children.len();
self.arena.children.extend(local_children);
let contiguous = local_indices.len() == kept
&& local_indices.iter().enumerate().all(|(i, &idx)| idx == i);
let (arr_indices_start, pushed_len) =
if kept == 0 || contiguous || local_indices.is_empty() {
(0usize, 0usize)
} else {
let start = self.arena.arr_indices.len();
self.arena.arr_indices.extend(local_indices);
let pushed =
self.arena.arr_indices.len().saturating_sub(start);
(start, pushed)
};
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Array;
n.children_start = children_start;
n.children_len = kept;
n.array_len = Some(total);
n.arr_indices_start = arr_indices_start;
n.arr_indices_len = pushed_len.min(kept);
}
fn finish_object(
&mut self,
id: usize,
count: usize,
local_children: Vec<usize>,
local_keys: Vec<String>,
) {
let children_start = self.arena.children.len();
let obj_keys_start = self.arena.obj_keys.len();
self.arena.children.extend(local_children);
self.arena.obj_keys.extend(local_keys);
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Object;
n.children_start = children_start;
n.children_len = count;
n.obj_keys_start = obj_keys_start;
n.obj_keys_len = count;
n.object_len = Some(count);
}
#[allow(
clippy::cognitive_complexity,
reason = "YAML node conversion keeps all cases local for clarity"
)]
fn build(&mut self, y: &Yaml) -> usize {
match y {
Yaml::Array(v) => {
let total = v.len();
let idxs = choose_indices(self.sampler, total, self.array_cap);
let mut child_ids = Vec::with_capacity(idxs.len());
for i in &idxs {
if let Some(item) = v.get(*i) {
child_ids.push(self.build(item));
}
}
self.push_array(&child_ids, total, idxs)
}
Yaml::Hash(hm) => {
let mut keys: Vec<String> = Vec::with_capacity(hm.len());
let mut children: Vec<usize> = Vec::with_capacity(hm.len());
for (k, v) in hm.iter() {
let key = stringify_yaml_key(k);
let cid = self.build(v);
keys.push(key);
children.push(cid);
}
self.push_object_root(keys, children)
}
Yaml::String(s) => {
let id = self.push_default();
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::String;
n.string_value = Some(s.clone());
id
}
Yaml::Integer(i) => {
let id = self.push_default();
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Number;
n.atomic_token = Some(i.to_string());
id
}
Yaml::Real(s) => {
let id = self.push_default();
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Number;
n.atomic_token = Some(s.clone());
id
}
Yaml::Boolean(b) => {
let id = self.push_default();
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Bool;
n.atomic_token =
Some(if *b { "true" } else { "false" }.to_string());
id
}
Yaml::Null | Yaml::BadValue => {
let id = self.push_default();
let n = &mut self.arena.nodes[id];
n.kind = NodeKind::Null;
n.atomic_token = Some("null".to_string());
id
}
Yaml::Alias(_n) => {
let id = self.push_default();
let node = &mut self.arena.nodes[id];
node.kind = NodeKind::String;
node.string_value = Some("*alias".to_string());
id
}
}
}
}
fn stringify_yaml_key(k: &Yaml) -> String {
match k {
Yaml::String(s) | Yaml::Real(s) => s.clone(),
Yaml::Integer(i) => i.to_string(),
Yaml::Boolean(b) => (if *b { "true" } else { "false" }).to_string(),
Yaml::Null | Yaml::BadValue => "null".to_string(),
Yaml::Array(_) | Yaml::Hash(_) | Yaml::Alias(_) => {
"<complex>".to_string()
}
}
}
pub fn parse_yaml_one(
bytes: &[u8],
cfg: &PriorityConfig,
) -> Result<JsonTreeArena> {
build_yaml_tree_arena_from_bytes(bytes, cfg)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::ArraySamplerStrategy;
#[test]
fn tail_sampler_keeps_last_n_indices_yaml() {
let input = b"[0,1,2,3,4,5,6,7,8,9]".to_vec();
let mut cfg = PriorityConfig::new(usize::MAX, 5);
cfg.array_sampler = ArraySamplerStrategy::Tail;
let arena =
build_yaml_tree_arena_from_bytes(&input, &cfg).expect("arena");
let root = &arena.nodes[arena.root_id];
assert_eq!(root.children_len, 5, "kept 5");
let mut orig_indices = Vec::new();
for i in 0..root.children_len {
let oi = if root.arr_indices_len > 0 {
arena.arr_indices[root.arr_indices_start + i]
} else {
i
};
orig_indices.push(oi);
}
assert_eq!(orig_indices, vec![5, 6, 7, 8, 9]);
}
}