// liwe 0.1.10
//
// IWE core library
// Documentation
use std::collections::{BTreeMap, HashMap};

use serde::Serialize;
use serde_yaml::Value;

use crate::graph::Graph;
use crate::model::Key;
use crate::query::document::YamlType;
use crate::query::filter::detect_type;
use crate::query::frontmatter::is_reserved_segment;

/// Upper bound on the number of distinct scalar values listed per field.
///
/// `infer_schema` emits a field's `values` list only while its distinct count is
/// at most this limit; beyond it the list is left empty.
const MAX_DISTINCT_VALUES: usize = 100;

/// How often a field was observed with one particular YAML type.
///
/// Serialized with camelCase keys; `yaml_type` is emitted under the key `type`.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct TypeCount {
    /// String form of the detected type (e.g. the tests observe `"string"`, `"null"`,
    /// `"object"` and `"date"`).
    #[serde(rename = "type")]
    pub yaml_type: String,
    /// Number of occurrences of the field that had this type.
    pub count: usize,
    /// `count` as a percentage (0–100 scale) of all recorded occurrences of the field.
    pub percentage: f64,
}

/// A distinct scalar value of a field together with its number of occurrences.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ValueCount {
    /// String rendering of the scalar, as produced by `scalar_to_string`.
    pub value: String,
    /// Number of times this exact rendering was recorded for the field.
    pub count: usize,
}

/// How widely a field is present across the inspected documents.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct Coverage {
    /// Number of times the field was recorded while walking the frontmatter.
    pub count: usize,
    /// `count` as a percentage (0–100 scale) of the number of keys passed to
    /// `infer_schema`; `0.0` when no keys were passed.
    pub percentage: f64,
}

/// Inferred description of a single frontmatter field.
///
/// Serialized with camelCase keys; `name` is emitted under the key `field`.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct FieldSchema {
    /// Field path; nested mapping keys are joined to their parent with `.`.
    #[serde(rename = "field")]
    pub name: String,
    /// Observed types, ordered by descending count and then by type name.
    pub types: Vec<TypeCount>,
    /// Presence statistics for the field.
    pub coverage: Coverage,
    /// Number of distinct scalar renderings recorded for the field.
    pub distinct: usize,
    /// Distinct values ordered by descending count and then by value; left empty
    /// when `distinct` exceeds `MAX_DISTINCT_VALUES`.
    pub values: Vec<ValueCount>,
}

/// Running tallies for one field path, filled in while walking frontmatter.
struct FieldAccumulator {
    /// Occurrences per detected YAML type.
    type_counts: HashMap<YamlType, usize>,
    /// Total number of recorded occurrences of the field.
    coverage: usize,
    /// Occurrences per scalar rendering; values that `scalar_to_string` rejects
    /// are not tracked here.
    value_counts: HashMap<String, usize>,
}

impl FieldAccumulator {
    /// Creates an accumulator with no observations.
    fn new() -> Self {
        let type_counts = HashMap::new();
        let value_counts = HashMap::new();
        Self {
            type_counts,
            coverage: 0,
            value_counts,
        }
    }

    /// Registers one occurrence of the field holding `value`.
    ///
    /// Each call increments the coverage counter and the tally of the value's
    /// detected type. The per-value tally is only touched when the value has a
    /// scalar string rendering.
    fn record(&mut self, value: &Value) {
        self.coverage += 1;

        let detected = detect_type(value);
        *self.type_counts.entry(detected).or_default() += 1;

        // Non-scalar (or non-enum-like) values contribute to type statistics only.
        let Some(rendered) = scalar_to_string(value) else {
            return;
        };
        *self.value_counts.entry(rendered).or_default() += 1;
    }
}

/// Returns `true` when `s` is non-empty and consists solely of alphanumeric
/// characters or one of `-`, `_`, `.`, `/`.
fn is_enum_like(s: &str) -> bool {
    if s.is_empty() {
        return false;
    }

    s.chars()
        .all(|ch| matches!(ch, '-' | '_' | '.' | '/') || ch.is_alphanumeric())
}

/// Renders a YAML scalar as a string suitable for value counting.
///
/// Nulls become `"null"`, booleans and numbers use their `to_string` form, and
/// strings are kept only when `is_enum_like` accepts them. Tagged values are
/// unwrapped and rendered by their inner value. Everything else yields `None`.
fn scalar_to_string(v: &Value) -> Option<String> {
    let rendered = match v {
        Value::Tagged(tagged) => return scalar_to_string(&tagged.value),
        Value::Null => "null".to_string(),
        Value::Bool(flag) => flag.to_string(),
        Value::Number(number) => number.to_string(),
        Value::String(text) if is_enum_like(text) => text.clone(),
        _ => return None,
    };

    Some(rendered)
}

/// Records every field of `mapping` into `accumulators`, descending into nested
/// mappings.
///
/// Keys that are not strings, and keys rejected by `is_reserved_segment`, are
/// skipped together with everything beneath them. A field's path is its key,
/// prefixed with `prefix` and a `.` when `prefix` is non-empty.
fn walk_mapping(
    mapping: &serde_yaml::Mapping,
    prefix: &str,
    accumulators: &mut BTreeMap<String, FieldAccumulator>,
) {
    for (key, value) in mapping {
        let Some(segment) = key.as_str() else {
            continue;
        };
        if is_reserved_segment(segment) {
            continue;
        }

        let path = match prefix {
            "" => segment.to_string(),
            parent => format!("{}.{}", parent, segment),
        };

        // The path is cloned because it is still needed as the prefix for recursion.
        accumulators
            .entry(path.clone())
            .or_insert_with(FieldAccumulator::new)
            .record(value);

        // A mapping is recorded as a field itself and then walked for its children.
        if let Value::Mapping(children) = value {
            walk_mapping(children, &path, accumulators);
        }
    }
}

pub fn infer_schema(graph: &Graph, keys: &[Key]) -> Vec<FieldSchema> {
    let total_documents = keys.len();
    let mut accumulators: BTreeMap<String, FieldAccumulator> = BTreeMap::new();

    for key in keys {
        if let Some(mapping) = graph.frontmatter(key) {
            walk_mapping(mapping, "", &mut accumulators);
        }
    }

    accumulators
        .into_iter()
        .map(|(name, acc)| {
            let coverage_count = acc.coverage;

            let mut types: Vec<TypeCount> = acc
                .type_counts
                .into_iter()
                .map(|(t, count)| TypeCount {
                    yaml_type: t.to_string(),
                    count,
                    percentage: if coverage_count > 0 {
                        (count as f64 / coverage_count as f64) * 100.0
                    } else {
                        0.0
                    },
                })
                .collect();
            types.sort_by(|a, b| b.count.cmp(&a.count).then(a.yaml_type.cmp(&b.yaml_type)));

            let distinct = acc.value_counts.len();
            let values = if distinct <= MAX_DISTINCT_VALUES {
                let mut vals: Vec<ValueCount> = acc
                    .value_counts
                    .into_iter()
                    .map(|(value, count)| ValueCount { value, count })
                    .collect();
                vals.sort_by(|a, b| b.count.cmp(&a.count).then(a.value.cmp(&b.value)));
                vals
            } else {
                Vec::new()
            };

            FieldSchema {
                name,
                types,
                coverage: Coverage {
                    count: coverage_count,
                    percentage: if total_documents > 0 {
                        (coverage_count as f64 / total_documents as f64) * 100.0
                    } else {
                        0.0
                    },
                },
                distinct,
                values,
            }
        })
        .collect()
}

/// Unit tests for `infer_schema`, driven by small in-memory markdown documents
/// with YAML frontmatter.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::markdown::MarkdownReader;
    use crate::model::Key;

    /// Builds a graph by loading each `(key name, markdown content)` pair.
    fn build_graph(docs: &[(&str, &str)]) -> Graph {
        let mut graph = Graph::new();
        for (key, content) in docs {
            graph.from_markdown(Key::name(key), content, MarkdownReader::new());
        }
        graph
    }

    /// No documents and no keys produce no fields.
    #[test]
    fn empty_graph() {
        let graph = build_graph(&[]);
        let fields = infer_schema(&graph, &[]);
        assert!(fields.is_empty());
    }

    /// Flat string fields of a single document are reported with full coverage,
    /// a single `string` type and their one value.
    #[test]
    fn single_doc_flat_fields() {
        let graph = build_graph(&[("doc1", "---\ntype: post\nstatus: draft\n---\n# Title\n")]);
        let keys = vec![Key::name("doc1")];
        let fields = infer_schema(&graph, &keys);

        assert_eq!(fields.len(), 2);

        let type_field = fields.iter().find(|f| f.name == "type").unwrap();
        assert_eq!(type_field.coverage.count, 1);
        assert_eq!(type_field.coverage.percentage, 100.0);
        assert_eq!(type_field.types.len(), 1);
        assert_eq!(type_field.types[0].yaml_type, "string");
        assert_eq!(type_field.values.len(), 1);
        assert_eq!(type_field.values[0].value, "post");
    }

    /// A field present in only one of two documents gets 50% coverage.
    #[test]
    fn partial_coverage() {
        let graph = build_graph(&[
            ("doc1", "---\ntype: post\nstatus: draft\n---\n# A\n"),
            ("doc2", "---\ntype: external\n---\n# B\n"),
        ]);
        let keys = vec![Key::name("doc1"), Key::name("doc2")];
        let fields = infer_schema(&graph, &keys);

        let type_field = fields.iter().find(|f| f.name == "type").unwrap();
        assert_eq!(type_field.coverage.count, 2);
        assert_eq!(type_field.coverage.percentage, 100.0);

        let status_field = fields.iter().find(|f| f.name == "status").unwrap();
        assert_eq!(status_field.coverage.count, 1);
        assert_eq!(status_field.coverage.percentage, 50.0);
    }

    /// A field that is a string in one document and null in another reports
    /// both types, each counted once.
    #[test]
    fn polymorphic_types() {
        let graph = build_graph(&[
            ("doc1", "---\nurl: https://example.com\n---\n# A\n"),
            ("doc2", "---\nurl: null\n---\n# B\n"),
        ]);
        let keys = vec![Key::name("doc1"), Key::name("doc2")];
        let fields = infer_schema(&graph, &keys);

        let url_field = fields.iter().find(|f| f.name == "url").unwrap();
        assert_eq!(url_field.coverage.count, 2);
        assert_eq!(url_field.types.len(), 2);

        let string_type = url_field
            .types
            .iter()
            .find(|t| t.yaml_type == "string")
            .unwrap();
        assert_eq!(string_type.count, 1);
        let null_type = url_field
            .types
            .iter()
            .find(|t| t.yaml_type == "null")
            .unwrap();
        assert_eq!(null_type.count, 1);
    }

    /// Nested mappings are reported both as an `object` field and as dotted
    /// child paths.
    #[test]
    fn nested_objects() {
        let graph = build_graph(&[(
            "doc1",
            "---\nengagement:\n  upvotes: 10\n  comments: 5\n---\n# A\n",
        )]);
        let keys = vec![Key::name("doc1")];
        let fields = infer_schema(&graph, &keys);

        let names: Vec<&str> = fields.iter().map(|f| f.name.as_str()).collect();
        assert!(names.contains(&"engagement"));
        assert!(names.contains(&"engagement.upvotes"));
        assert!(names.contains(&"engagement.comments"));

        let engagement = fields.iter().find(|f| f.name == "engagement").unwrap();
        assert_eq!(engagement.types[0].yaml_type, "object");
    }

    /// An ISO-style date value is classified as `date`.
    #[test]
    fn date_detection() {
        let graph = build_graph(&[("doc1", "---\ncreated: 2026-04-25\n---\n# A\n")]);
        let keys = vec![Key::name("doc1")];
        let fields = infer_schema(&graph, &keys);

        let created = fields.iter().find(|f| f.name == "created").unwrap();
        assert_eq!(created.types[0].yaml_type, "date");
    }

    /// Distinct values are tallied and ordered by descending count.
    #[test]
    fn value_counting() {
        let graph = build_graph(&[
            ("doc1", "---\nstatus: draft\n---\n# A\n"),
            ("doc2", "---\nstatus: published\n---\n# B\n"),
            ("doc3", "---\nstatus: draft\n---\n# C\n"),
        ]);
        let keys = vec![Key::name("doc1"), Key::name("doc2"), Key::name("doc3")];
        let fields = infer_schema(&graph, &keys);

        let status = fields.iter().find(|f| f.name == "status").unwrap();
        assert_eq!(status.values.len(), 2);
        assert_eq!(status.values[0].value, "draft");
        assert_eq!(status.values[0].count, 2);
        assert_eq!(status.values[1].value, "published");
        assert_eq!(status.values[1].count, 1);
    }

    /// The `_internal` key is expected to be left out of the schema
    /// (presumably because `is_reserved_segment` treats it as reserved).
    #[test]
    fn reserved_fields_skipped() {
        let graph = build_graph(&[("doc1", "---\ntype: post\n_internal: secret\n---\n# A\n")]);
        let keys = vec![Key::name("doc1")];
        let fields = infer_schema(&graph, &keys);

        let names: Vec<&str> = fields.iter().map(|f| f.name.as_str()).collect();
        assert!(names.contains(&"type"));
        assert!(!names.contains(&"_internal"));
    }

    /// Only documents named in `keys` contribute fields, even when the graph
    /// holds more documents.
    #[test]
    fn filtered_keys_subset() {
        let graph = build_graph(&[
            ("doc1", "---\ntype: post\nstatus: draft\n---\n# A\n"),
            (
                "doc2",
                "---\ntype: external\nurl: https://x.com\n---\n# B\n",
            ),
        ]);
        let keys = vec![Key::name("doc1")];
        let fields = infer_schema(&graph, &keys);

        assert_eq!(fields.len(), 2);
        let names: Vec<&str> = fields.iter().map(|f| f.name.as_str()).collect();
        assert!(names.contains(&"status"));
        assert!(!names.contains(&"url"));
    }
}