Skip to main content

argyph_parse/structural/
mod.rs

1//! Structural parsers for non-code file formats (markdown, JSON, YAML, TOML, CSV).
2
3use serde::{Deserialize, Serialize};
4
5pub mod csv;
6pub mod json;
7pub mod markdown;
8pub mod toml_parser;
9pub mod yaml;
10
11/// Stable identifier for a structural node.
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
13pub struct NodeId(pub u64);
14
15impl std::fmt::Display for NodeId {
16    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
17        write!(f, "{}", self.0)
18    }
19}
20
21/// The kind of a structural node in a non-code file.
22#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
23pub enum NodeKind {
24    MdSection,
25    MdParagraph,
26    MdCodeBlock,
27    MdTable,
28    JsonKey,
29    YamlKey,
30    TomlKey,
31    CsvHeader,
32    CsvRow,
33}
34
35/// A structural node extracted from a non-code file (markdown, JSON, YAML, TOML, CSV).
36#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct StructuralNode {
38    pub id: NodeId,
39    pub file_id: u64,
40    pub kind: NodeKind,
41    pub label: String,
42    pub path: Vec<String>,
43    /// Half-open `[start, end)` byte range in the source text.
44    pub byte_range: (usize, usize),
45    /// 1-based `(first_line, last_line)` inclusive.
46    pub line_range: (u32, u32),
47    pub parent: Option<NodeId>,
48    pub depth: u32,
49}
50
51impl StructuralNode {
52    /// Create a stable, repeatable `NodeId` from the triple `(file_id, kind, path)`.
53    ///
54    /// Uses BLAKE3 for determinism — the same inputs always produce the same ID.
55    #[must_use]
56    pub fn make_id(file_id: u64, kind: NodeKind, path: &[String]) -> NodeId {
57        let mut hasher = blake3::Hasher::new();
58        hasher.update(&file_id.to_le_bytes());
59        hasher.update(&[(kind as u8)]);
60        for segment in path {
61            hasher.update(segment.as_bytes());
62            hasher.update(&[0u8]);
63        }
64        let hash = hasher.finalize();
65        let mut bytes = [0u8; 8];
66        bytes.copy_from_slice(&hash.as_bytes()[..8]);
67        NodeId(u64::from_le_bytes(bytes))
68    }
69}
70
/// Collect the byte offset at which every line of `source` begins.
///
/// The first entry is always `0`; each further entry is the offset of the
/// byte immediately following a `\n`. A trailing newline therefore produces a
/// final entry equal to `source.len()`.
#[must_use]
pub(crate) fn line_starts(source: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(
            source
                .match_indices('\n')
                .map(|(newline_at, _)| newline_at + 1),
        )
        .collect()
}
82
/// Convert a half-open byte range `[start, end)` into a 1-based line range
/// `(first_line, last_line)`, inclusive.
///
/// `line_starts` must have been produced by [`line_starts`] (sorted, strictly
/// increasing, beginning with `0`).
///
/// An empty or reversed range (`end <= start`) is treated as a zero-width
/// position at `start` and maps to `(line, line)`; the result never has
/// `last_line < first_line`. Line numbers that do not fit in a `u32` saturate
/// at `u32::MAX` rather than wrapping.
#[must_use]
pub(crate) fn byte_to_line_range(line_starts: &[usize], start: usize, end: usize) -> (u32, u32) {
    // The number of line starts at or before `offset` is exactly the 1-based
    // number of the line containing that byte.
    let line_of = |offset: usize| line_starts.partition_point(|&begin| begin <= offset);
    let narrow = |line: usize| line.min(u32::MAX as usize) as u32;

    let first = line_of(start);
    // The last byte covered is `end - 1`. With nothing covered there is no
    // such byte, so stay on the first line instead of looking at the byte
    // before `start`, which may belong to the previous line.
    let last = if end > start { line_of(end - 1) } else { first };

    (narrow(first), narrow(last))
}
100
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned path from string literals.
    fn path(segments: &[&str]) -> Vec<String> {
        segments.iter().map(|s| (*s).to_string()).collect()
    }

    #[test]
    fn make_id_is_stable_across_calls() {
        let a = StructuralNode::make_id(1, NodeKind::MdSection, &path(&["top"]));
        let b = StructuralNode::make_id(1, NodeKind::MdSection, &path(&["top"]));
        assert_eq!(a, b);
    }

    #[test]
    fn make_id_differs_across_paths() {
        let a = StructuralNode::make_id(1, NodeKind::MdSection, &path(&["a"]));
        let b = StructuralNode::make_id(1, NodeKind::MdSection, &path(&["b"]));
        assert_ne!(a, b);
    }

    #[test]
    fn make_id_differs_across_kinds() {
        let a = StructuralNode::make_id(1, NodeKind::MdSection, &path(&["a"]));
        let b = StructuralNode::make_id(1, NodeKind::MdParagraph, &path(&["a"]));
        assert_ne!(a, b);
    }

    #[test]
    fn make_id_differs_across_files() {
        let a = StructuralNode::make_id(1, NodeKind::JsonKey, &path(&["a"]));
        let b = StructuralNode::make_id(2, NodeKind::JsonKey, &path(&["a"]));
        assert_ne!(a, b);
    }

    #[test]
    fn make_id_respects_segment_boundaries() {
        // Same concatenated text, different segmentation.
        let joined = StructuralNode::make_id(1, NodeKind::JsonKey, &path(&["ab"]));
        let split = StructuralNode::make_id(1, NodeKind::JsonKey, &path(&["a", "b"]));
        assert_ne!(joined, split);
    }

    #[test]
    fn line_starts_marks_byte_after_each_newline() {
        assert_eq!(line_starts(""), vec![0]);
        assert_eq!(line_starts("abc"), vec![0]);
        assert_eq!(line_starts("a\nbc\n"), vec![0, 2, 5]);
    }

    #[test]
    fn byte_to_line_range_maps_non_empty_ranges() {
        // "ab\ncd\nef" -> lines start at bytes 0, 3 and 6.
        let starts = line_starts("ab\ncd\nef");
        assert_eq!(byte_to_line_range(&starts, 0, 2), (1, 1));
        assert_eq!(byte_to_line_range(&starts, 1, 5), (1, 2));
        assert_eq!(byte_to_line_range(&starts, 3, 8), (2, 3));
    }
}