Skip to main content

llm_transpile/
ir.rs

//! ir.rs — Intermediate Representation
//!
//! A language-neutral intermediate representation (IR) that holds raw documents
//! before they are converted into the LLM bridge format.
//! The semantic preservation level is controlled explicitly.

// ────────────────────────────────────────────────
// 1. Semantic preservation level
// ────────────────────────────────────────────────

/// The degree of information loss permitted during document conversion.
///
/// Decided once at the top of the pipeline; every subsequent transformation
/// stage consistently respects this constraint.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FidelityLevel {
    /// Audit/legal documents — 100% source preservation, compression forbidden.
    Lossless,
    /// General RAG pipeline — minimal compression at the semantic unit level.
    Semantic,
    /// Summarization pipeline — maximum compression, only key information kept.
    Compressed,
}

impl FidelityLevel {
    /// Returns whether lossy compression is permitted at this level.
    ///
    /// `Lossless` returns `false`; `Semantic` and `Compressed` return `true`.
    /// The function is `const`, so the answer can also be computed in
    /// constant contexts.
    pub const fn allows_compression(self) -> bool {
        // Deliberately exhaustive (no wildcard arm): adding a new level must
        // fail to compile here until its compression policy is decided.
        match self {
            FidelityLevel::Lossless => false,
            FidelityLevel::Semantic | FidelityLevel::Compressed => true,
        }
    }
}

// ────────────────────────────────────────────────
// 2. Document node (DocNode)
// ────────────────────────────────────────────────

/// A semantic unit that makes up a document.
///
/// Produced by the parser and consumed by the renderer, compressor, and symbolizer.
#[derive(Debug, Clone)]
pub enum DocNode {
    /// Heading (H1–H6).
    Header {
        /// Heading level (1–6).
        level: u8,
        /// Heading text.
        text: String,
    },

    /// Regular paragraph.
    Para {
        /// Paragraph text.
        text: String,
        /// Importance score (0.0 = lowest, 1.0 = highest).
        /// Used by the compressor for priority-based trimming.
        importance: f32,
    },

    /// Table.
    Table {
        /// Column header cells.
        headers: Vec<String>,
        /// Body rows; each row is a list of cell strings.
        rows: Vec<Vec<String>>,
    },

    /// Code block.
    Code {
        /// Optional language tag.
        lang: Option<String>,
        /// Code text.
        body: String,
    },

    /// List (ordered or unordered).
    List {
        /// `true` for an ordered list, `false` for an unordered one.
        ordered: bool,
        /// List item texts.
        items: Vec<String>,
    },

    /// Key-value metadata (title, summary, keywords, etc.).
    Metadata {
        /// Metadata key.
        key: String,
        /// Metadata value.
        value: String,
    },
}

impl DocNode {
    /// Returns the importance score of the node.
    ///
    /// Only `Para` stores its own score; every other variant reports `1.0`.
    pub fn importance(&self) -> f32 {
        // Exhaustive on purpose: a newly added variant must pick its default
        // importance explicitly instead of silently inheriting 1.0.
        match self {
            DocNode::Para { importance, .. } => *importance,
            DocNode::Header { .. }
            | DocNode::Table { .. }
            | DocNode::Code { .. }
            | DocNode::List { .. }
            | DocNode::Metadata { .. } => 1.0,
        }
    }

    /// Returns the size of the text held by the node, in UTF-8 **bytes**.
    ///
    /// Despite the name, this is `str::len` summed over the node's strings,
    /// not a count of Unicode characters: multi-byte text (e.g. Korean)
    /// contributes several bytes per visible character. Only text payloads
    /// are counted — the heading level, a code block's `lang`, the list's
    /// `ordered` flag and any separators are not included.
    ///
    /// Used for pre-filtering against the token budget.
    pub fn char_len(&self) -> usize {
        match self {
            DocNode::Header { text, .. } | DocNode::Para { text, .. } => text.len(),
            // Header cells and every body cell are summed in one pass.
            DocNode::Table { headers, rows } => headers
                .iter()
                .chain(rows.iter().flatten())
                .map(String::len)
                .sum::<usize>(),
            DocNode::Code { body, .. } => body.len(),
            DocNode::List { items, .. } => items.iter().map(String::len).sum::<usize>(),
            DocNode::Metadata { key, value } => key.len() + value.len(),
        }
    }
}

// ────────────────────────────────────────────────
// 3. IR document
// ────────────────────────────────────────────────

108/// The complete IR representation of a parsed document.
109///
110/// `fidelity` and `token_budget` act as constraints for every subsequent transformation stage.
111#[derive(Debug, Clone)]
112pub struct IRDocument {
113    /// Semantic preservation level.
114    pub fidelity: FidelityLevel,
115    /// Sequence of document nodes.
116    pub nodes: Vec<DocNode>,
117    /// Maximum allowed token count. `None` means unlimited.
118    pub token_budget: Option<usize>,
119}
120
121impl IRDocument {
122    /// Creates a new IR document.
123    pub fn new(fidelity: FidelityLevel, token_budget: Option<usize>) -> Self {
124        Self {
125            fidelity,
126            nodes: Vec::new(),
127            token_budget,
128        }
129    }
130
131    /// Appends a node.
132    pub fn push(&mut self, node: DocNode) {
133        self.nodes.push(node);
134    }
135
136    /// Total character count of the document (for pre-validation against the token budget).
137    pub fn total_char_len(&self) -> usize {
138        self.nodes.iter().map(|n| n.char_len()).sum()
139    }
140
141    /// Looks up the value for a specific key from metadata nodes.
142    pub fn get_metadata(&self, key: &str) -> Option<&str> {
143        self.nodes.iter().find_map(|n| {
144            if let DocNode::Metadata { key: k, value } = n
145                && k == key
146            {
147                return Some(value.as_str());
148            }
149            None
150        })
151    }
152}

// ────────────────────────────────────────────────
// 4. Unit tests
// ────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// Only `Lossless` refuses compression; the other two levels allow it.
    #[test]
    fn fidelity_compression_flag() {
        let expectations = [
            (FidelityLevel::Lossless, false),
            (FidelityLevel::Semantic, true),
            (FidelityLevel::Compressed, true),
        ];
        for (level, allowed) in expectations.iter() {
            assert_eq!(level.allows_compression(), *allowed);
        }
    }

    /// A heading reports the default score; a paragraph reports its own.
    #[test]
    fn doc_node_importance_defaults() {
        let heading = DocNode::Header {
            level: 1,
            text: String::from("제목"),
        };
        let paragraph = DocNode::Para {
            text: String::from("내용"),
            importance: 0.3,
        };

        assert_eq!(heading.importance(), 1.0);
        assert_eq!(paragraph.importance(), 0.3);
    }

    /// A stored metadata key is found; an absent key yields `None`.
    #[test]
    fn ir_document_metadata_lookup() {
        let title_node = DocNode::Metadata {
            key: String::from("title"),
            value: String::from("테스트 문서"),
        };

        let mut document = IRDocument::new(FidelityLevel::Semantic, Some(4096));
        document.push(title_node);

        let found = document.get_metadata("title");
        let absent = document.get_metadata("missing");
        assert_eq!(found, Some("테스트 문서"));
        assert_eq!(absent, None);
    }

    /// Table size is the sum of the UTF-8 byte lengths of all cells.
    #[test]
    fn table_char_len() {
        let header_cells: Vec<String> = vec![String::from("이름"), String::from("나이")];
        let body_rows: Vec<Vec<String>> = vec![vec![String::from("홍길동"), String::from("30")]];
        let table = DocNode::Table {
            headers: header_cells,
            rows: body_rows,
        };

        // "이름" (6 bytes) + "나이" (6 bytes) + "홍길동" (9 bytes) + "30" (2 bytes)
        let expected_bytes = 6 + 6 + 9 + 2;
        assert_eq!(table.char_len(), expected_bytes);
    }
}