llm_transpile/ir.rs
//! ir.rs — Intermediate Representation
//!
//! A language-neutral intermediate representation (IR) that holds raw documents
//! before they are converted into the LLM bridge format.
//! The semantic preservation level is controlled explicitly.
6
// ────────────────────────────────────────────────
// 1. Semantic preservation level
// ────────────────────────────────────────────────
10
/// How much information loss a document conversion is allowed to introduce.
///
/// The level is chosen once, at the top of the pipeline, and every later
/// transformation stage is expected to honor it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FidelityLevel {
    /// Audit/legal documents: 100% source preservation, compression forbidden.
    Lossless,
    /// General RAG pipelines: minimal compression at the semantic-unit level.
    Semantic,
    /// Summarization pipelines: maximum compression, only key information kept.
    Compressed,
}

impl FidelityLevel {
    /// Reports whether lossy compression may be applied at this level.
    ///
    /// `Lossless` yields `false`; `Semantic` and `Compressed` yield `true`.
    pub fn allows_compression(self) -> bool {
        match self {
            Self::Lossless => false,
            Self::Semantic | Self::Compressed => true,
        }
    }
}
31
// ────────────────────────────────────────────────
// 2. Document node (DocNode)
// ────────────────────────────────────────────────
35
/// A semantic unit that makes up a document.
///
/// Produced by the parser and consumed by the renderer, compressor, and symbolizer.
#[derive(Debug, Clone)]
pub enum DocNode {
    /// Heading (H1–H6).
    Header {
        /// Heading level (1–6).
        level: u8,
        /// Heading text.
        text: String,
    },

    /// Regular paragraph.
    Para {
        /// Paragraph text.
        text: String,
        /// Importance score (0.0 = lowest, 1.0 = highest).
        /// Used by the compressor for priority-based trimming.
        importance: f32,
    },

    /// Table.
    Table {
        /// Column header cells.
        headers: Vec<String>,
        /// Body rows; each row is a list of cell strings.
        rows: Vec<Vec<String>>,
    },

    /// Code block.
    Code {
        /// Optional language tag of the block.
        lang: Option<String>,
        /// Source text of the block.
        body: String,
    },

    /// List (ordered or unordered).
    List {
        /// `true` for an ordered list, `false` for an unordered one.
        ordered: bool,
        /// Text of each list item.
        items: Vec<String>,
    },

    /// Key-value metadata (title, summary, keywords, etc.).
    Metadata {
        /// Metadata key.
        key: String,
        /// Metadata value.
        value: String,
    },
}

impl DocNode {
    /// Returns the importance score of the node.
    ///
    /// A `Para` reports its own stored score; every other node kind reports
    /// the default importance of `1.0`.
    pub fn importance(&self) -> f32 {
        match self {
            DocNode::Para { importance, .. } => *importance,
            // Spelled out instead of using a `_` arm so that adding a new
            // variant forces an explicit decision about its importance.
            DocNode::Header { .. }
            | DocNode::Table { .. }
            | DocNode::Code { .. }
            | DocNode::List { .. }
            | DocNode::Metadata { .. } => 1.0,
        }
    }

    /// Returns the size of the text held by the node, measured in UTF-8 bytes.
    ///
    /// Despite the method name, this sums `String::len`, so it is a byte
    /// count rather than a count of `char`s: multi-byte text (for example a
    /// 3-byte Hangul syllable) contributes more than one per character. It is
    /// therefore only an approximation of character count, used for
    /// pre-filtering against the token budget.
    ///
    /// Only text payloads are counted. `Header::level`, `Para::importance`,
    /// `Code::lang` and `List::ordered` do not contribute.
    pub fn char_len(&self) -> usize {
        /// Sums the UTF-8 byte lengths of a sequence of strings.
        fn byte_sum<'a>(parts: impl Iterator<Item = &'a String>) -> usize {
            parts.map(String::len).sum()
        }

        match self {
            DocNode::Header { text, .. } | DocNode::Para { text, .. } => text.len(),
            DocNode::Table { headers, rows } => {
                byte_sum(headers.iter()) + byte_sum(rows.iter().flatten())
            }
            DocNode::Code { body, .. } => body.len(),
            DocNode::List { items, .. } => byte_sum(items.iter()),
            DocNode::Metadata { key, value } => key.len() + value.len(),
        }
    }
}
103
// ────────────────────────────────────────────────
// 3. IR document
// ────────────────────────────────────────────────
107
108/// The complete IR representation of a parsed document.
109///
110/// `fidelity` and `token_budget` act as constraints for every subsequent transformation stage.
111#[derive(Debug, Clone)]
112pub struct IRDocument {
113 /// Semantic preservation level.
114 pub fidelity: FidelityLevel,
115 /// Sequence of document nodes.
116 pub nodes: Vec<DocNode>,
117 /// Maximum allowed token count. `None` means unlimited.
118 pub token_budget: Option<usize>,
119}
120
121impl IRDocument {
122 /// Creates a new IR document.
123 pub fn new(fidelity: FidelityLevel, token_budget: Option<usize>) -> Self {
124 Self {
125 fidelity,
126 nodes: Vec::new(),
127 token_budget,
128 }
129 }
130
131 /// Appends a node.
132 pub fn push(&mut self, node: DocNode) {
133 self.nodes.push(node);
134 }
135
136 /// Total character count of the document (for pre-validation against the token budget).
137 pub fn total_char_len(&self) -> usize {
138 self.nodes.iter().map(|n| n.char_len()).sum()
139 }
140
141 /// Looks up the value for a specific key from metadata nodes.
142 pub fn get_metadata(&self, key: &str) -> Option<&str> {
143 self.nodes.iter().find_map(|n| {
144 if let DocNode::Metadata { key: k, value } = n
145 && k == key
146 {
147 return Some(value.as_str());
148 }
149 None
150 })
151 }
152}
153
// ────────────────────────────────────────────────
// 4. Unit tests
// ────────────────────────────────────────────────
157
#[cfg(test)]
mod tests {
    use super::*;

    /// `Lossless` must forbid compression; both lossy levels must allow it.
    #[test]
    fn fidelity_compression_flag() {
        assert!(!FidelityLevel::Lossless.allows_compression());
        for lossy in [FidelityLevel::Semantic, FidelityLevel::Compressed] {
            assert!(lossy.allows_compression());
        }
    }

    /// Non-paragraph nodes default to 1.0; paragraphs report their own score.
    #[test]
    fn doc_node_importance_defaults() {
        let heading = DocNode::Header {
            level: 1,
            text: String::from("제목"),
        };
        assert_eq!(heading.importance(), 1.0);

        let paragraph = DocNode::Para {
            text: String::from("내용"),
            importance: 0.3,
        };
        assert_eq!(paragraph.importance(), 0.3);
    }

    /// A stored metadata key is found; an absent key yields `None`.
    #[test]
    fn ir_document_metadata_lookup() {
        let title = DocNode::Metadata {
            key: String::from("title"),
            value: String::from("테스트 문서"),
        };

        let mut doc = IRDocument::new(FidelityLevel::Semantic, Some(4096));
        doc.push(title);

        let found = doc.get_metadata("title");
        assert_eq!(found, Some("테스트 문서"));

        let absent = doc.get_metadata("missing");
        assert_eq!(absent, None);
    }

    /// Table length is the sum of the UTF-8 byte lengths of all cells.
    #[test]
    fn table_char_len() {
        let headers: Vec<String> = vec!["이름".into(), "나이".into()];
        let rows: Vec<Vec<String>> = vec![vec!["홍길동".into(), "30".into()]];
        let table = DocNode::Table { headers, rows };

        // "이름"(6 bytes) + "나이"(6 bytes) + "홍길동"(9 bytes) + "30"(2 bytes) = 23 bytes
        let expected_bytes = 23;
        assert_eq!(table.char_len(), expected_bytes);
    }
}
204}