// llm_transpile/lib.rs
1//! # llm-transpiler
2//!
3//! A high-performance Rust library that converts raw documents (Markdown, HTML,
4//! Plain Text, Tables, etc.) into a structured bridge format so LLM agents can
5//! receive **maximum information with minimum tokens**.
6//!
7//! ## Quick Start
8//!
9//! ```rust
10//! use llm_transpile::{transpile, FidelityLevel, InputFormat};
11//!
12//! let md = "# Contract\n\nThis agreement was concluded in 2024.";
13//! let result = transpile(md, InputFormat::Markdown, FidelityLevel::Semantic, Some(4096))
14//!     .expect("transpile failed");
15//! println!("{}", result);
16//! ```
17//!
18//! ## Streaming Usage
19//!
20//! ```rust,no_run
21//! use llm_transpile::{transpile_stream, FidelityLevel, InputFormat};
22//! use futures::StreamExt;
23//!
24//! async fn example() {
25//!     let md = "# Document\n\nThis is a paragraph.";
26//!     let mut stream = transpile_stream(md, InputFormat::Markdown, FidelityLevel::Semantic, 4096).await;
27//!     while let Some(chunk) = stream.next().await {
28//!         let chunk = chunk.expect("stream error");
29//!         print!("{}", chunk.content);
30//!         if chunk.is_final { break; }
31//!     }
32//! }
33//! ```
34
35// ────────────────────────────────────────────────
36// Internal modules
37// ────────────────────────────────────────────────
38
39pub(crate) mod compressor;
40pub(crate) mod ir;
41pub(crate) mod renderer;
42pub(crate) mod stream;
43pub(crate) mod symbol;
44
45// Parser module (Markdown → IR)
46mod parser;
47
48// ────────────────────────────────────────────────
49// Public re-exports
50// ────────────────────────────────────────────────
51
52pub use compressor::{AdaptiveCompressor, CompressionConfig, CompressionStage};
53pub use ir::{DocNode, FidelityLevel, IRDocument};
54pub use renderer::{build_yaml_header, linearize_table, render_full, render_node};
55pub use stream::{StreamError, StreamingTranspiler, TranspileChunk};
56pub use symbol::SymbolDict;
57
58// ────────────────────────────────────────────────
59// Public enumerations
60// ────────────────────────────────────────────────
61
/// Input document format accepted by [`transpile`] and [`transpile_stream`].
//
// `Hash` is derived in addition to the comparison traits so the format can be
// used directly as a map/set key (e.g. per-format configuration tables);
// this is a backward-compatible addition.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum InputFormat {
    /// Plain text (no markup interpretation).
    PlainText,
    /// CommonMark-compatible Markdown.
    Markdown,
    /// HTML5.
    Html,
}
72
73// ────────────────────────────────────────────────
74// Top-level error type
75// ────────────────────────────────────────────────
76
77/// Transpile error.
78#[derive(Debug, thiserror::Error)]
79pub enum TranspileError {
80    #[error("parse failed: {0}")]
81    Parse(String),
82
83    #[error("symbol table overflow: {0}")]
84    SymbolOverflow(#[from] symbol::SymbolOverflowError),
85
86    #[error("stream error: {0}")]
87    Stream(#[from] stream::StreamError),
88
89    #[error("compression attempted in Lossless mode")]
90    LosslessModeViolation,
91
92    #[error("input exceeds maximum allowed size of {0} bytes")]
93    InputTooLarge(usize),
94}
95
/// Maximum input size accepted by [`transpile`] and [`transpile_stream`].
/// Inputs larger than this limit are rejected with [`TranspileError::InputTooLarge`]
/// to prevent resource exhaustion on unbounded documents.
///
/// The limit is measured in UTF-8 **bytes** (`str::len`), not in characters
/// or tokens.
pub const MAX_INPUT_BYTES: usize = 10 * 1024 * 1024; // 10 MiB
100
101// ────────────────────────────────────────────────
102// Internal helpers
103// ────────────────────────────────────────────────
104
/// Strips Unicode PUA range (U+E000–U+F8FF) characters from the input string.
/// Prevents external input from colliding with the internal symbol substitution scheme.
///
/// Returns `Cow::Borrowed` when no PUA character is present, so the common
/// case performs no allocation.
fn strip_pua(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    // BMP Private Use Area — reserved here for internal symbol substitution.
    const PUA: std::ops::RangeInclusive<char> = '\u{E000}'..='\u{F8FF}';

    let has_pua = input.chars().any(|c| PUA.contains(&c));
    if !has_pua {
        return Cow::Borrowed(input);
    }
    let cleaned: String = input.chars().filter(|c| !PUA.contains(c)).collect();
    Cow::Owned(cleaned)
}
122
123// ────────────────────────────────────────────────
124// Public API
125// ────────────────────────────────────────────────
126
127/// Converts a document **synchronously** into the bridge format.
128///
129/// # Arguments
130/// - `input`    — source document text
131/// - `format`   — input format (Markdown / HTML / PlainText)
132/// - `fidelity` — semantic preservation level
133/// - `budget`   — maximum token count (`None` = unlimited)
134///
135/// # Returns
136/// Bridge-format string (`<D>?<H><B>...</B>`)
137///
138/// # Errors
139/// Returns `TranspileError` on parse failure or symbol table overflow.
140pub fn transpile(
141    input: &str,
142    format: InputFormat,
143    fidelity: FidelityLevel,
144    budget: Option<usize>,
145) -> Result<String, TranspileError> {
146    if input.len() > MAX_INPUT_BYTES {
147        return Err(TranspileError::InputTooLarge(input.len()));
148    }
149    let input = strip_pua(input);
150    let input = input.as_ref();
151
152    // 1. Parse → IR
153    let mut doc = parser::parse(input, format, fidelity, budget).map_err(TranspileError::Parse)?;
154
155    // 2. Compress (only when a budget is provided)
156    if let Some(b) = budget {
157        let compressor = AdaptiveCompressor::new();
158        let cfg = CompressionConfig {
159            budget: b,
160            // Note: token count is estimated from raw input before compression and symbol
161            // substitution. The actual output token count will typically be lower. This
162            // estimate drives compression stage selection and may cause slight over-compression.
163            current_tokens: stream::estimate_tokens(input),
164            fidelity,
165        };
166        doc.nodes = compressor.compress(std::mem::take(&mut doc.nodes), &cfg);
167    }
168
169    // 3. Render
170    let mut dict = SymbolDict::new();
171    let output = render_full(&doc, &mut dict);
172    Ok(output)
173}
174
175/// Converts a document into a **Tokio stream**.
176///
177/// The first chunk is delivered immediately, minimizing TTFT.
178///
179/// # Arguments
180/// - `input`    — source document text
181/// - `format`   — input format (Markdown / HTML / PlainText)
182/// - `fidelity` — semantic preservation level
183/// - `budget`   — maximum allowed token count. Passing `0` is treated as
184///   "unlimited" and immediately switches to `Compressed` mode during
185///   budget-usage calculations. Use a positive non-zero value to enforce a token limit.
186///
187/// # Errors
188/// On parse failure, `Err(StreamError::Parse(...))` is sent as the first stream item
189/// and the stream is then closed. Use [`transpile`] if you prefer a single `Result`.
190pub async fn transpile_stream(
191    input: &str,
192    format: InputFormat,
193    fidelity: FidelityLevel,
194    budget: usize,
195) -> std::pin::Pin<Box<dyn futures::Stream<Item = Result<TranspileChunk, StreamError>> + Send>> {
196    if input.len() > MAX_INPUT_BYTES {
197        return Box::pin(futures::stream::once(futures::future::ready(Err(
198            StreamError::InputTooLarge(input.len()),
199        ))));
200    }
201    let sanitized = strip_pua(input);
202    let input_ref = sanitized.as_ref();
203
204    let doc = match parser::parse(input_ref, format, fidelity, Some(budget)) {
205        Ok(doc) => doc,
206        Err(msg) => {
207            // Parse failure: immediately return a stream containing a single Err chunk.
208            // futures::future::ready() is Unpin, so it can be safely used with stream::once.
209            return Box::pin(futures::stream::once(futures::future::ready(Err(
210                StreamError::Parse(msg),
211            ))));
212        }
213    };
214
215    let transpiler = StreamingTranspiler::new(budget, fidelity);
216    Box::pin(transpiler.transpile(doc))
217}
218
/// Returns the approximate token count for the given text.
///
/// Uses a character-count-based heuristic without a real model tokenizer.
/// For higher accuracy, use `tiktoken-rs` or the `tokenizers` crate directly.
///
/// Delegates to the same internal estimator that [`transpile`] uses to seed
/// the compression budget, so counts are consistent across the crate.
pub fn token_count(text: &str) -> usize {
    stream::estimate_tokens(text)
}
226
227// ────────────────────────────────────────────────
228// Integration tests
229// ────────────────────────────────────────────────
230
231#[cfg(test)]
232mod tests {
233    use super::*;
234
    /// Korean-language sample: a software-license contract in Markdown with
    /// headings, a bullet list, and a two-column table — exercises heading
    /// parsing, list handling, table linearization, and multi-byte text.
    const SAMPLE_MD: &str = r#"
# 소프트웨어 라이선스 계약

## 계약 당사자

본 계약은 갑(라이선서)과 을(라이선시) 사이에 체결됩니다.

## 주요 조항

- 소스 코드 배포 금지
- 역설계 금지
- 연간 라이선스 비용: 1,000,000원

| 항목 | 금액 |
|------|------|
| 기본료 | 800,000원 |
| 유지보수 | 200,000원 |
"#;
253
254    #[test]
255    fn transpile_markdown_produces_bridge_format() {
256        let result = transpile(
257            SAMPLE_MD,
258            InputFormat::Markdown,
259            FidelityLevel::Semantic,
260            Some(2048),
261        );
262        assert!(
263            result.is_ok(),
264            "transpile should succeed: {:?}",
265            result.err()
266        );
267        let output = result.unwrap();
268        assert!(output.contains("<B>"), "output must contain <B> tag");
269        assert!(
270            output.contains("</B>"),
271            "output must contain </B> closing tag"
272        );
273    }
274
275    #[test]
276    fn transpile_lossless_preserves_content() {
277        let result = transpile(
278            "중요한 법적 내용입니다.",
279            InputFormat::PlainText,
280            FidelityLevel::Lossless,
281            None,
282        );
283        let output = result.unwrap();
284        assert!(output.contains("중요한 법적 내용입니다."));
285    }
286
287    #[test]
288    fn token_count_is_positive() {
289        assert!(token_count("hello world") > 0);
290    }
291
292    #[test]
293    fn pua_chars_stripped_from_input() {
294        let input_with_pua = "hello \u{E000}world\u{F8FF}";
295        let output = transpile(
296            input_with_pua,
297            InputFormat::PlainText,
298            FidelityLevel::Lossless,
299            None,
300        )
301        .unwrap();
302        assert!(
303            !output.contains('\u{E000}'),
304            "PUA characters must not appear in output"
305        );
306        assert!(output.contains("hello"), "plain text must be preserved");
307        assert!(
308            output.contains("world"),
309            "adjacent text after PUA removal must be preserved"
310        );
311    }
312
    /// Verifies (a) at compile time that `StreamError::Parse` is `Send` —
    /// required for the boxed stream's `+ Send` bound — and (b) that a valid
    /// document yields at least one `Ok` chunk from the stream.
    #[tokio::test]
    async fn stream_error_variant_is_send_and_stream_works() {
        use futures::StreamExt;
        use stream::StreamError;

        // Compile-time check for StreamError::Parse variant
        fn _assert_send<T: Send>(_: T) {}
        _assert_send(StreamError::Parse("test".to_string()));

        // Verify normal streaming behavior
        let mut stream = transpile_stream(
            SAMPLE_MD,
            InputFormat::Markdown,
            FidelityLevel::Semantic,
            8192,
        )
        .await;
        let first = stream.next().await.expect("at least one chunk must exist");
        assert!(
            first.is_ok(),
            "valid input must yield an Ok chunk: {:?}",
            first.err()
        );
    }
337
338    #[test]
339    fn transpile_rejects_oversized_input() {
340        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
341        let result = transpile(&huge, InputFormat::PlainText, FidelityLevel::Lossless, None);
342        assert!(
343            matches!(result, Err(TranspileError::InputTooLarge(_))),
344            "expected InputTooLarge, got: {:?}",
345            result
346        );
347    }
348
349    #[tokio::test]
350    async fn stream_rejects_oversized_input() {
351        use futures::StreamExt;
352        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
353        let mut stream =
354            transpile_stream(&huge, InputFormat::PlainText, FidelityLevel::Lossless, 0).await;
355        let first = stream.next().await.expect("must yield an error item");
356        assert!(
357            matches!(first, Err(stream::StreamError::InputTooLarge(_))),
358            "oversized stream input must yield InputTooLarge, got: {:?}",
359            first
360        );
361    }
362
    /// HTML numeric entities that decode to PUA code points must be stripped
    /// after sanitization, just like literal PUA characters in the input.
    #[test]
    fn html_pua_entity_stripped_after_tag_removal() {
        // &#xE000; decoded by ammonia becomes a PUA char — must be stripped
        let html = "<p>hello &#xE000; world</p>";
        let output = transpile(html, InputFormat::Html, FidelityLevel::Lossless, None).unwrap();
        assert!(
            !output.contains('\u{E000}'),
            "PUA from HTML entity decoding must be stripped"
        );
        assert!(output.contains("hello"), "surrounding text must be preserved");
}
374}