1pub(crate) mod compressor;
40pub(crate) mod ir;
41pub(crate) mod renderer;
42pub(crate) mod stream;
43pub(crate) mod symbol;
44
45mod parser;
47
48pub use compressor::{AdaptiveCompressor, CompressionConfig, CompressionStage};
53pub use ir::{DocNode, FidelityLevel, IRDocument};
54pub use renderer::{build_yaml_header, linearize_table, render_full, render_node};
55pub use stream::{StreamError, StreamingTranspiler, TranspileChunk};
56pub use symbol::SymbolDict;
57
/// Source format of the text handed to [`transpile`] and [`transpile_stream`].
///
/// The value is forwarded unchanged to the parser, which decides how the raw
/// input is interpreted.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InputFormat {
    /// Input is treated as plain text.
    PlainText,
    /// Input is treated as Markdown.
    Markdown,
    /// Input is treated as HTML.
    Html,
}
72
73#[derive(Debug, thiserror::Error)]
79pub enum TranspileError {
80 #[error("parse failed: {0}")]
81 Parse(String),
82
83 #[error("symbol table overflow: {0}")]
84 SymbolOverflow(#[from] symbol::SymbolOverflowError),
85
86 #[error("stream error: {0}")]
87 Stream(#[from] stream::StreamError),
88
89 #[error("compression attempted in Lossless mode")]
90 LosslessModeViolation,
91
92 #[error("input exceeds maximum allowed size of {0} bytes")]
93 InputTooLarge(usize),
94}
95
/// Largest input, in bytes, accepted by the transpile entry points (10 MiB).
pub const MAX_INPUT_BYTES: usize = 10 * 1024 * 1024;

/// Removes every code point in the BMP Private Use Area (`U+E000..=U+F8FF`)
/// from `input`.
///
/// Returns `Cow::Borrowed` when `input` contains no such character, so the
/// common case does not allocate; otherwise returns a `Cow::Owned` copy with
/// the offending characters dropped and all other characters kept in order.
fn strip_pua(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    // Single definition of "is private-use" shared by the check and the filter.
    let is_pua = |c: char| matches!(c, '\u{E000}'..='\u{F8FF}');

    if !input.contains(is_pua) {
        return Cow::Borrowed(input);
    }

    Cow::Owned(input.chars().filter(|&c| !is_pua(c)).collect())
}
122
123pub fn transpile(
141 input: &str,
142 format: InputFormat,
143 fidelity: FidelityLevel,
144 budget: Option<usize>,
145) -> Result<String, TranspileError> {
146 if input.len() > MAX_INPUT_BYTES {
147 return Err(TranspileError::InputTooLarge(input.len()));
148 }
149 let input = strip_pua(input);
150 let input = input.as_ref();
151
152 let mut doc = parser::parse(input, format, fidelity, budget).map_err(TranspileError::Parse)?;
154
155 if let Some(b) = budget {
157 let compressor = AdaptiveCompressor::new();
158 let cfg = CompressionConfig {
159 budget: b,
160 current_tokens: stream::estimate_tokens(input),
164 fidelity,
165 };
166 doc.nodes = compressor.compress(std::mem::take(&mut doc.nodes), &cfg);
167 }
168
169 let mut dict = SymbolDict::new();
171 let output = render_full(&doc, &mut dict);
172 Ok(output)
173}
174
175pub async fn transpile_stream(
191 input: &str,
192 format: InputFormat,
193 fidelity: FidelityLevel,
194 budget: usize,
195) -> std::pin::Pin<Box<dyn futures::Stream<Item = Result<TranspileChunk, StreamError>> + Send>> {
196 if input.len() > MAX_INPUT_BYTES {
197 return Box::pin(futures::stream::once(futures::future::ready(Err(
198 StreamError::InputTooLarge(input.len()),
199 ))));
200 }
201 let sanitized = strip_pua(input);
202 let input_ref = sanitized.as_ref();
203
204 let doc = match parser::parse(input_ref, format, fidelity, Some(budget)) {
205 Ok(doc) => doc,
206 Err(msg) => {
207 return Box::pin(futures::stream::once(futures::future::ready(Err(
210 StreamError::Parse(msg),
211 ))));
212 }
213 };
214
215 let transpiler = StreamingTranspiler::new(budget, fidelity);
216 Box::pin(transpiler.transpile(doc))
217}
218
219pub fn token_count(text: &str) -> usize {
224 stream::estimate_tokens(text)
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// Korean-language Markdown fixture with headings, a paragraph, a bullet
    /// list and a table.
    const SAMPLE_MD: &str = r#"
# 소프트웨어 라이선스 계약

## 계약 당사자

본 계약은 갑(라이선서)과 을(라이선시) 사이에 체결됩니다.

## 주요 조항

- 소스 코드 배포 금지
- 역설계 금지
- 연간 라이선스 비용: 1,000,000원

| 항목 | 금액 |
|------|------|
| 기본료 | 800,000원 |
| 유지보수 | 200,000원 |
"#;

    /// Markdown input with a budget must render inside a `<B>…</B>` envelope.
    #[test]
    fn transpile_markdown_produces_bridge_format() {
        let result = transpile(
            SAMPLE_MD,
            InputFormat::Markdown,
            FidelityLevel::Semantic,
            Some(2048),
        );
        let output = match result {
            Ok(text) => text,
            Err(err) => panic!("transpile should succeed: {:?}", err),
        };
        assert!(output.contains("<B>"), "output must contain <B> tag");
        assert!(
            output.contains("</B>"),
            "output must contain </B> closing tag"
        );
    }

    /// Lossless mode without a budget must keep the text verbatim.
    #[test]
    fn transpile_lossless_preserves_content() {
        let text = "중요한 법적 내용입니다.";
        let output = transpile(text, InputFormat::PlainText, FidelityLevel::Lossless, None)
            .unwrap();
        assert!(output.contains(text));
    }

    #[test]
    fn token_count_is_positive() {
        assert!(token_count("hello world") > 0);
    }

    /// Both boundary code points of the stripped range are injected and both
    /// must be absent from the output.
    #[test]
    fn pua_chars_stripped_from_input() {
        let input_with_pua = "hello \u{E000}world\u{F8FF}";
        let output = transpile(
            input_with_pua,
            InputFormat::PlainText,
            FidelityLevel::Lossless,
            None,
        )
        .unwrap();
        assert!(
            !output.contains('\u{E000}'),
            "PUA characters must not appear in output"
        );
        assert!(
            !output.contains('\u{F8FF}'),
            "PUA characters must not appear in output"
        );
        assert!(output.contains("hello"), "plain text must be preserved");
        assert!(
            output.contains("world"),
            "adjacent text after PUA removal must be preserved"
        );
    }

    #[tokio::test]
    async fn stream_error_variant_is_send_and_stream_works() {
        use futures::StreamExt;

        // Compile-time check: the error type can cross thread boundaries.
        fn assert_send<T: Send>(_: T) {}
        assert_send(StreamError::Parse("test".to_string()));

        let mut chunks = transpile_stream(
            SAMPLE_MD,
            InputFormat::Markdown,
            FidelityLevel::Semantic,
            8192,
        )
        .await;
        let first = chunks.next().await.expect("at least one chunk must exist");
        assert!(
            first.is_ok(),
            "valid input must yield an Ok chunk: {:?}",
            first.err()
        );
    }

    #[test]
    fn transpile_rejects_oversized_input() {
        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
        let result = transpile(&huge, InputFormat::PlainText, FidelityLevel::Lossless, None);
        assert!(
            matches!(result, Err(TranspileError::InputTooLarge(_))),
            "expected InputTooLarge, got: {:?}",
            result
        );
    }

    #[tokio::test]
    async fn stream_rejects_oversized_input() {
        use futures::StreamExt;

        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
        let mut chunks =
            transpile_stream(&huge, InputFormat::PlainText, FidelityLevel::Lossless, 0).await;
        let first = chunks.next().await.expect("must yield an error item");
        assert!(
            matches!(first, Err(StreamError::InputTooLarge(_))),
            "oversized stream input must yield InputTooLarge, got: {:?}",
            first
        );
    }

    /// The PUA code point is supplied as a numeric character reference, so it
    /// is invisible to the up-front `strip_pua` pass and only appears once the
    /// HTML is decoded; it must still be absent from the output.
    #[test]
    fn html_pua_entity_stripped_after_tag_removal() {
        let html = "<p>hello &#xE000; world</p>";
        let output = transpile(html, InputFormat::Html, FidelityLevel::Lossless, None).unwrap();
        assert!(
            !output.contains('\u{E000}'),
            "PUA from HTML entity decoding must be stripped"
        );
        assert!(output.contains("hello"), "surrounding text must be preserved");
    }
}