pub(crate) mod compressor;
pub(crate) mod ir;
pub(crate) mod renderer;
pub(crate) mod stream;
pub(crate) mod symbol;
mod parser;
pub use compressor::{AdaptiveCompressor, CompressionConfig, CompressionStage};
pub use ir::{DocNode, FidelityLevel, IRDocument};
pub use renderer::{build_yaml_header, linearize_table, render_full, render_node};
pub use stream::{StreamError, StreamingTranspiler, TranspileChunk};
pub use symbol::SymbolDict;
/// Source formats accepted by [`transpile`] and [`transpile_stream`].
///
/// The value is handed to the parser unchanged; this file does not interpret
/// it any further.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InputFormat {
    /// Unstructured plain text.
    PlainText,
    /// Markdown source.
    Markdown,
    /// HTML source.
    Html,
}
#[derive(Debug, thiserror::Error)]
pub enum TranspileError {
#[error("parse failed: {0}")]
Parse(String),
#[error("symbol table overflow: {0}")]
SymbolOverflow(#[from] symbol::SymbolOverflowError),
#[error("stream error: {0}")]
Stream(#[from] stream::StreamError),
#[error("compression attempted in Lossless mode")]
LosslessModeViolation,
#[error("input exceeds maximum allowed size of {0} bytes")]
InputTooLarge(usize),
}
/// Largest input, in bytes, accepted by [`transpile`] and
/// [`transpile_stream`]: 10 MiB (`10 * 2^20` = 10 485 760).
pub const MAX_INPUT_BYTES: usize = 10 << 20;
/// Removes every character in the range U+E000..=U+F8FF (the Basic
/// Multilingual Plane Private Use Area) from `input`.
///
/// Returns `Cow::Borrowed(input)` when no such character is present, so clean
/// input costs no allocation; otherwise returns a `Cow::Owned` copy with those
/// characters dropped and all other characters kept in their original order.
/// Code points outside that range are never touched.
fn strip_pua(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    /// True for characters inside U+E000..=U+F8FF.
    fn is_pua(ch: char) -> bool {
        matches!(ch, '\u{E000}'..='\u{F8FF}')
    }

    match input.find(is_pua) {
        // Nothing to remove: hand the caller's slice straight back.
        None => Cow::Borrowed(input),
        Some(first_hit) => {
            // Everything before the first hit is already clean; only the tail
            // needs to be filtered character by character.
            let (clean_prefix, tail) = input.split_at(first_hit);
            let mut cleaned = String::with_capacity(input.len());
            cleaned.push_str(clean_prefix);
            cleaned.extend(tail.chars().filter(|&ch| !is_pua(ch)));
            Cow::Owned(cleaned)
        }
    }
}
/// Transpiles `input` in one shot and returns the rendered output.
///
/// Steps, in order:
/// 1. reject input longer than [`MAX_INPUT_BYTES`];
/// 2. drop characters in U+E000..=U+F8FF via `strip_pua`;
/// 3. parse the sanitized text with the given `format`, `fidelity` and `budget`;
/// 4. if `budget` is `Some`, run [`AdaptiveCompressor`] over the parsed nodes;
/// 5. render the document with a fresh [`SymbolDict`].
///
/// # Errors
///
/// * [`TranspileError::InputTooLarge`] when `input.len()` exceeds
///   [`MAX_INPUT_BYTES`]; the payload is `input.len()`.
/// * [`TranspileError::Parse`] when the parser returns an error message.
pub fn transpile(
    input: &str,
    format: InputFormat,
    fidelity: FidelityLevel,
    budget: Option<usize>,
) -> Result<String, TranspileError> {
    // The size guard runs on the raw input, before any sanitizing work.
    let raw_len = input.len();
    if raw_len > MAX_INPUT_BYTES {
        return Err(TranspileError::InputTooLarge(raw_len));
    }

    let sanitized = strip_pua(input);
    let text: &str = sanitized.as_ref();

    let mut doc = match parser::parse(text, format, fidelity, budget) {
        Ok(parsed) => parsed,
        Err(reason) => return Err(TranspileError::Parse(reason)),
    };

    // Compression is only attempted when the caller supplied a budget.
    // NOTE(review): `fidelity` is forwarded as-is, including Lossless; this
    // function itself never returns `LosslessModeViolation`.
    if let Some(token_budget) = budget {
        let compressor = AdaptiveCompressor::new();
        let cfg = CompressionConfig {
            budget: token_budget,
            // The estimate is taken from the sanitized source text.
            current_tokens: stream::estimate_tokens(text),
            fidelity,
        };
        // Move the nodes out so the compressor can consume them by value.
        let nodes = std::mem::take(&mut doc.nodes);
        doc.nodes = compressor.compress(nodes, &cfg);
    }

    let mut dict = SymbolDict::new();
    Ok(render_full(&doc, &mut dict))
}
/// Streaming counterpart of [`transpile`]: returns a boxed stream of
/// [`TranspileChunk`] results instead of a single string.
///
/// Failures detected up front are not returned directly; they are delivered as
/// the single item of a one-element stream:
///
/// * `StreamError::InputTooLarge(input.len())` when `input.len()` exceeds
///   [`MAX_INPUT_BYTES`];
/// * `StreamError::Parse(msg)` when the parser returns an error message.
///
/// Otherwise the sanitized input (characters in U+E000..=U+F8FF removed) is
/// parsed with `Some(budget)` and the document is handed to a
/// [`StreamingTranspiler`] built from `budget` and `fidelity`.
///
/// The function body contains no `.await`; all of the above runs when the
/// returned future is polled.
pub async fn transpile_stream(
    input: &str,
    format: InputFormat,
    fidelity: FidelityLevel,
    budget: usize,
) -> std::pin::Pin<Box<dyn futures::Stream<Item = Result<TranspileChunk, StreamError>> + Send>> {
    type ChunkStream =
        std::pin::Pin<Box<dyn futures::Stream<Item = Result<TranspileChunk, StreamError>> + Send>>;

    /// Builds a stream that yields exactly one `Err(err)` item.
    fn fail_with(err: StreamError) -> ChunkStream {
        Box::pin(futures::stream::once(futures::future::ready(
            Err::<TranspileChunk, StreamError>(err),
        )))
    }

    // The size guard runs on the raw input, before any sanitizing work.
    let raw_len = input.len();
    if raw_len > MAX_INPUT_BYTES {
        return fail_with(StreamError::InputTooLarge(raw_len));
    }

    let cleaned = strip_pua(input);
    let text: &str = cleaned.as_ref();

    match parser::parse(text, format, fidelity, Some(budget)) {
        Err(reason) => fail_with(StreamError::Parse(reason)),
        Ok(doc) => {
            let transpiler = StreamingTranspiler::new(budget, fidelity);
            let chunks: ChunkStream = Box::pin(transpiler.transpile(doc));
            chunks
        }
    }
}
pub fn token_count(text: &str) -> usize {
stream::estimate_tokens(text)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Korean-language Markdown fixture containing headings, a paragraph, a
    /// bullet list and a table.
    const SAMPLE_MD: &str = r#"
# 소프트웨어 라이선스 계약
## 계약 당사자
본 계약은 갑(라이선서)과 을(라이선시) 사이에 체결됩니다.
## 주요 조항
- 소스 코드 배포 금지
- 역설계 금지
- 연간 라이선스 비용: 1,000,000원
| 항목 | 금액 |
|------|------|
| 기본료 | 800,000원 |
| 유지보수 | 200,000원 |
"#;

    /// Markdown input with a budget must render inside a `<B>…</B>` envelope.
    #[test]
    fn transpile_markdown_produces_bridge_format() {
        let result = transpile(
            SAMPLE_MD,
            InputFormat::Markdown,
            FidelityLevel::Semantic,
            Some(2048),
        );
        assert!(
            result.is_ok(),
            "transpile should succeed: {:?}",
            result.err()
        );
        let output = result.unwrap();
        assert!(output.contains("<B>"), "output must contain <B> tag");
        assert!(
            output.contains("</B>"),
            "output must contain </B> closing tag"
        );
    }

    /// Lossless mode without a budget must keep the source sentence verbatim.
    #[test]
    fn transpile_lossless_preserves_content() {
        let result = transpile(
            "중요한 법적 내용입니다.",
            InputFormat::PlainText,
            FidelityLevel::Lossless,
            None,
        );
        let output = result.unwrap();
        assert!(output.contains("중요한 법적 내용입니다."));
    }

    #[test]
    fn token_count_is_positive() {
        assert!(token_count("hello world") > 0);
    }

    /// Both ends of the stripped range (U+E000 and U+F8FF) are injected, so
    /// both must be absent from the output.
    #[test]
    fn pua_chars_stripped_from_input() {
        let input_with_pua = "hello \u{E000}world\u{F8FF}";
        let output = transpile(
            input_with_pua,
            InputFormat::PlainText,
            FidelityLevel::Lossless,
            None,
        )
        .unwrap();
        assert!(
            !output.contains('\u{E000}'),
            "PUA characters must not appear in output"
        );
        assert!(
            !output.contains('\u{F8FF}'),
            "upper-bound PUA character must not appear in output"
        );
        assert!(output.contains("hello"), "plain text must be preserved");
        assert!(
            output.contains("world"),
            "adjacent text after PUA removal must be preserved"
        );
    }

    /// Checks that `StreamError` is `Send` and that a valid document yields an
    /// `Ok` first chunk.
    #[tokio::test]
    async fn stream_error_variant_is_send_and_stream_works() {
        use futures::StreamExt;

        fn _assert_send<T: Send>(_: T) {}
        _assert_send(StreamError::Parse("test".to_string()));

        let mut stream = transpile_stream(
            SAMPLE_MD,
            InputFormat::Markdown,
            FidelityLevel::Semantic,
            8192,
        )
        .await;
        let first = stream.next().await.expect("at least one chunk must exist");
        assert!(
            first.is_ok(),
            "valid input must yield an Ok chunk: {:?}",
            first.err()
        );
    }

    /// One byte over the limit must be rejected by the one-shot API.
    #[test]
    fn transpile_rejects_oversized_input() {
        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
        let result = transpile(&huge, InputFormat::PlainText, FidelityLevel::Lossless, None);
        assert!(
            matches!(result, Err(TranspileError::InputTooLarge(_))),
            "expected InputTooLarge, got: {:?}",
            result
        );
    }

    /// One byte over the limit must surface as the first item of the stream.
    #[tokio::test]
    async fn stream_rejects_oversized_input() {
        use futures::StreamExt;

        let huge = "a".repeat(MAX_INPUT_BYTES + 1);
        let mut stream =
            transpile_stream(&huge, InputFormat::PlainText, FidelityLevel::Lossless, 0).await;
        let first = stream.next().await.expect("must yield an error item");
        assert!(
            matches!(first, Err(stream::StreamError::InputTooLarge(_))),
            "oversized stream input must yield InputTooLarge, got: {:?}",
            first
        );
    }

    /// A PUA code point written as a numeric character reference is not
    /// present as a raw character in the source text, so it can only show up
    /// once entities are decoded. The input must therefore contain the entity
    /// itself; without it this test would pass vacuously.
    #[test]
    fn html_pua_entity_stripped_after_tag_removal() {
        let html = "<p>hello &#xE000; world</p>";
        let output = transpile(html, InputFormat::Html, FidelityLevel::Lossless, None).unwrap();
        assert!(
            !output.contains('\u{E000}'),
            "PUA from HTML entity decoding must be stripped"
        );
        assert!(output.contains("hello"), "surrounding text must be preserved");
        assert!(
            output.contains("world"),
            "text after the stripped entity must be preserved"
        );
    }
}