use regex::Regex;
use std::collections::HashMap;
use ucm_core::{BlockId, Content, Document};
/// Renders a block's [`Content`] as a plain string for inclusion in a prompt.
///
/// Text, code and math variants yield a copy of their raw payload, JSON is
/// rendered through `to_string`, and the remaining variants are reduced to a
/// short human-readable summary (dimensions, media type, MIME type or child
/// count) rather than their full data.
fn content_to_string(content: &Content) -> String {
    match content {
        // Variants that carry their text verbatim.
        Content::Text(text) => text.text.clone(),
        Content::Code(code) => code.source.clone(),
        Content::Math(math) => math.expression.clone(),
        Content::Json { value, .. } => value.to_string(),

        // Variants that are only summarised.
        Content::Table(table) => {
            let (column_count, row_count) = (table.columns.len(), table.rows.len());
            format!("Table {}x{}", column_count, row_count)
        }
        Content::Media(media) => format!("Media: {:?}", media.media_type),
        Content::Binary { mime_type, .. } => format!("Binary: {}", mime_type),
        Content::Composite { layout, children } => {
            let child_count = children.len();
            format!("{:?} ({} children)", layout, child_count)
        }
    }
}
/// Bidirectional mapping between full [`BlockId`]s and compact numeric IDs.
///
/// Short IDs are handed out sequentially by [`IdMapper::register`]; a mapper
/// created with [`IdMapper::new`] starts numbering at 1.
#[derive(Debug, Clone)]
pub struct IdMapper {
/// Full block ID -> short numeric ID.
to_short: HashMap<BlockId, u32>,
/// Short numeric ID -> full block ID (the inverse of `to_short`).
to_long: HashMap<u32, BlockId>,
/// Short ID that the next newly registered block will receive.
next_id: u32,
}
impl IdMapper {
/// Creates an empty mapper whose first assigned short ID will be 1.
pub fn new() -> Self {
    let to_short = HashMap::new();
    let to_long = HashMap::new();
    Self {
        to_short,
        to_long,
        next_id: 1,
    }
}
/// Builds a mapper that covers every block of `doc`.
///
/// The root block is registered first, so it always receives short ID 1.
/// The remaining blocks are registered in lexicographic order of their
/// string form, which makes the assignment independent of the iteration
/// order of `doc.blocks`.
pub fn from_document(doc: &Document) -> Self {
    let mut mapper = Self::new();
    mapper.register(&doc.root);

    let mut block_ids: Vec<_> = doc.blocks.keys().collect();
    // Rendering an ID allocates a `String`; cache each key once instead of
    // re-rendering it on every comparison the sort performs.
    block_ids.sort_by_cached_key(|id| id.to_string());

    for block_id in block_ids {
        // `register` returns the existing short ID for an already known
        // block, so encountering the root again here is a no-op.
        mapper.register(block_id);
    }
    mapper
}
/// Returns the short ID for `block_id`, assigning the next free one if the
/// block has not been seen before.
///
/// Registering the same block repeatedly always yields the same short ID.
pub fn register(&mut self, block_id: &BlockId) -> u32 {
    match self.to_short.get(block_id).copied() {
        Some(existing) => existing,
        None => {
            let assigned = self.next_id;
            self.next_id += 1;
            // Keep both directions of the mapping in sync.
            self.to_short.insert(*block_id, assigned);
            self.to_long.insert(assigned, *block_id);
            assigned
        }
    }
}
/// Looks up the short ID assigned to `block_id`.
///
/// Returns `None` when the block has never been registered.
pub fn to_short_id(&self, block_id: &BlockId) -> Option<u32> {
    let short_id = self.to_short.get(block_id)?;
    Some(*short_id)
}
/// Looks up the full block ID that `short_id` stands for.
///
/// Returns `None` when no block has been registered under that short ID.
pub fn to_block_id(&self, short_id: u32) -> Option<&BlockId> {
self.to_long.get(&short_id)
}
/// Replaces every occurrence of a registered full block ID in `text` with
/// its short numeric ID.
///
/// Replacements are applied longest ID string first (ties broken
/// lexicographically). This matches the ordering used by `shorten_ucl` and
/// guarantees that an ID whose string form is contained in another ID's
/// string form cannot clobber the longer one. It also makes the result
/// independent of `HashMap` iteration order.
pub fn shorten_text(&self, text: &str) -> String {
    // Render every ID pair once up front; `to_string` allocates.
    let mut replacements: Vec<(String, String)> = self
        .to_short
        .iter()
        .map(|(block_id, short_id)| (block_id.to_string(), short_id.to_string()))
        .collect();
    replacements.sort_by(|a, b| b.0.len().cmp(&a.0.len()).then_with(|| a.0.cmp(&b.0)));

    let mut result = text.to_string();
    for (long_id, short_id) in &replacements {
        result = result.replace(long_id.as_str(), short_id.as_str());
    }
    result
}
pub fn expand_text(&self, text: &str) -> String {
let mut result = text.to_string();
let mut ids: Vec<_> = self.to_long.iter().collect();
ids.sort_by(|a, b| b.0.cmp(a.0));
for (short_id, block_id) in ids {
let short_str = short_id.to_string();
let long_str = block_id.to_string();
let patterns = [
(
format!("block {}", short_str),
format!("block {}", long_str),
),
(format!("id {}", short_str), format!("id {}", long_str)),
(format!("#{}", short_str), format!("#{}", long_str)),
(format!("[{}]", short_str), format!("[{}]", long_str)),
];
for (from, to) in patterns {
result = result.replace(&from, &to);
}
}
result
}
/// Replaces every occurrence of a registered full block ID in a UCL
/// command string with its short numeric ID.
///
/// Longer ID strings are substituted first so that an ID whose string form
/// is contained in a longer one cannot corrupt it.
pub fn shorten_ucl(&self, ucl: &str) -> String {
    // Render each ID pair exactly once; the previous comparator re-rendered
    // both operands on every comparison and the loop rendered them again.
    let mut replacements: Vec<(String, String)> = self
        .to_short
        .iter()
        .map(|(block_id, short_id)| (block_id.to_string(), short_id.to_string()))
        .collect();
    replacements.sort_by(|a, b| b.0.len().cmp(&a.0.len()));

    let mut result = ucl.to_string();
    for (long_id, short_id) in &replacements {
        result = result.replace(long_id.as_str(), short_id.as_str());
    }
    result
}
pub fn expand_ucl(&self, ucl: &str) -> String {
let ucl_id_pattern = Regex::new(
r"(?x)
(?P<prefix>
\b(?:EDIT|APPEND|MOVE|DELETE|LINK|UNLINK|TO|BEFORE|AFTER)\s+
)
(?P<id>\d+)
",
)
.unwrap();
let mut result = ucl.to_string();
let replacements: Vec<_> = ucl_id_pattern
.captures_iter(&result.clone())
.filter_map(|cap| {
let id_str = cap.name("id")?.as_str();
let short_id: u32 = id_str.parse().ok()?;
let block_id = self.to_long.get(&short_id)?;
let full_match = cap.get(0)?;
let prefix = cap.name("prefix")?.as_str();
Some((
full_match.as_str().to_string(),
format!("{}{}", prefix, block_id),
))
})
.collect();
for (from, to) in replacements.iter().rev() {
result = result.replacen(from, to, 1);
}
let link_target_pattern = Regex::new(
r"(?x)
(?P<prefix>
\b(?:references|elaborates|summarizes|contradicts|supports|requires|parent_of)\s+
)
(?P<id>\d+)
",
)
.unwrap();
let replacements: Vec<_> = link_target_pattern
.captures_iter(&result.clone())
.filter_map(|cap| {
let id_str = cap.name("id")?.as_str();
let short_id: u32 = id_str.parse().ok()?;
let block_id = self.to_long.get(&short_id)?;
let full_match = cap.get(0)?;
let prefix = cap.name("prefix")?.as_str();
Some((
full_match.as_str().to_string(),
format!("{}{}", prefix, block_id),
))
})
.collect();
for (from, to) in replacements.iter().rev() {
result = result.replacen(from, to, 1);
}
result
}
/// Estimates how many tokens are saved by shortening the IDs in `text`.
///
/// Token counts are approximated as byte length divided by 4. Returns
/// `(original_tokens, shortened_tokens, savings)`, where `savings` never
/// goes below zero.
pub fn estimate_token_savings(&self, text: &str) -> (usize, usize, usize) {
    // Rough heuristic: one token per four bytes.
    let approx_tokens = |s: &str| s.len() / 4;

    let before = approx_tokens(text);
    let after = approx_tokens(&self.shorten_text(text));
    (before, after, before.saturating_sub(after))
}
pub fn document_to_prompt(&self, doc: &Document) -> String {
let mut lines = Vec::new();
lines.push("Document structure:".to_string());
let mut all_blocks = Vec::new();
let mut queue = std::collections::VecDeque::new();
queue.push_back(doc.root);
while let Some(block_id) = queue.pop_front() {
all_blocks.push(block_id);
if let Some(children) = doc.structure.get(&block_id) {
for child in children {
queue.push_back(*child);
}
}
}
for block_id in &all_blocks {
let short_id = self
.to_short
.get(block_id)
.map(|id| id.to_string())
.unwrap_or_else(|| "?".to_string());
let children = doc.children(block_id);
if children.is_empty() {
lines.push(format!("{}:", short_id));
} else {
let child_ids: Vec<String> = children
.iter()
.map(|c| {
self.to_short
.get(c)
.map(|id| id.to_string())
.unwrap_or_else(|| "?".to_string())
})
.collect();
lines.push(format!("{}: {}", short_id, child_ids.join(" ")));
}
}
lines.push(String::new());
lines.push("Blocks:".to_string());
for block_id in &all_blocks {
if let Some(block) = doc.get_block(block_id) {
let short_id = self
.to_short
.get(block_id)
.map(|id| id.to_string())
.unwrap_or_else(|| "?".to_string());
let content_type = block.content.type_tag();
let content_str = content_to_string(&block.content);
let escaped_content = content_str
.replace('\\', "\\\\")
.replace('"', "\\\"")
.replace('\n', "\\n");
lines.push(format!(
"{} type={} content=\"{}\"",
short_id, content_type, escaped_content
));
}
}
lines.join("\n")
}
/// Renders the full mapping as text: an `ID Mapping:` header followed by
/// one `<short id> = <full id>` line per registered block, ordered by
/// ascending short ID.
pub fn mapping_table(&self) -> String {
    let mut pairs: Vec<_> = self.to_short.iter().collect();
    pairs.sort_by_key(|pair| *pair.1);

    std::iter::once("ID Mapping:".to_string())
        .chain(
            pairs
                .into_iter()
                .map(|(block_id, short_id)| format!(" {} = {}", short_id, block_id)),
        )
        .collect::<Vec<_>>()
        .join("\n")
}
/// Returns the number of blocks that have been registered.
pub fn len(&self) -> usize {
self.to_short.len()
}
/// Returns `true` when no block has been registered yet.
pub fn is_empty(&self) -> bool {
self.to_short.is_empty()
}
}
/// The default mapper is the empty mapper produced by [`IdMapper::new`].
impl Default for IdMapper {
// Delegates to `new` so that `next_id` starts at 1 rather than at the
// `u32` default of 0 that a derived implementation would use.
fn default() -> Self {
Self::new()
}
}
// Unit tests for `IdMapper`: registration, shortening/expanding of text and
// UCL commands, token-savings estimation and prompt rendering.
#[cfg(test)]
mod tests {
use super::*;
use ucm_core::{Block, Content};
// The root always gets short ID 1 and every other block gets some short ID.
#[test]
fn test_id_mapper() {
let mut doc = Document::create();
let root = doc.root;
let block1 = Block::new(Content::text("Hello"), Some("heading1"));
let id1 = doc.add_block(block1, &root).unwrap();
let block2 = Block::new(Content::text("World"), Some("paragraph"));
let id2 = doc.add_block(block2, &id1).unwrap();
let mapper = IdMapper::from_document(&doc);
assert_eq!(mapper.to_short_id(&root), Some(1));
assert!(mapper.to_short_id(&id1).is_some());
assert!(mapper.to_short_id(&id2).is_some());
assert_eq!(mapper.to_block_id(1), Some(&root));
}
// A full ID embedded in free-form text is replaced by its short ID.
#[test]
fn test_shorten_text() {
let mut mapper = IdMapper::new();
let block_id = BlockId::from_hex("aabbccdd11223344").unwrap();
mapper.register(&block_id);
let text = format!("Edit block {}", block_id);
let shortened = mapper.shorten_text(&text);
assert_eq!(shortened, "Edit block 1");
}
// Full IDs in UCL commands are replaced by short IDs in registration order.
#[test]
fn test_shorten_ucl() {
let mut mapper = IdMapper::new();
let block1 = BlockId::from_hex("aabbccdd11223344").unwrap();
let block2 = BlockId::from_hex("11223344aabbccdd").unwrap();
mapper.register(&block1);
mapper.register(&block2);
let ucl = format!("EDIT {} SET text = \"hello\"", block1);
let shortened = mapper.shorten_ucl(&ucl);
assert_eq!(shortened, "EDIT 1 SET text = \"hello\"");
let ucl = format!("MOVE {} TO {}", block1, block2);
let shortened = mapper.shorten_ucl(&ucl);
assert_eq!(shortened, "MOVE 1 TO 2");
}
// Short IDs following UCL keywords are expanded back to full IDs.
#[test]
fn test_expand_ucl() {
let mut mapper = IdMapper::new();
let block1 = BlockId::from_hex("aabbccdd11223344").unwrap();
let block2 = BlockId::from_hex("11223344aabbccdd").unwrap();
mapper.register(&block1);
mapper.register(&block2);
let ucl = "EDIT 1 SET text = \"hello\"";
let expanded = mapper.expand_ucl(ucl);
assert!(expanded.contains(&block1.to_string()));
let ucl = "MOVE 1 TO 2";
let expanded = mapper.expand_ucl(ucl);
assert!(expanded.contains(&block1.to_string()));
assert!(expanded.contains(&block2.to_string()));
}
// Shortening followed by expanding must reproduce the original command,
// covering both a command keyword (LINK) and a link type (references).
#[test]
fn test_ucl_roundtrip() {
let mut mapper = IdMapper::new();
let block1 = BlockId::from_hex("aabbccdd11223344").unwrap();
let block2 = BlockId::from_hex("11223344aabbccdd").unwrap();
mapper.register(&block1);
mapper.register(&block2);
let original = format!("LINK {} references {}", block1, block2);
let shortened = mapper.shorten_ucl(&original);
let expanded = mapper.expand_ucl(&shortened);
assert_eq!(original, expanded);
}
// A prompt mentioning every registered full ID must shrink once shortened.
#[test]
fn test_token_savings() {
let mut doc = Document::create();
let root = doc.root;
for i in 0..10 {
let block = Block::new(Content::text(format!("Block {}", i)), None);
doc.add_block(block, &root).unwrap();
}
let mapper = IdMapper::from_document(&doc);
let mut prompt = String::new();
for block_id in mapper.to_short.keys() {
prompt.push_str(&format!("Block {} has content. ", block_id));
}
let (original, shortened, savings) = mapper.estimate_token_savings(&prompt);
assert!(savings > 0, "Should have token savings");
assert!(shortened < original, "Shortened should be smaller");
}
// The rendered prompt contains both section headers and the per-block fields.
#[test]
fn test_document_to_prompt_format() {
let mut doc = Document::create();
let root = doc.root;
let block1 = Block::new(Content::text("Title"), Some("heading1"));
let id1 = doc.add_block(block1, &root).unwrap();
let block2 = Block::new(Content::text("Paragraph"), Some("paragraph"));
doc.add_block(block2, &id1).unwrap();
let mapper = IdMapper::from_document(&doc);
let prompt = mapper.document_to_prompt(&doc);
assert!(prompt.contains("Document structure:"));
assert!(prompt.contains("Blocks:"));
assert!(prompt.contains("type="));
assert!(prompt.contains("content=\""));
}
}