Skip to main content

infiniloom_engine/output/
toon.rs

1//! TOON (Token-Oriented Object Notation) output formatter
2//!
3//! TOON is a compact, human-readable format designed for LLM context.
4//! It provides ~40% fewer tokens than JSON while maintaining readability.
5//!
6//! Supports both in-memory (`format()`) and streaming (`format_to_writer()`) modes.
7//!
8//! Format specification: https://github.com/toon-format/toon
9
10use crate::output::{Formatter, StreamingFormatter};
11use crate::repomap::RepoMap;
12use crate::types::{Repository, TokenizerModel};
13use std::io::{self, Write};
14
/// TOON formatter - most token-efficient format for LLMs
///
/// Built with [`ToonFormatter::new`] (or `Default`) and adjusted through the
/// `with_*` builder methods. All fields are private configuration flags.
pub struct ToonFormatter {
    /// Include line numbers in code
    ///
    /// When set, each emitted content line is prefixed with `N:`.
    include_line_numbers: bool,
    /// Use tabular format for file metadata
    // NOTE(review): no method in this file reads this flag; the section
    // writers always emit the tabular `name[N]{cols}:` form. Confirm whether
    // it is consumed elsewhere or is vestigial.
    use_tabular: bool,
    /// Include file index/summary section
    show_file_index: bool,
    /// Token model for counts in output
    ///
    /// Selects which entry of the per-model token counts is printed.
    token_model: TokenizerModel,
}
26
27impl ToonFormatter {
28    /// Create a new TOON formatter with default settings
29    pub fn new() -> Self {
30        Self {
31            include_line_numbers: true,
32            use_tabular: true,
33            show_file_index: true,
34            token_model: TokenizerModel::Claude,
35        }
36    }
37
38    /// Set line numbers option
39    pub fn with_line_numbers(mut self, enabled: bool) -> Self {
40        self.include_line_numbers = enabled;
41        self
42    }
43
44    /// Set tabular format option
45    pub fn with_tabular(mut self, enabled: bool) -> Self {
46        self.use_tabular = enabled;
47        self
48    }
49
50    /// Set file index/summary option
51    pub fn with_file_index(mut self, enabled: bool) -> Self {
52        self.show_file_index = enabled;
53        self
54    }
55
56    /// Set token model for token counts in output
57    pub fn with_model(mut self, model: TokenizerModel) -> Self {
58        self.token_model = model;
59        self
60    }
61
62    /// Estimate output size for pre-allocation
63    fn estimate_output_size(repo: &Repository) -> usize {
64        // Base overhead for headers and metadata
65        let base = 500;
66        // Estimate ~300 bytes per file for content + metadata
67        let files = repo.files.len() * 300;
68        // Estimate content size
69        let content: usize = repo
70            .files
71            .iter()
72            .filter_map(|f| f.content.as_ref())
73            .map(|c| c.len())
74            .sum();
75        base + files + content
76    }
77
78    // =========================================================================
79    // Streaming methods (write to impl std::io::Write)
80    // =========================================================================
81
82    fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
83        writeln!(w, "metadata:")?;
84        writeln!(w, "  name: {}", repo.name)?;
85        writeln!(w, "  files: {}", repo.metadata.total_files)?;
86        writeln!(w, "  lines: {}", repo.metadata.total_lines)?;
87        writeln!(w, "  tokens: {}", repo.metadata.total_tokens.get(self.token_model))?;
88
89        if let Some(ref desc) = repo.metadata.description {
90            writeln!(w, "  description: {}", escape_toon(desc))?;
91        }
92        if let Some(ref branch) = repo.metadata.branch {
93            writeln!(w, "  branch: {}", branch)?;
94        }
95        if let Some(ref commit) = repo.metadata.commit {
96            writeln!(w, "  commit: {}", commit)?;
97        }
98        writeln!(w)
99    }
100
101    fn stream_languages<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
102        if repo.metadata.languages.is_empty() {
103            return Ok(());
104        }
105
106        let count = repo.metadata.languages.len();
107        writeln!(w, "languages[{}]{{name,files,percentage}}:", count)?;
108        for lang in &repo.metadata.languages {
109            writeln!(w, "  {},{},{:.1}", lang.language, lang.files, lang.percentage)?;
110        }
111        writeln!(w)
112    }
113
114    fn stream_directory_structure<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
115        if let Some(ref structure) = repo.metadata.directory_structure {
116            writeln!(w, "directory_structure: |")?;
117            for line in structure.lines() {
118                writeln!(w, "  {}", line)?;
119            }
120            writeln!(w)?;
121        }
122        Ok(())
123    }
124
125    fn stream_dependencies<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
126        if repo.metadata.external_dependencies.is_empty() {
127            return Ok(());
128        }
129
130        let count = repo.metadata.external_dependencies.len();
131        writeln!(w, "dependencies[{}]:", count)?;
132        for dep in &repo.metadata.external_dependencies {
133            writeln!(w, "  {}", escape_toon(dep))?;
134        }
135        writeln!(w)
136    }
137
138    fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
139        writeln!(w, "repository_map:")?;
140        writeln!(w, "  token_budget: {}", map.token_count)?;
141        writeln!(w, "  summary: |")?;
142        for line in map.summary.lines() {
143            writeln!(w, "    {}", line)?;
144        }
145
146        if !map.key_symbols.is_empty() {
147            let count = map.key_symbols.len();
148            writeln!(w, "  symbols[{}]{{name,type,file,line,rank,summary}}:", count)?;
149            for sym in &map.key_symbols {
150                writeln!(
151                    w,
152                    "    {},{},{},{},{},{}",
153                    escape_toon(&sym.name),
154                    escape_toon(&sym.kind),
155                    escape_toon(&sym.file),
156                    sym.line,
157                    sym.rank,
158                    escape_toon(sym.summary.as_deref().unwrap_or(""))
159                )?;
160            }
161        }
162
163        if !map.module_graph.nodes.is_empty() {
164            let count = map.module_graph.nodes.len();
165            writeln!(w, "  modules[{}]{{name,files,tokens}}:", count)?;
166            for module in &map.module_graph.nodes {
167                writeln!(
168                    w,
169                    "    {},{},{}",
170                    escape_toon(&module.name),
171                    module.files,
172                    module.tokens
173                )?;
174            }
175        }
176        writeln!(w)
177    }
178
179    fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
180        if repo.files.is_empty() {
181            return Ok(());
182        }
183
184        let count = repo.files.len();
185        writeln!(w, "file_index[{}]{{path,tokens,importance}}:", count)?;
186        for file in &repo.files {
187            let importance = if file.importance > 0.8 {
188                "critical"
189            } else if file.importance > 0.6 {
190                "high"
191            } else if file.importance > 0.3 {
192                "normal"
193            } else {
194                "low"
195            };
196            writeln!(
197                w,
198                "  {},{},{}",
199                escape_toon(&file.relative_path),
200                file.token_count.get(self.token_model),
201                importance
202            )?;
203        }
204        writeln!(w)
205    }
206
207    fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
208        writeln!(w, "files:")?;
209
210        for file in &repo.files {
211            if let Some(ref content) = file.content {
212                let lang = file.language.as_deref().unwrap_or("?");
213                writeln!(
214                    w,
215                    "- {}|{}|{}:",
216                    escape_toon(&file.relative_path),
217                    lang,
218                    file.token_count.get(self.token_model)
219                )?;
220
221                if self.include_line_numbers {
222                    // Check if content has embedded line numbers (format: "N:content")
223                    // This preserves original line numbers when content has been compressed
224                    let first_line = content.lines().next().unwrap_or("");
225                    let has_embedded_line_nums = first_line.contains(':')
226                        && first_line
227                            .split(':')
228                            .next()
229                            .is_some_and(|s| s.parse::<u32>().is_ok());
230
231                    if has_embedded_line_nums {
232                        // Content has embedded line numbers - parse and output
233                        for line in content.lines() {
234                            if let Some((num_str, rest)) = line.split_once(':') {
235                                if let Ok(line_num) = num_str.parse::<u32>() {
236                                    writeln!(w, "  {}:{}", line_num, rest)?;
237                                } else {
238                                    // Fallback for malformed lines
239                                    writeln!(w, "  {}", line)?;
240                                }
241                            } else {
242                                writeln!(w, "  {}", line)?;
243                            }
244                        }
245                    } else {
246                        // No embedded line numbers - use sequential (uncompressed content)
247                        for (i, line) in content.lines().enumerate() {
248                            writeln!(w, "  {}:{}", i + 1, line)?;
249                        }
250                    }
251                } else {
252                    for line in content.lines() {
253                        writeln!(w, "  {}", line)?;
254                    }
255                }
256            }
257        }
258        Ok(())
259    }
260}
261
impl Default for ToonFormatter {
    /// Returns the same configuration as [`ToonFormatter::new`].
    fn default() -> Self {
        Self::new()
    }
}
267
268impl Formatter for ToonFormatter {
269    fn format(&self, repo: &Repository, map: &RepoMap) -> String {
270        // Use streaming internally for consistency
271        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
272        // Vec<u8> write cannot fail, ignore result
273        drop(self.format_to_writer(repo, map, &mut output));
274        // Use lossy conversion to handle any edge cases with invalid UTF-8
275        String::from_utf8(output)
276            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
277    }
278
279    fn format_repo(&self, repo: &Repository) -> String {
280        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
281        // Vec<u8> write cannot fail, ignore result
282        drop(self.format_repo_to_writer(repo, &mut output));
283        // Use lossy conversion to handle any edge cases with invalid UTF-8
284        String::from_utf8(output)
285            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
286    }
287
288    fn name(&self) -> &'static str {
289        "toon"
290    }
291}
292
293impl StreamingFormatter for ToonFormatter {
294    fn format_to_writer<W: Write>(
295        &self,
296        repo: &Repository,
297        map: &RepoMap,
298        writer: &mut W,
299    ) -> io::Result<()> {
300        writeln!(writer, "# Infiniloom Repository Context (TOON format)")?;
301        writeln!(writer, "# Format: https://github.com/toon-format/toon")?;
302        writeln!(writer)?;
303
304        self.stream_metadata(writer, repo)?;
305        self.stream_languages(writer, repo)?;
306        self.stream_directory_structure(writer, repo)?;
307        self.stream_dependencies(writer, repo)?;
308        self.stream_repomap(writer, map)?;
309        if self.show_file_index {
310            self.stream_file_index(writer, repo)?;
311        }
312        self.stream_files(writer, repo)?;
313        Ok(())
314    }
315
316    fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
317        writeln!(writer, "# Infiniloom Repository Context (TOON format)")?;
318        writeln!(writer)?;
319
320        self.stream_metadata(writer, repo)?;
321        self.stream_languages(writer, repo)?;
322        self.stream_directory_structure(writer, repo)?;
323        self.stream_dependencies(writer, repo)?;
324        if self.show_file_index {
325            self.stream_file_index(writer, repo)?;
326        }
327        self.stream_files(writer, repo)?;
328        Ok(())
329    }
330}
331
/// Escape a string value for TOON output, quoting it only when required.
///
/// Per the TOON spec, a string MUST be quoted if it:
/// - is empty
/// - has leading/trailing whitespace
/// - matches a reserved literal (`true`, `false`, `null`)
/// - looks numeric
/// - starts with a hyphen (it would read as a list-item marker)
/// - contains a structural character: colon, comma, pipe, `[`, `]`, `{`, `}`
/// - contains a control character (newline, carriage return, tab)
/// - contains a quote or backslash character
///
/// Only five escape sequences are valid per spec:
/// - \\ (backslash)
/// - \" (quote)
/// - \n (newline)
/// - \r (carriage return)
/// - \t (tab)
///
/// Returns the input unchanged (as an owned `String`) when no quoting is
/// needed, otherwise the escaped text wrapped in double quotes.
fn escape_toon(s: &str) -> String {
    // Characters that force quoting wherever they appear in the string.
    let forces_quotes = |c: char| {
        matches!(
            c,
            ':' | ',' | '|' | '[' | ']' | '{' | '}' | '"' | '\\' | '\n' | '\r' | '\t'
        )
    };

    let needs_quotes = s.is_empty()
        || s.starts_with(' ')
        || s.ends_with(' ')
        || s.starts_with('-')
        || matches!(s, "true" | "false" | "null")
        // `f64` parsing accepts a superset of the spec's numeric pattern
        // (e.g. `inf`); quoting those extras is harmless.
        || s.parse::<f64>().is_ok()
        || s.contains(forces_quotes);

    if !needs_quotes {
        return s.to_owned();
    }

    // Single pass: emit the five permitted escapes, copy everything else.
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for ch in s.chars() {
        match ch {
            '\\' => quoted.push_str("\\\\"),
            '"' => quoted.push_str("\\\""),
            '\n' => quoted.push_str("\\n"),
            '\r' => quoted.push_str("\\r"),
            '\t' => quoted.push_str("\\t"),
            other => quoted.push(other),
        }
    }
    quoted.push('"');
    quoted
}
380
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::repomap::RepoMapGenerator;
    use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};

    /// Token counts used both for the single fixture file and for the
    /// repository totals.
    fn sample_token_counts() -> TokenCounts {
        TokenCounts {
            o200k: 48,
            cl100k: 49,
            claude: 50,
            gemini: 47,
            llama: 46,
            mistral: 46,
            deepseek: 46,
            qwen: 46,
            cohere: 47,
            grok: 46,
        }
    }

    /// Builds a one-file Python repository fixture.
    fn create_test_repo() -> Repository {
        let main_py = RepoFile {
            path: "/tmp/test/main.py".into(),
            relative_path: "main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 100,
            token_count: sample_token_counts(),
            symbols: Vec::new(),
            importance: 0.8,
            content: Some("def main():\n    print('hello')".to_string()),
        };

        let python_stats = LanguageStats {
            language: "Python".to_string(),
            files: 1,
            lines: 2,
            percentage: 100.0,
        };

        let metadata = RepoMetadata {
            total_files: 1,
            total_lines: 2,
            total_tokens: sample_token_counts(),
            languages: vec![python_stats],
            framework: None,
            description: None,
            branch: None,
            commit: None,
            directory_structure: Some("main.py\n".to_string()),
            external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
            git_history: None,
        };

        Repository {
            name: "test".to_string(),
            path: "/tmp/test".into(),
            files: vec![main_py],
            metadata,
        }
    }

    #[test]
    fn test_toon_output() {
        let repo = create_test_repo();
        let map = RepoMapGenerator::new(1000).generate(&repo);
        let output = ToonFormatter::new().format(&repo, &map);

        let expected_fragments = [
            "# Infiniloom Repository Context",
            "metadata:",
            "name: test",
            "files: 1",
            "languages[1]{name,files,percentage}:",
            "directory_structure: |",
            // Files are formatted as "- path|lang|tokens:"
            "main.py|python|50:",
        ];
        for fragment in expected_fragments {
            assert!(output.contains(fragment), "missing fragment: {fragment}");
        }
    }

    #[test]
    fn test_toon_escaping() {
        // (input, expected output) pairs, grouped by quoting rule.
        let cases: &[(&str, &str)] = &[
            // Plain strings - no quoting needed
            ("hello", "hello"),
            ("hello_world", "hello_world"),
            ("CamelCase", "CamelCase"),
            // Empty string - must be quoted
            ("", "\"\""),
            // Reserved literals - must be quoted
            ("true", "\"true\""),
            ("false", "\"false\""),
            ("null", "\"null\""),
            // Numeric patterns - must be quoted
            ("123", "\"123\""),
            ("3.14", "\"3.14\""),
            ("-42", "\"-42\""),
            ("0", "\"0\""),
            // Structural characters - must be quoted
            ("a,b", "\"a,b\""),             // comma (default delimiter)
            ("a|b", "\"a|b\""),             // pipe (alt delimiter)
            ("key:value", "\"key:value\""), // colon (key-value sep)
            // Control characters - must be quoted and escaped
            ("line\nbreak", "\"line\\nbreak\""),
            ("tab\there", "\"tab\\there\""),
            ("cr\rhere", "\"cr\\rhere\""),
            // Quote character - must be quoted and escaped
            ("say \"hello\"", "\"say \\\"hello\\\"\""),
            // Backslash - must be quoted and escaped
            ("path\\to\\file", "\"path\\\\to\\\\file\""),
            // Leading/trailing whitespace - must be quoted
            (" leading", "\" leading\""),
            ("trailing ", "\"trailing \""),
            (" both ", "\" both \""),
        ];

        for &(input, expected) in cases {
            assert_eq!(escape_toon(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn test_toon_tabular_format() {
        let repo = create_test_repo();
        let output = ToonFormatter::new().format_repo(&repo);

        // Should use tabular format for languages and file_index
        for header in [
            "languages[1]{name,files,percentage}:",
            "file_index[1]{path,tokens,importance}:",
        ] {
            assert!(output.contains(header), "missing header: {header}");
        }
    }
}
516}