infiniloom_engine/output/
toon.rs

1//! TOON (Token-Oriented Object Notation) output formatter
2//!
3//! TOON is a compact, human-readable format designed for LLM context.
4//! It provides ~40% fewer tokens than JSON while maintaining readability.
5//!
6//! Supports both in-memory (`format()`) and streaming (`format_to_writer()`) modes.
7//!
8//! Format specification: https://github.com/toon-format/toon
9
10use crate::output::{Formatter, StreamingFormatter};
11use crate::repomap::RepoMap;
12use crate::types::{Repository, TokenizerModel};
13use std::io::{self, Write};
14
/// TOON formatter - most token-efficient format for LLMs
///
/// Holds the rendering options consulted while writing a repository in TOON
/// form. Options are adjusted through the `with_*` builder methods.
pub struct ToonFormatter {
    /// Include line numbers in code (each content line is written as `N:text`)
    include_line_numbers: bool,
    /// Use tabular format for file metadata
    // NOTE(review): this flag is stored and settable via `with_tabular`, but no
    // method visible in this file reads it — confirm whether it is still needed.
    use_tabular: bool,
    /// Include file index/summary section (`file_index[...]`) before the file bodies
    show_file_index: bool,
    /// Token model for counts in output (selects which per-model count is printed)
    token_model: TokenizerModel,
}
26
27impl ToonFormatter {
28    /// Create a new TOON formatter with default settings
29    pub fn new() -> Self {
30        Self {
31            include_line_numbers: true,
32            use_tabular: true,
33            show_file_index: true,
34            token_model: TokenizerModel::Claude,
35        }
36    }
37
38    /// Set line numbers option
39    pub fn with_line_numbers(mut self, enabled: bool) -> Self {
40        self.include_line_numbers = enabled;
41        self
42    }
43
44    /// Set tabular format option
45    pub fn with_tabular(mut self, enabled: bool) -> Self {
46        self.use_tabular = enabled;
47        self
48    }
49
50    /// Set file index/summary option
51    pub fn with_file_index(mut self, enabled: bool) -> Self {
52        self.show_file_index = enabled;
53        self
54    }
55
56    /// Set token model for token counts in output
57    pub fn with_model(mut self, model: TokenizerModel) -> Self {
58        self.token_model = model;
59        self
60    }
61
62    /// Estimate output size for pre-allocation
63    fn estimate_output_size(repo: &Repository) -> usize {
64        // Base overhead for headers and metadata
65        let base = 500;
66        // Estimate ~300 bytes per file for content + metadata
67        let files = repo.files.len() * 300;
68        // Estimate content size
69        let content: usize = repo
70            .files
71            .iter()
72            .filter_map(|f| f.content.as_ref())
73            .map(|c| c.len())
74            .sum();
75        base + files + content
76    }
77
78    // =========================================================================
79    // Streaming methods (write to impl std::io::Write)
80    // =========================================================================
81
82    fn stream_metadata<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
83        writeln!(w, "metadata:")?;
84        writeln!(w, "  name: {}", repo.name)?;
85        writeln!(w, "  files: {}", repo.metadata.total_files)?;
86        writeln!(w, "  lines: {}", repo.metadata.total_lines)?;
87        writeln!(w, "  tokens: {}", repo.metadata.total_tokens.get(self.token_model))?;
88
89        if let Some(ref desc) = repo.metadata.description {
90            writeln!(w, "  description: {}", escape_toon(desc))?;
91        }
92        if let Some(ref branch) = repo.metadata.branch {
93            writeln!(w, "  branch: {}", branch)?;
94        }
95        if let Some(ref commit) = repo.metadata.commit {
96            writeln!(w, "  commit: {}", commit)?;
97        }
98        writeln!(w)
99    }
100
101    fn stream_languages<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
102        if repo.metadata.languages.is_empty() {
103            return Ok(());
104        }
105
106        let count = repo.metadata.languages.len();
107        writeln!(w, "languages[{}]{{name,files,percentage}}:", count)?;
108        for lang in &repo.metadata.languages {
109            writeln!(w, "  {},{},{:.1}", lang.language, lang.files, lang.percentage)?;
110        }
111        writeln!(w)
112    }
113
114    fn stream_directory_structure<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
115        if let Some(ref structure) = repo.metadata.directory_structure {
116            writeln!(w, "directory_structure: |")?;
117            for line in structure.lines() {
118                writeln!(w, "  {}", line)?;
119            }
120            writeln!(w)?;
121        }
122        Ok(())
123    }
124
125    fn stream_dependencies<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
126        if repo.metadata.external_dependencies.is_empty() {
127            return Ok(());
128        }
129
130        let count = repo.metadata.external_dependencies.len();
131        writeln!(w, "dependencies[{}]:", count)?;
132        for dep in &repo.metadata.external_dependencies {
133            writeln!(w, "  {}", escape_toon(dep))?;
134        }
135        writeln!(w)
136    }
137
138    fn stream_repomap<W: Write>(&self, w: &mut W, map: &RepoMap) -> io::Result<()> {
139        writeln!(w, "repository_map:")?;
140        writeln!(w, "  token_budget: {}", map.token_count)?;
141        writeln!(w, "  summary: |")?;
142        for line in map.summary.lines() {
143            writeln!(w, "    {}", line)?;
144        }
145
146        if !map.key_symbols.is_empty() {
147            let count = map.key_symbols.len();
148            writeln!(w, "  symbols[{}]{{name,type,file,line,rank,summary}}:", count)?;
149            for sym in &map.key_symbols {
150                writeln!(
151                    w,
152                    "    {},{},{},{},{},{}",
153                    escape_toon(&sym.name),
154                    escape_toon(&sym.kind),
155                    escape_toon(&sym.file),
156                    sym.line,
157                    sym.rank,
158                    escape_toon(sym.summary.as_deref().unwrap_or(""))
159                )?;
160            }
161        }
162
163        if !map.module_graph.nodes.is_empty() {
164            let count = map.module_graph.nodes.len();
165            writeln!(w, "  modules[{}]{{name,files,tokens}}:", count)?;
166            for module in &map.module_graph.nodes {
167                writeln!(
168                    w,
169                    "    {},{},{}",
170                    escape_toon(&module.name),
171                    module.files,
172                    module.tokens
173                )?;
174            }
175        }
176        writeln!(w)
177    }
178
179    fn stream_file_index<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
180        if repo.files.is_empty() {
181            return Ok(());
182        }
183
184        let count = repo.files.len();
185        writeln!(w, "file_index[{}]{{path,tokens,importance}}:", count)?;
186        for file in &repo.files {
187            let importance = if file.importance > 0.8 {
188                "critical"
189            } else if file.importance > 0.6 {
190                "high"
191            } else if file.importance > 0.3 {
192                "normal"
193            } else {
194                "low"
195            };
196            writeln!(
197                w,
198                "  {},{},{}",
199                escape_toon(&file.relative_path),
200                file.token_count.get(self.token_model),
201                importance
202            )?;
203        }
204        writeln!(w)
205    }
206
207    fn stream_files<W: Write>(&self, w: &mut W, repo: &Repository) -> io::Result<()> {
208        writeln!(w, "files:")?;
209
210        for file in &repo.files {
211            if let Some(ref content) = file.content {
212                let lang = file.language.as_deref().unwrap_or("?");
213                writeln!(
214                    w,
215                    "- {}|{}|{}:",
216                    escape_toon(&file.relative_path),
217                    lang,
218                    file.token_count.get(self.token_model)
219                )?;
220
221                if self.include_line_numbers {
222                    // Check if content has embedded line numbers (format: "N:content")
223                    // This preserves original line numbers when content has been compressed
224                    let first_line = content.lines().next().unwrap_or("");
225                    let has_embedded_line_nums = first_line.contains(':')
226                        && first_line
227                            .split(':')
228                            .next()
229                            .map(|s| s.parse::<u32>().is_ok())
230                            .unwrap_or(false);
231
232                    if has_embedded_line_nums {
233                        // Content has embedded line numbers - parse and output
234                        for line in content.lines() {
235                            if let Some((num_str, rest)) = line.split_once(':') {
236                                if let Ok(line_num) = num_str.parse::<u32>() {
237                                    writeln!(w, "  {}:{}", line_num, rest)?;
238                                } else {
239                                    // Fallback for malformed lines
240                                    writeln!(w, "  {}", line)?;
241                                }
242                            } else {
243                                writeln!(w, "  {}", line)?;
244                            }
245                        }
246                    } else {
247                        // No embedded line numbers - use sequential (uncompressed content)
248                        for (i, line) in content.lines().enumerate() {
249                            writeln!(w, "  {}:{}", i + 1, line)?;
250                        }
251                    }
252                } else {
253                    for line in content.lines() {
254                        writeln!(w, "  {}", line)?;
255                    }
256                }
257            }
258        }
259        Ok(())
260    }
261}
262
263impl Default for ToonFormatter {
264    fn default() -> Self {
265        Self::new()
266    }
267}
268
269impl Formatter for ToonFormatter {
270    fn format(&self, repo: &Repository, map: &RepoMap) -> String {
271        // Use streaming internally for consistency
272        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
273        // Vec<u8> write cannot fail, ignore result
274        drop(self.format_to_writer(repo, map, &mut output));
275        // Use lossy conversion to handle any edge cases with invalid UTF-8
276        String::from_utf8(output)
277            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
278    }
279
280    fn format_repo(&self, repo: &Repository) -> String {
281        let mut output = Vec::with_capacity(Self::estimate_output_size(repo));
282        // Vec<u8> write cannot fail, ignore result
283        drop(self.format_repo_to_writer(repo, &mut output));
284        // Use lossy conversion to handle any edge cases with invalid UTF-8
285        String::from_utf8(output)
286            .unwrap_or_else(|e| String::from_utf8_lossy(e.as_bytes()).into_owned())
287    }
288
289    fn name(&self) -> &'static str {
290        "toon"
291    }
292}
293
294impl StreamingFormatter for ToonFormatter {
295    fn format_to_writer<W: Write>(
296        &self,
297        repo: &Repository,
298        map: &RepoMap,
299        writer: &mut W,
300    ) -> io::Result<()> {
301        writeln!(writer, "# Infiniloom Repository Context (TOON format)")?;
302        writeln!(writer, "# Format: https://github.com/toon-format/toon")?;
303        writeln!(writer)?;
304
305        self.stream_metadata(writer, repo)?;
306        self.stream_languages(writer, repo)?;
307        self.stream_directory_structure(writer, repo)?;
308        self.stream_dependencies(writer, repo)?;
309        self.stream_repomap(writer, map)?;
310        if self.show_file_index {
311            self.stream_file_index(writer, repo)?;
312        }
313        self.stream_files(writer, repo)?;
314        Ok(())
315    }
316
317    fn format_repo_to_writer<W: Write>(&self, repo: &Repository, writer: &mut W) -> io::Result<()> {
318        writeln!(writer, "# Infiniloom Repository Context (TOON format)")?;
319        writeln!(writer)?;
320
321        self.stream_metadata(writer, repo)?;
322        self.stream_languages(writer, repo)?;
323        self.stream_directory_structure(writer, repo)?;
324        self.stream_dependencies(writer, repo)?;
325        if self.show_file_index {
326            self.stream_file_index(writer, repo)?;
327        }
328        self.stream_files(writer, repo)?;
329        Ok(())
330    }
331}
332
/// Escape special characters for TOON format (v3.0 spec compliant)
///
/// Returns `s` unchanged when it is safe as a bare TOON string, otherwise
/// returns it wrapped in double quotes with the required escapes applied.
///
/// Per the TOON spec, a string MUST be quoted if it:
/// - is empty
/// - has leading or trailing whitespace
/// - matches a reserved literal (`true`, `false`, `null`)
/// - looks numeric (anything `f64` parsing accepts; over-quoting is harmless)
/// - starts with a hyphen (would read as a list-item marker)
/// - contains a structural character: colon, comma, pipe, `[`, `]`, `{`, `}`
/// - contains a control character: newline, carriage return, tab
/// - contains a double quote or backslash
///
/// Only five escape sequences are valid per spec:
/// - \\ (backslash)
/// - \" (quote)
/// - \n (newline)
/// - \r (carriage return)
/// - \t (tab)
fn escape_toon(s: &str) -> String {
    let is_reserved_literal = matches!(s, "true" | "false" | "null");
    let looks_numeric = s.parse::<f64>().is_ok();
    // Any leading/trailing whitespace, not only the ASCII space.
    let has_edge_whitespace = s.trim() != s;
    // A leading hyphen would be read as a list-item marker.
    let starts_with_hyphen = s.starts_with('-');
    let has_special_char = s.chars().any(|c| {
        matches!(
            c,
            ':' | ',' | '|' | '[' | ']' | '{' | '}' | '"' | '\\' | '\n' | '\r' | '\t'
        )
    });

    let needs_quotes = s.is_empty()
        || is_reserved_literal
        || looks_numeric
        || has_edge_whitespace
        || starts_with_hyphen
        || has_special_char;

    if !needs_quotes {
        return s.to_owned();
    }

    // Single pass: build the quoted form, applying only the five spec escapes.
    let mut quoted = String::with_capacity(s.len() + 2);
    quoted.push('"');
    for c in s.chars() {
        match c {
            '\\' => quoted.push_str("\\\\"),
            '"' => quoted.push_str("\\\""),
            '\n' => quoted.push_str("\\n"),
            '\r' => quoted.push_str("\\r"),
            '\t' => quoted.push_str("\\t"),
            other => quoted.push(other),
        }
    }
    quoted.push('"');
    quoted
}
381
382#[cfg(test)]
383#[allow(clippy::str_to_string)]
384mod tests {
385    use super::*;
386    use crate::repomap::RepoMapGenerator;
387    use crate::types::{LanguageStats, RepoFile, RepoMetadata, TokenCounts};
388
389    fn create_test_repo() -> Repository {
390        Repository {
391            name: "test".to_string(),
392            path: "/tmp/test".into(),
393            files: vec![RepoFile {
394                path: "/tmp/test/main.py".into(),
395                relative_path: "main.py".to_string(),
396                language: Some("python".to_string()),
397                size_bytes: 100,
398                token_count: TokenCounts {
399                    o200k: 48,
400                    cl100k: 49,
401                    claude: 50,
402                    gemini: 47,
403                    llama: 46,
404                    mistral: 46,
405                    deepseek: 46,
406                    qwen: 46,
407                    cohere: 47,
408                    grok: 46,
409                },
410                symbols: Vec::new(),
411                importance: 0.8,
412                content: Some("def main():\n    print('hello')".to_string()),
413            }],
414            metadata: RepoMetadata {
415                total_files: 1,
416                total_lines: 2,
417                total_tokens: TokenCounts {
418                    o200k: 48,
419                    cl100k: 49,
420                    claude: 50,
421                    gemini: 47,
422                    llama: 46,
423                    mistral: 46,
424                    deepseek: 46,
425                    qwen: 46,
426                    cohere: 47,
427                    grok: 46,
428                },
429                languages: vec![LanguageStats {
430                    language: "Python".to_string(),
431                    files: 1,
432                    lines: 2,
433                    percentage: 100.0,
434                }],
435                framework: None,
436                description: None,
437                branch: None,
438                commit: None,
439                directory_structure: Some("main.py\n".to_string()),
440                external_dependencies: vec!["requests".to_string(), "numpy".to_string()],
441                git_history: None,
442            },
443        }
444    }
445
446    #[test]
447    fn test_toon_output() {
448        let repo = create_test_repo();
449        let map = RepoMapGenerator::new(1000).generate(&repo);
450
451        let formatter = ToonFormatter::new();
452        let output = formatter.format(&repo, &map);
453
454        assert!(output.contains("# Infiniloom Repository Context"));
455        assert!(output.contains("metadata:"));
456        assert!(output.contains("name: test"));
457        assert!(output.contains("files: 1"));
458        assert!(output.contains("languages[1]{name,files,percentage}:"));
459        assert!(output.contains("directory_structure: |"));
460        // Files are formatted as "- path|lang|tokens:"
461        assert!(output.contains("main.py|python|50:"));
462    }
463
464    #[test]
465    fn test_toon_escaping() {
466        // Plain strings - no quoting needed
467        assert_eq!(escape_toon("hello"), "hello");
468        assert_eq!(escape_toon("hello_world"), "hello_world");
469        assert_eq!(escape_toon("CamelCase"), "CamelCase");
470
471        // Empty string - must be quoted
472        assert_eq!(escape_toon(""), "\"\"");
473
474        // Reserved literals - must be quoted
475        assert_eq!(escape_toon("true"), "\"true\"");
476        assert_eq!(escape_toon("false"), "\"false\"");
477        assert_eq!(escape_toon("null"), "\"null\"");
478
479        // Numeric patterns - must be quoted
480        assert_eq!(escape_toon("123"), "\"123\"");
481        assert_eq!(escape_toon("3.14"), "\"3.14\"");
482        assert_eq!(escape_toon("-42"), "\"-42\"");
483        assert_eq!(escape_toon("0"), "\"0\"");
484
485        // Structural characters - must be quoted (TOON v3.0)
486        assert_eq!(escape_toon("a,b"), "\"a,b\""); // comma (default delimiter)
487        assert_eq!(escape_toon("a|b"), "\"a|b\""); // pipe (alt delimiter)
488        assert_eq!(escape_toon("key:value"), "\"key:value\""); // colon (key-value sep)
489
490        // Control characters - must be quoted and escaped
491        assert_eq!(escape_toon("line\nbreak"), "\"line\\nbreak\"");
492        assert_eq!(escape_toon("tab\there"), "\"tab\\there\"");
493        assert_eq!(escape_toon("cr\rhere"), "\"cr\\rhere\"");
494
495        // Quote character - must be quoted and escaped
496        assert_eq!(escape_toon("say \"hello\""), "\"say \\\"hello\\\"\"");
497
498        // Backslash - must be quoted and escaped
499        assert_eq!(escape_toon("path\\to\\file"), "\"path\\\\to\\\\file\"");
500
501        // Leading/trailing whitespace - must be quoted
502        assert_eq!(escape_toon(" leading"), "\" leading\"");
503        assert_eq!(escape_toon("trailing "), "\"trailing \"");
504        assert_eq!(escape_toon(" both "), "\" both \"");
505    }
506
507    #[test]
508    fn test_toon_tabular_format() {
509        let repo = create_test_repo();
510        let formatter = ToonFormatter::new();
511        let output = formatter.format_repo(&repo);
512
513        // Should use tabular format for languages and file_index
514        assert!(output.contains("languages[1]{name,files,percentage}:"));
515        assert!(output.contains("file_index[1]{path,tokens,importance}:"));
516    }
517}