Skip to main content

codebones_core/
plugin.rs

1use crate::cache::{CacheStore, SqliteCache};
2use crate::parser::Bone;
3use crate::parser::Parser;
4use anyhow::Result;
5use std::path::{Path, PathBuf};
6use std::sync::OnceLock;
7
// Regex::new is called inside OnceLock::get_or_init, which guarantees compilation at most once.
// Clippy cannot see through the OnceLock abstraction and fires regex_creation_in_loops.

/// Lazily initialised by `Packer::pack` with the pattern `\n\s*\n`; used when
/// `remove_empty_lines` is set.
#[allow(clippy::regex_creation_in_loops)]
static RE_EMPTY_LINES: OnceLock<regex::Regex> = OnceLock::new();
/// Lazily initialised by `Packer::pack` with the pattern `[A-Za-z0-9+/=]{100,}`; used when
/// `truncate_base64` is set.
#[allow(clippy::regex_creation_in_loops)]
static RE_BASE64: OnceLock<regex::Regex> = OnceLock::new();
/// Lazily initialised by `Packer::pack` with the pattern `(?m)(//|#).*\n`; used when
/// `remove_comments` is set.
#[allow(clippy::regex_creation_in_loops)]
static RE_LINE_COMMENT: OnceLock<regex::Regex> = OnceLock::new();
/// Lazily initialised by `Packer::pack` with the pattern `(?s)/\*.*?\*/|<!--.*?-->`; used when
/// `remove_comments` is set.
#[allow(clippy::regex_creation_in_loops)]
static RE_BLOCK_COMMENT: OnceLock<regex::Regex> = OnceLock::new();
18
/// A plugin that can enrich extracted code bones with domain-specific metadata.
///
/// Implementations must be `Send + Sync`. `Packer` stores plugins as
/// `Box<dyn ContextPlugin>` and consults them once per packed file.
pub trait ContextPlugin: Send + Sync {
    /// The unique name of the plugin (e.g., "dbt", "openapi").
    fn name(&self) -> &str;

    /// Returns true if this plugin should be active for the given directory/workspace.
    ///
    /// NOTE(review): `Packer::pack` calls this with each *file* path rather than a
    /// directory — confirm which one implementations should expect.
    fn detect(&self, directory: &Path) -> bool;

    /// Enriches the extracted bones for a specific file with additional metadata.
    /// The plugin can modify the `base_bones` in place (e.g., adding JSON metadata).
    ///
    /// # Errors
    ///
    /// Any error returned here is propagated by `Packer::pack` and aborts the whole pack.
    fn enrich(&self, file_path: &Path, base_bones: &mut Vec<Bone>) -> Result<()>;
}
31
/// Supported output formats for the packed context.
///
/// The enum is a plain tag, so it derives the usual value-type traits; this lets
/// callers copy, compare and debug-print a format without extra boilerplate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// Emit an XML document wrapped in a `<repository>` root element.
    Xml,
    /// Emit Markdown with `##` headings and fenced code blocks.
    Markdown,
}
37
/// Bundles files and their enriched bones into an AI-friendly output format.
pub struct Packer {
    // Queried through `list_files_with_symbols` to build the skeleton map.
    cache: SqliteCache,
    // Held by the packer; `pack` only takes a reference to it and never calls it.
    parser: Parser,
    // Plugins registered via `register_plugin`; each is offered every packed file.
    plugins: Vec<Box<dyn ContextPlugin>>,
    // Selects XML or Markdown rendering.
    format: OutputFormat,
    // Optional token budget. Once the output so far plus the next file's content
    // would exceed it, file contents are omitted for that file and all later ones.
    max_tokens: Option<usize>,
    // When true, the skeleton map section is not emitted.
    no_file_summary: bool,
    // When true, `pack` returns right after the skeleton map (no per-file sections).
    no_files: bool,
    // When true, block and line comments are stripped with regexes.
    remove_comments: bool,
    // When true, whitespace-only line gaps are collapsed to a single newline.
    remove_empty_lines: bool,
    // When true, runs of 100+ base64-alphabet characters become `[TRUNCATED_BASE64]`.
    truncate_base64: bool,
}
51
52impl Packer {
53    fn xml_escape(s: &str) -> String {
54        s.replace('&', "&amp;")
55            .replace('<', "&lt;")
56            .replace('>', "&gt;")
57            .replace('"', "&quot;")
58            .replace('\'', "&apos;")
59    }
60
61    fn xml_escape_cdata(s: &str) -> String {
62        // Split ]]> into ]]]]><![CDATA[> to keep it inside CDATA
63        s.replace("]]>", "]]]]><![CDATA[>")
64    }
65
66    /// Creates a new Packer instance.
67    #[allow(clippy::too_many_arguments)]
68    pub fn new(
69        cache: SqliteCache,
70        parser: Parser,
71        format: OutputFormat,
72        max_tokens: Option<usize>,
73        no_file_summary: bool,
74        no_files: bool,
75        remove_comments: bool,
76        remove_empty_lines: bool,
77        truncate_base64: bool,
78    ) -> Self {
79        Self {
80            cache,
81            parser,
82            plugins: Vec::new(),
83            format,
84            max_tokens,
85            no_file_summary,
86            no_files,
87            remove_comments,
88            remove_empty_lines,
89            truncate_base64,
90        }
91    }
92
93    /// Registers a context plugin.
94    pub fn register_plugin(&mut self, plugin: Box<dyn ContextPlugin>) {
95        self.plugins.push(plugin);
96    }
97
98    /// Packs the specified files into a single formatted string.
99    // OnceLock::get_or_init guarantees each regex is compiled at most once.
100    // Clippy fires regex_creation_in_loops because it cannot see through the OnceLock
101    // abstraction — the allow is intentional and correct.
102    #[allow(clippy::regex_creation_in_loops)]
103    pub fn pack(&self, file_paths: &[PathBuf]) -> Result<String> {
104        let _ = &self.parser;
105
106        let mut output = String::new();
107
108        // Retrieve all files and their symbols from DB to build the skeleton map
109        let db_files_symbols: Vec<(String, Vec<(String, String)>)> =
110            self.cache.list_files_with_symbols().unwrap_or_default();
111
112        match self.format {
113            OutputFormat::Xml => output.push_str("<repository>\n"),
114            OutputFormat::Markdown => {}
115        }
116
117        // Match the correct DB file path using ends_with since path_str may contain dir prefix
118        let lookup_symbols = |path: &PathBuf| -> Vec<(String, String)> {
119            let path_str = path.to_string_lossy().to_string();
120            let path_normalized = path_str.strip_prefix("./").unwrap_or(&path_str);
121            db_files_symbols
122                .iter()
123                .find(|(db_p, _)| {
124                    path_normalized.ends_with(db_p.as_str()) || db_p.ends_with(path_normalized)
125                })
126                .map(|(_, syms)| syms.clone())
127                .unwrap_or_default()
128        };
129
130        // Generate Skeleton Map
131        if !self.no_file_summary {
132            match self.format {
133                OutputFormat::Xml => {
134                    output.push_str("  <skeleton_map>\n");
135                    for path in file_paths {
136                        output.push_str(&format!(
137                            "    <file path=\"{}\">\n",
138                            Self::xml_escape(&path.display().to_string())
139                        ));
140                        for (kind, name) in lookup_symbols(path) {
141                            output.push_str(&format!(
142                                "      <signature>{} {}</signature>\n",
143                                Self::xml_escape(&kind),
144                                Self::xml_escape(&name)
145                            ));
146                        }
147                        output.push_str("    </file>\n");
148                    }
149                    output.push_str("  </skeleton_map>\n");
150                }
151                OutputFormat::Markdown => {
152                    output.push_str("## Skeleton Map\n\n");
153                    for path in file_paths {
154                        output.push_str(&format!("- {}\n", path.display()));
155                        for (kind, name) in lookup_symbols(path) {
156                            output.push_str(&format!("  - {} {}\n", kind, name));
157                        }
158                    }
159                    output.push('\n');
160                }
161            }
162        }
163
164        if self.no_files {
165            if let OutputFormat::Xml = self.format {
166                output.push_str("</repository>\n");
167            }
168            return Ok(output);
169        }
170
171        let bpe = tiktoken_rs::cl100k_base()
172            .map_err(|e| anyhow::anyhow!("Failed to initialize tokenizer: {}", e))?;
173        let mut degrade_to_bones = false;
174
175        for path in file_paths {
176            let mut raw_content = match std::fs::read_to_string(path) {
177                Ok(s) => s,
178                Err(e) => {
179                    eprintln!(
180                        "Warning: skipping unreadable file {}: {}",
181                        path.display(),
182                        e
183                    );
184                    continue;
185                }
186            };
187
188            if self.remove_empty_lines {
189                raw_content = RE_EMPTY_LINES
190                    .get_or_init(|| {
191                        regex::Regex::new(r"\n\s*\n").expect("valid static regex: empty lines")
192                    })
193                    .replace_all(&raw_content, "\n")
194                    .to_string();
195            }
196
197            if self.truncate_base64 {
198                // Truncate long hex or base64 looking strings (length > 100)
199                raw_content = RE_BASE64
200                    .get_or_init(|| {
201                        regex::Regex::new(r"[A-Za-z0-9+/=]{100,}")
202                            .expect("valid static regex: base64")
203                    })
204                    .replace_all(&raw_content, "[TRUNCATED_BASE64]")
205                    .to_string();
206            }
207
208            // Generate the skeleton by eliding function/class bodies
209            let content = {
210                let ext = path.extension().unwrap_or_default().to_string_lossy();
211                if let Some(spec) = crate::parser::get_spec_for_extension(&ext) {
212                    let doc = crate::parser::parse_file(&raw_content, &spec);
213                    let mut result = String::new();
214                    let mut last_end = 0;
215
216                    let mut indices: Vec<usize> = (0..doc.symbols.len()).collect();
217                    indices.sort_by_key(|&i| doc.symbols[i].full_range.start);
218
219                    for i in &indices {
220                        let sym = &doc.symbols[*i];
221                        if let Some(body_range) = &sym.body_range {
222                            if body_range.start >= last_end {
223                                result.push_str(&raw_content[last_end..body_range.start]);
224                                result.push_str("...");
225                                last_end = body_range.end;
226                            }
227                        }
228                    }
229                    result.push_str(&raw_content[last_end..]);
230
231                    if self.remove_comments {
232                        // Simple regex fallback for comments (C-style, Python, HTML)
233                        result = RE_BLOCK_COMMENT
234                            .get_or_init(|| {
235                                regex::Regex::new(r"(?s)/\*.*?\*/|<!--.*?-->")
236                                    .expect("valid static regex: block comment")
237                            })
238                            .replace_all(&result, "")
239                            .to_string();
240                        result = RE_LINE_COMMENT
241                            .get_or_init(|| {
242                                regex::Regex::new(r"(?m)(//|#).*\n")
243                                    .expect("valid static regex: line comment")
244                            })
245                            .replace_all(&result, "\n")
246                            .to_string();
247                    }
248
249                    result
250                } else {
251                    if self.remove_comments {
252                        let no_blocks = RE_BLOCK_COMMENT
253                            .get_or_init(|| {
254                                regex::Regex::new(r"(?s)/\*.*?\*/|<!--.*?-->")
255                                    .expect("valid static regex: block comment")
256                            })
257                            .replace_all(&raw_content, "")
258                            .to_string();
259                        RE_LINE_COMMENT
260                            .get_or_init(|| {
261                                regex::Regex::new(r"(?m)(//|#).*\n")
262                                    .expect("valid static regex: line comment")
263                            })
264                            .replace_all(&no_blocks, "\n")
265                            .to_string()
266                    } else {
267                        raw_content.clone() // Fallback to raw content if language isn't supported
268                    }
269                }
270            };
271
272            let mut bones = vec![Bone::default()];
273
274            for plugin in &self.plugins {
275                if plugin.detect(path) {
276                    plugin.enrich(path, &mut bones)?;
277                }
278            }
279
280            if !degrade_to_bones {
281                if let Some(max) = self.max_tokens {
282                    let current_tokens = bpe.encode_with_special_tokens(&output).len();
283                    let content_tokens = bpe.encode_with_special_tokens(&content).len();
284                    if current_tokens + content_tokens > max {
285                        degrade_to_bones = true;
286                    }
287                }
288            }
289
290            match self.format {
291                OutputFormat::Xml => {
292                    output.push_str(&format!(
293                        "  <file path=\"{}\">\n",
294                        Self::xml_escape(&path.display().to_string())
295                    ));
296                    if !degrade_to_bones {
297                        let safe_content = Self::xml_escape_cdata(&content);
298                        if safe_content == content {
299                            output.push_str(&format!(
300                                "    <content><![CDATA[\n{}\n]]></content>\n",
301                                safe_content
302                            ));
303                        } else {
304                            // Content contains ]]> which cannot be safely embedded in CDATA;
305                            // fall back to XML entity escaping so the document stays well-formed.
306                            output.push_str(&format!(
307                                "    <content>{}</content>\n",
308                                Self::xml_escape(&content)
309                            ));
310                        }
311                    }
312                    // Only print bones block if plugins added metadata
313                    let has_metadata = bones.iter().any(|b| !b.metadata.is_empty());
314                    if has_metadata {
315                        output.push_str("    <bones>\n");
316                        for bone in &bones {
317                            for (k, v) in &bone.metadata {
318                                output.push_str(&format!(
319                                    "      <metadata key=\"{}\">{}</metadata>\n",
320                                    Self::xml_escape(k),
321                                    Self::xml_escape(v)
322                                ));
323                            }
324                        }
325                        output.push_str("    </bones>\n");
326                    }
327                    output.push_str("  </file>\n");
328                }
329                OutputFormat::Markdown => {
330                    output.push_str(&format!("## {}\n\n", path.display()));
331                    if !degrade_to_bones {
332                        // Find longest run of backticks in content and use one more as the fence
333                        // delimiter (CommonMark spec approach) to prevent fence injection.
334                        let max_backticks = {
335                            let mut max = 0usize;
336                            let mut cur = 0usize;
337                            for c in content.chars() {
338                                if c == '`' {
339                                    cur += 1;
340                                    max = max.max(cur);
341                                } else {
342                                    cur = 0;
343                                }
344                            }
345                            max
346                        };
347                        let fence_len = max_backticks.max(2) + 1;
348                        let fence = "`".repeat(fence_len);
349                        // Break up any backtick run of length >= (fence_len - 1) within the
350                        // content to prevent a closing-fence sequence from appearing verbatim.
351                        // A zero-width space (U+200B) is inserted after the (fence_len-1)-th
352                        // consecutive backtick so the run is interrupted while the characters
353                        // remain visible to the reader.
354                        let safe_content = if max_backticks >= fence_len - 1 {
355                            let threshold = fence_len - 1;
356                            let mut result = String::with_capacity(content.len());
357                            let mut run = 0usize;
358                            for c in content.chars() {
359                                result.push(c);
360                                if c == '`' {
361                                    run += 1;
362                                    if run == threshold {
363                                        result.push('\u{200B}'); // zero-width space
364                                        run = 0;
365                                    }
366                                } else {
367                                    run = 0;
368                                }
369                            }
370                            result
371                        } else {
372                            content.clone()
373                        };
374                        output.push_str(&format!("{}\n{}\n{}\n\n", fence, safe_content, fence));
375                    }
376                    // Only print Bones section if plugins added metadata
377                    let has_metadata = bones.iter().any(|b| !b.metadata.is_empty());
378                    if has_metadata {
379                        output.push_str("Bones:\n");
380                        for bone in &bones {
381                            for (k, v) in &bone.metadata {
382                                output.push_str(&format!("- {}: {}\n", k, v));
383                            }
384                        }
385                        output.push('\n');
386                    }
387                }
388            }
389        }
390
391        if let OutputFormat::Xml = self.format {
392            output.push_str("</repository>\n");
393        }
394
395        Ok(output)
396    }
397}
398
399#[cfg(test)]
400mod tests {
401    use super::*;
402    use std::io::Write;
403
404    struct MockPlugin;
405
406    impl ContextPlugin for MockPlugin {
407        fn name(&self) -> &str {
408            "mock"
409        }
410
411        fn detect(&self, _directory: &Path) -> bool {
412            true
413        }
414
415        fn enrich(&self, _file_path: &Path, base_bones: &mut Vec<Bone>) -> Result<()> {
416            for bone in base_bones.iter_mut() {
417                bone.metadata
418                    .insert("injected".to_string(), "true".to_string());
419            }
420            Ok(())
421        }
422    }
423
424    fn make_temp_rs_file(content: &str) -> (tempfile::TempDir, PathBuf) {
425        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
426        let file_path = dir.path().join("sample.rs");
427        let mut f = std::fs::File::create(&file_path).expect("failed to create temp file");
428        f.write_all(content.as_bytes())
429            .expect("failed to write file content");
430        (dir, file_path)
431    }
432
433    #[test]
434    fn test_plugin_detect_and_enrich() {
435        let plugin = MockPlugin;
436        assert!(plugin.detect(Path::new(".")));
437        let mut bones = vec![Bone::default()];
438        plugin
439            .enrich(Path::new("any_file.rs"), &mut bones)
440            .expect("enrich should succeed");
441        assert_eq!(
442            bones[0]
443                .metadata
444                .get("injected")
445                .expect("injected key must be present"),
446            "true"
447        );
448    }
449
450    #[test]
451    fn test_packer_xml_format() {
452        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
453        let packer = Packer::new(
454            SqliteCache::new_in_memory().expect("failed to create test cache"),
455            Parser {},
456            OutputFormat::Xml,
457            None,
458            false,
459            false,
460            false,
461            false,
462            false,
463        );
464        let result = packer.pack(&[file_path]);
465        assert!(result.is_ok());
466        let output = result.expect("pack should succeed");
467        assert!(output.contains("<repository>"));
468    }
469
470    #[test]
471    fn test_packer_markdown_format() {
472        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
473        let packer = Packer::new(
474            SqliteCache::new_in_memory().expect("failed to create test cache"),
475            Parser {},
476            OutputFormat::Markdown,
477            None,
478            false,
479            false,
480            false,
481            false,
482            false,
483        );
484        let result = packer.pack(std::slice::from_ref(&file_path));
485        assert!(result.is_ok());
486        let output = result.expect("pack should succeed");
487        assert!(output.contains(&format!("## {}", file_path.display())));
488    }
489
490    #[test]
491    fn test_packer_with_plugins() {
492        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
493        let mut packer = Packer::new(
494            SqliteCache::new_in_memory().expect("failed to create test cache"),
495            Parser {},
496            OutputFormat::Xml,
497            None,
498            false,
499            false,
500            false,
501            false,
502            false,
503        );
504        packer.register_plugin(Box::new(MockPlugin));
505        let result = packer.pack(&[file_path]);
506        assert!(result.is_ok());
507        let output = result.expect("pack should succeed");
508        assert!(output.contains("injected"));
509    }
510
511    #[test]
512    fn test_packer_empty_file_list() {
513        let packer = Packer::new(
514            SqliteCache::new_in_memory().expect("failed to create test cache"),
515            Parser {},
516            OutputFormat::Xml,
517            None,
518            false,
519            false,
520            false,
521            false,
522            false,
523        );
524        let result = packer.pack(&[]);
525        assert!(result.is_ok());
526    }
527
528    #[test]
529    fn test_packer_missing_file() {
530        let packer = Packer::new(
531            SqliteCache::new_in_memory().expect("failed to create test cache"),
532            Parser {},
533            OutputFormat::Xml,
534            None,
535            false,
536            false,
537            false,
538            false,
539            false,
540        );
541        let result = packer.pack(&[PathBuf::from("this_file_does_not_exist_xyz.rs")]);
542        // Missing files are skipped gracefully
543        assert!(result.is_ok());
544    }
545
546    #[test]
547    fn test_packer_generates_skeleton_map_at_top() {
548        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
549        let packer = Packer::new(
550            SqliteCache::new_in_memory().expect("failed to create test cache"),
551            Parser {},
552            OutputFormat::Xml,
553            None,
554            false,
555            false,
556            false,
557            false,
558            false,
559        );
560        let result = packer.pack(&[file_path]);
561        assert!(result.is_ok());
562        let output = result.expect("pack should succeed");
563        // The skeleton map should be at the top of the output
564        assert!(output.starts_with("<repository>\n  <skeleton_map>"));
565    }
566
567    #[test]
568    fn test_packer_token_governor_degrades_to_bones() {
569        // Set a very low max_tokens to force degradation to bones-only output
570        let (_dir, file_path) = make_temp_rs_file("fn main() { let x = 1; }\n");
571        let packer = Packer::new(
572            SqliteCache::new_in_memory().expect("failed to create test cache"),
573            Parser {},
574            OutputFormat::Xml,
575            Some(10),
576            false,
577            false,
578            false,
579            false,
580            false,
581        );
582        let result = packer.pack(&[file_path]);
583        assert!(result.is_ok());
584        let output = result.expect("pack should succeed");
585        // When degraded to bones, full file content should not appear in output
586        assert!(!output.contains("<content>"));
587    }
588
589    // -------------------------------------------------------------------------
590    // Helper: create a temp file with a given extension
591    // -------------------------------------------------------------------------
592    fn make_temp_file(dir: &tempfile::TempDir, filename: &str, content: &str) -> PathBuf {
593        let file_path = dir.path().join(filename);
594        let mut f = std::fs::File::create(&file_path).expect("failed to create temp file");
595        f.write_all(content.as_bytes())
596            .expect("failed to write file content");
597        file_path
598    }
599
600    // =========================================================================
601    // XML output correctness
602    // =========================================================================
603
604    /// Symbol names with XML special characters should be escaped in XML output.
605    /// This test describes CORRECT behavior. The current implementation does NOT
606    /// escape these characters in <signature> tags — so this test is expected to
607    /// FAIL until the implementation is fixed.
608    #[test]
609    fn test_xml_signature_special_chars_are_escaped() {
610        use crate::cache::CacheStore;
611
612        let cache = SqliteCache::new_in_memory().expect("failed to create test cache");
613        cache.init().expect("failed to init cache schema");
614
615        // Insert a file + symbol with XML-dangerous characters in the name.
616        let file_id = cache
617            .upsert_file("bad.rs", "h1", b"fn bad() {}")
618            .expect("upsert_file should succeed");
619        cache
620            .insert_symbol(&crate::cache::Symbol {
621                id: "s1".to_string(),
622                file_id,
623                name: "<script>&\"test\"</script>".to_string(),
624                kind: "function".to_string(),
625                byte_offset: 0,
626                byte_length: 11,
627            })
628            .expect("symbol insert should succeed");
629
630        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
631        let file_path = make_temp_file(&dir, "bad.rs", "fn bad() {}\n");
632
633        let packer = Packer::new(
634            cache,
635            Parser {},
636            OutputFormat::Xml,
637            None,
638            false, // no_file_summary
639            false, // no_files
640            false,
641            false,
642            false,
643        );
644        let output = packer.pack(&[file_path]).expect("pack should succeed");
645
646        // The raw unescaped characters must NOT appear outside of CDATA in XML attributes/tags.
647        // Correct output would use &lt; &gt; &amp; &quot; instead.
648        assert!(
649            !output.contains("<script>"),
650            "Bare <script> tag should not appear in XML output; expected escaped form"
651        );
652        assert!(
653            output.contains("&lt;script&gt;") || output.contains("&amp;"),
654            "XML special characters in symbol names must be escaped"
655        );
656    }
657
658    /// File paths with XML special characters should be escaped in path attributes.
659    /// This test describes CORRECT behavior and is expected to FAIL until fixed.
660    #[test]
661    fn test_xml_path_attribute_special_chars_are_escaped() {
662        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
663        // Use a filename that contains an ampersand (legal on most filesystems).
664        let file_path = make_temp_file(&dir, "a&b.txt", "hello world\n");
665
666        let packer = Packer::new(
667            SqliteCache::new_in_memory().expect("failed to create test cache"),
668            Parser {},
669            OutputFormat::Xml,
670            None,
671            false,
672            false,
673            false,
674            false,
675            false,
676        );
677        let output = packer.pack(&[file_path]).expect("pack should succeed");
678
679        // The bare & must be escaped as &amp; in XML attributes.
680        assert!(
681            !output.contains("path=\"") || !output.contains("a&b.txt\""),
682            "Bare & in path attribute must be escaped as &amp;"
683        );
684    }
685
686    /// File content containing `]]>` inside a CDATA section must be escaped so
687    /// the XML document stays well-formed.
688    #[test]
689    fn test_xml_cdata_cdata_end_sequence_is_escaped() {
690        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
691        // Content that would prematurely close a CDATA section.
692        let tricky = "let s = \"]]>\";\n";
693        let file_path = make_temp_file(&dir, "tricky.txt", tricky);
694
695        let packer = Packer::new(
696            SqliteCache::new_in_memory().expect("failed to create test cache"),
697            Parser {},
698            OutputFormat::Xml,
699            None,
700            false,
701            false,
702            false,
703            false,
704            false,
705        );
706        let output = packer.pack(&[file_path]).expect("pack should succeed");
707
708        // The raw ]]> sequence must not appear verbatim inside a CDATA section.
709        // The implementation splits it as ]]]]><![CDATA[>.
710        // After the transformation there should be no bare ]]> that closes CDATA prematurely.
711        // A simple check: every ]]> in the output must be followed immediately by </content>
712        // (i.e., it is the legitimate CDATA close).
713        let positions: Vec<_> = output.match_indices("]]>").collect();
714        for (idx, _) in &positions {
715            let after = &output[idx + 3..];
716            assert!(
717                after.starts_with("</content>"),
718                "Found ]]> at position {} that is not the CDATA closing sequence; \
719                 raw content may break XML well-formedness",
720                idx
721            );
722        }
723    }
724
725    /// A basic well-formedness check: the XML output should have balanced
726    /// `<repository>` / `</repository>` tags and no bare `<` or `>` outside CDATA.
727    #[test]
728    fn test_xml_output_basic_well_formedness() {
729        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
730
731        let packer = Packer::new(
732            SqliteCache::new_in_memory().expect("failed to create test cache"),
733            Parser {},
734            OutputFormat::Xml,
735            None,
736            false,
737            false,
738            false,
739            false,
740            false,
741        );
742        let output = packer.pack(&[file_path]).expect("pack should succeed");
743
744        assert!(
745            output.starts_with("<repository>"),
746            "XML output must start with <repository>"
747        );
748        assert!(
749            output.trim_end().ends_with("</repository>"),
750            "XML output must end with </repository>"
751        );
752
753        // Strip all CDATA sections before checking for bare angle brackets.
754        let cdata_re =
755            regex::Regex::new(r"(?s)<!\[CDATA\[.*?]]>").expect("failed to compile cdata regex");
756        let stripped = cdata_re.replace_all(&output, "");
757
758        // Any remaining < must be the start of a tag (followed by [/a-zA-Z!?])
759        for (i, ch) in stripped.char_indices() {
760            if ch == '<' {
761                let next = stripped[i + 1..].chars().next();
762                assert!(
763                    matches!(next, Some('/' | '!' | '?' | 'a'..='z' | 'A'..='Z')),
764                    "Bare < found at position {} outside of CDATA: ...{}...",
765                    i,
766                    &stripped[i.saturating_sub(10)..std::cmp::min(i + 20, stripped.len())]
767                );
768            }
769        }
770    }
771
772    // =========================================================================
773    // Markdown output correctness
774    // =========================================================================
775
776    /// Markdown skeleton map must indent symbol entries with two spaces under
777    /// their parent file bullet.
778    #[test]
779    fn test_markdown_skeleton_map_indentation() {
780        use crate::cache::CacheStore;
781
782        let cache = SqliteCache::new_in_memory().expect("failed to create test cache");
783        cache.init().expect("failed to init cache schema");
784
785        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
786        let file_path = make_temp_file(&dir, "lib.rs", "fn alpha() {}\n");
787
788        let file_id = cache
789            .upsert_file(file_path.to_string_lossy().as_ref(), "h2", b"fn alpha() {}")
790            .expect("upsert_file should succeed");
791        cache
792            .insert_symbol(&crate::cache::Symbol {
793                id: "s_alpha".to_string(),
794                file_id,
795                name: "alpha".to_string(),
796                kind: "function".to_string(),
797                byte_offset: 0,
798                byte_length: 13,
799            })
800            .expect("symbol insert should succeed");
801
802        let packer = Packer::new(
803            cache,
804            Parser {},
805            OutputFormat::Markdown,
806            None,
807            false,
808            true, // no_files — only generate skeleton map
809            false,
810            false,
811            false,
812        );
813        let output = packer.pack(&[file_path]).expect("pack should succeed");
814
815        // The file should appear as a bullet: "- <path>"
816        assert!(
817            output.contains("- "),
818            "File bullet not found in Markdown output"
819        );
820
821        // Each symbol under the file should be indented with two spaces: "  - kind name"
822        assert!(
823            output.contains("  - function alpha"),
824            "Symbol entries in skeleton map must be indented with two spaces; got:\n{}",
825            output
826        );
827    }
828
829    /// Markdown symbol names containing *, _, [, ], ` should appear verbatim and
830    /// must not break the overall Markdown skeleton structure (file bullet is still present).
831    #[test]
832    fn test_markdown_symbol_names_with_special_chars() {
833        use crate::cache::CacheStore;
834
835        let cache = SqliteCache::new_in_memory().expect("failed to create test cache");
836        cache.init().expect("failed to init cache schema");
837
838        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
839        let file_path = make_temp_file(&dir, "weird.rs", "fn weird() {}\n");
840
841        let file_id = cache
842            .upsert_file(file_path.to_string_lossy().as_ref(), "h3", b"fn weird() {}")
843            .expect("upsert_file should succeed");
844        // Symbol name with markdown special characters
845        cache
846            .insert_symbol(&crate::cache::Symbol {
847                id: "s_weird".to_string(),
848                file_id,
849                name: "*_[weird`_]*".to_string(),
850                kind: "function".to_string(),
851                byte_offset: 0,
852                byte_length: 13,
853            })
854            .expect("symbol insert should succeed");
855
856        let packer = Packer::new(
857            cache,
858            Parser {},
859            OutputFormat::Markdown,
860            None,
861            false,
862            true, // no_files
863            false,
864            false,
865            false,
866        );
867        let output = packer.pack(&[file_path]).expect("pack should succeed");
868
869        // The file bullet must still be present — structure is intact.
870        assert!(output.contains("- "), "File bullet disappeared");
871
872        // The weird symbol name should appear verbatim in the output.
873        assert!(
874            output.contains("*_[weird`_]*"),
875            "Symbol name with Markdown special chars should appear verbatim"
876        );
877    }
878
879    // =========================================================================
880    // Token governor
881    // =========================================================================
882
883    /// With a generous budget, all file content should be included.
884    #[test]
885    fn test_token_governor_generous_budget_includes_content() {
886        let (_dir, file_path) = make_temp_rs_file("fn main() { let x = 42; }\n");
887
888        let packer = Packer::new(
889            SqliteCache::new_in_memory().expect("failed to create test cache"),
890            Parser {},
891            OutputFormat::Xml,
892            Some(100_000), // very large budget
893            false,
894            false,
895            false,
896            false,
897            false,
898        );
899        let output = packer.pack(&[file_path]).expect("pack should succeed");
900
901        // Content block should be present.
902        assert!(
903            output.contains("<content><![CDATA["),
904            "Expected <content> block when budget is generous; got:\n{}",
905            output
906        );
907    }
908
909    /// With a budget of 1 token, content must be omitted (only skeleton map output).
910    #[test]
911    fn test_token_governor_one_token_budget_omits_content() {
912        let (_dir, file_path) = make_temp_rs_file("fn main() { let x = 42; }\n");
913
914        let packer = Packer::new(
915            SqliteCache::new_in_memory().expect("failed to create test cache"),
916            Parser {},
917            OutputFormat::Xml,
918            Some(1), // impossibly tight budget
919            false,
920            false,
921            false,
922            false,
923            false,
924        );
925        let result = packer.pack(&[file_path]);
926
927        // Must not panic or error.
928        assert!(result.is_ok(), "pack() must not error under tight budget");
929        let output = result.expect("pack should succeed");
930
931        // No file content should be present.
932        assert!(
933            !output.contains("<content>"),
934            "No <content> block expected when budget is 1 token"
935        );
936    }
937
938    /// Degradation due to token exhaustion must be graceful — no panic, no Err.
939    #[test]
940    fn test_token_governor_graceful_degradation_no_panic() {
941        let (_dir, file_path) =
942            make_temp_rs_file("fn a() { 1 }\nfn b() { 2 }\nfn c() { 3 }\nfn d() { 4 }\n");
943
944        for budget in [0usize, 1, 5, 50] {
945            let packer = Packer::new(
946                SqliteCache::new_in_memory().expect("failed to create test cache"),
947                Parser {},
948                OutputFormat::Xml,
949                Some(budget),
950                false,
951                false,
952                false,
953                false,
954                false,
955            );
956            let result = packer.pack(std::slice::from_ref(&file_path));
957            assert!(
958                result.is_ok(),
959                "pack() panicked or errored at max_tokens={}",
960                budget
961            );
962        }
963    }
964
965    // =========================================================================
966    // Flag combinations
967    // =========================================================================
968
969    /// no_files=true AND no_file_summary=true together — the output should be
970    /// minimal: just the opening/closing repository tags and nothing else.
971    #[test]
972    fn test_no_files_and_no_file_summary_together() {
973        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
974
975        let packer = Packer::new(
976            SqliteCache::new_in_memory().expect("failed to create test cache"),
977            Parser {},
978            OutputFormat::Xml,
979            None,
980            true, // no_file_summary
981            true, // no_files
982            false,
983            false,
984            false,
985        );
986        let output = packer.pack(&[file_path]).expect("pack should succeed");
987
988        // Only the repository wrapper should be present.
989        let trimmed = output.trim();
990        assert_eq!(
991            trimmed, "<repository>\n</repository>",
992            "With both no_files and no_file_summary, output should be just the repository tags; got:\n{}",
993            trimmed
994        );
995    }
996
997    /// remove_comments=true should strip `//` line comments from Rust source.
998    #[test]
999    fn test_remove_line_comments_from_rust() {
1000        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1001        // Use .txt so the parser falls back to raw content (no body elision complicates things).
1002        let file_path = make_temp_file(
1003            &dir,
1004            "comments.txt",
1005            "let x = 1; // this is a comment\nlet y = 2;\n",
1006        );
1007
1008        let packer = Packer::new(
1009            SqliteCache::new_in_memory().expect("failed to create test cache"),
1010            Parser {},
1011            OutputFormat::Xml,
1012            None,
1013            false,
1014            false,
1015            true, // remove_comments
1016            false,
1017            false,
1018        );
1019        let output = packer.pack(&[file_path]).expect("pack should succeed");
1020
1021        assert!(
1022            !output.contains("// this is a comment"),
1023            "Line comment should be stripped; got:\n{}",
1024            output
1025        );
1026        assert!(
1027            output.contains("let x = 1;"),
1028            "Non-comment code should remain after stripping line comments"
1029        );
1030    }
1031
1032    /// remove_comments=true should strip `/* */` block comments.
1033    #[test]
1034    fn test_remove_block_comments() {
1035        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1036        let file_path = make_temp_file(
1037            &dir,
1038            "block_comments.txt",
1039            "int x = /* inline block */ 42;\n/* multi\nline\ncomment */\nint y = 1;\n",
1040        );
1041
1042        let packer = Packer::new(
1043            SqliteCache::new_in_memory().expect("failed to create test cache"),
1044            Parser {},
1045            OutputFormat::Xml,
1046            None,
1047            false,
1048            false,
1049            true, // remove_comments
1050            false,
1051            false,
1052        );
1053        let output = packer.pack(&[file_path]).expect("pack should succeed");
1054
1055        assert!(
1056            !output.contains("inline block"),
1057            "Inline block comment should be stripped"
1058        );
1059        assert!(
1060            !output.contains("multi\nline\ncomment"),
1061            "Multi-line block comment should be stripped"
1062        );
1063        assert!(
1064            output.contains("int x ="),
1065            "Code outside block comment should be preserved"
1066        );
1067    }
1068
1069    /// remove_empty_lines=true should collapse multiple consecutive blank lines
1070    /// into a single newline.
1071    #[test]
1072    fn test_remove_empty_lines_collapses_blanks() {
1073        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1074        let file_path = make_temp_file(
1075            &dir,
1076            "blanks.txt",
1077            "line one\n\n\n\nline two\n\n\nline three\n",
1078        );
1079
1080        let packer = Packer::new(
1081            SqliteCache::new_in_memory().expect("failed to create test cache"),
1082            Parser {},
1083            OutputFormat::Xml,
1084            None,
1085            false,
1086            false,
1087            false,
1088            true, // remove_empty_lines
1089            false,
1090        );
1091        let output = packer.pack(&[file_path]).expect("pack should succeed");
1092
1093        // There must be no run of more than one blank line in the content.
1094        assert!(
1095            !output.contains("\n\n\n"),
1096            "Multiple consecutive blank lines should be collapsed to a single newline; got:\n{}",
1097            output
1098        );
1099        assert!(
1100            output.contains("line one"),
1101            "Non-blank lines must be preserved"
1102        );
1103        assert!(
1104            output.contains("line two"),
1105            "Non-blank lines must be preserved"
1106        );
1107    }
1108
1109    /// truncate_base64=true should replace strings of 100+ alphanumeric chars
1110    /// with the placeholder `[TRUNCATED_BASE64]`.
1111    #[test]
1112    fn test_truncate_base64_replaces_long_strings() {
1113        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1114        // Exactly 100 alphanumeric chars — the boundary that SHOULD be truncated.
1115        let long_token = "A".repeat(100);
1116        let content = format!("key = {}\n", long_token);
1117        let file_path = make_temp_file(&dir, "tokens.txt", &content);
1118
1119        let packer = Packer::new(
1120            SqliteCache::new_in_memory().expect("failed to create test cache"),
1121            Parser {},
1122            OutputFormat::Xml,
1123            None,
1124            false,
1125            false,
1126            false,
1127            false,
1128            true, // truncate_base64
1129        );
1130        let output = packer.pack(&[file_path]).expect("pack should succeed");
1131
1132        assert!(
1133            output.contains("[TRUNCATED_BASE64]"),
1134            "A 100-char alphanumeric string should be replaced with [TRUNCATED_BASE64]"
1135        );
1136        assert!(
1137            !output.contains(&long_token),
1138            "The original long token must not appear in output after truncation"
1139        );
1140    }
1141
1142    /// truncate_base64=true must NOT truncate strings of 99 characters or fewer.
1143    #[test]
1144    fn test_truncate_base64_preserves_short_strings() {
1145        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1146        // 99 alphanumeric chars — one below the truncation threshold.
1147        let short_token = "B".repeat(99);
1148        let content = format!("key = {}\n", short_token);
1149        let file_path = make_temp_file(&dir, "short_tokens.txt", &content);
1150
1151        let packer = Packer::new(
1152            SqliteCache::new_in_memory().expect("failed to create test cache"),
1153            Parser {},
1154            OutputFormat::Xml,
1155            None,
1156            false,
1157            false,
1158            false,
1159            false,
1160            true, // truncate_base64
1161        );
1162        let output = packer.pack(&[file_path]).expect("pack should succeed");
1163
1164        assert!(
1165            output.contains(&short_token),
1166            "A 99-char string must NOT be truncated"
1167        );
1168        assert!(
1169            !output.contains("[TRUNCATED_BASE64]"),
1170            "No truncation should occur for strings under 100 chars"
1171        );
1172    }
1173
1174    // =========================================================================
1175    // Multiple files
1176    // =========================================================================
1177
1178    /// Packer with 3 files: all three must appear in the skeleton map.
1179    #[test]
1180    fn test_three_files_all_appear_in_skeleton_map() {
1181        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1182        let f1 = make_temp_file(&dir, "one.txt", "content one\n");
1183        let f2 = make_temp_file(&dir, "two.txt", "content two\n");
1184        let f3 = make_temp_file(&dir, "three.txt", "content three\n");
1185
1186        let packer = Packer::new(
1187            SqliteCache::new_in_memory().expect("failed to create test cache"),
1188            Parser {},
1189            OutputFormat::Xml,
1190            None,
1191            false,
1192            false,
1193            false,
1194            false,
1195            false,
1196        );
1197        let output = packer.pack(&[f1, f2, f3]).expect("pack should succeed");
1198
1199        assert!(output.contains("one.txt"), "one.txt missing from output");
1200        assert!(output.contains("two.txt"), "two.txt missing from output");
1201        assert!(
1202            output.contains("three.txt"),
1203            "three.txt missing from output"
1204        );
1205    }
1206
1207    /// Files must appear in the skeleton map in the same order they were supplied
1208    /// to pack() — i.e., the ordering is deterministic.
1209    #[test]
1210    fn test_skeleton_map_preserves_input_order() {
1211        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1212        let f1 = make_temp_file(&dir, "alpha.txt", "alpha\n");
1213        let f2 = make_temp_file(&dir, "beta.txt", "beta\n");
1214        let f3 = make_temp_file(&dir, "gamma.txt", "gamma\n");
1215
1216        let packer = Packer::new(
1217            SqliteCache::new_in_memory().expect("failed to create test cache"),
1218            Parser {},
1219            OutputFormat::Xml,
1220            None,
1221            false,
1222            false,
1223            false,
1224            false,
1225            false,
1226        );
1227        let output = packer.pack(&[f1, f2, f3]).expect("pack should succeed");
1228
1229        let pos_alpha = output.find("alpha.txt").expect("alpha.txt not found");
1230        let pos_beta = output.find("beta.txt").expect("beta.txt not found");
1231        let pos_gamma = output.find("gamma.txt").expect("gamma.txt not found");
1232
1233        assert!(
1234            pos_alpha < pos_beta && pos_beta < pos_gamma,
1235            "Files must appear in the skeleton map in the order they were supplied"
1236        );
1237    }
1238
1239    // =========================================================================
1240    // Binary / missing files
1241    // =========================================================================
1242
1243    /// A file that exists when pack() starts being called but has been deleted
1244    /// before its content is read should be gracefully skipped — no panic, no Err,
1245    /// just a warning on stderr.
1246    #[test]
1247    fn test_deleted_file_is_gracefully_skipped() {
1248        let dir = tempfile::TempDir::new().expect("failed to create temp dir");
1249        let file_path = make_temp_file(&dir, "ephemeral.txt", "will be deleted\n");
1250
1251        // Delete the file before calling pack().
1252        std::fs::remove_file(&file_path).expect("failed to delete ephemeral file");
1253
1254        let packer = Packer::new(
1255            SqliteCache::new_in_memory().expect("failed to create test cache"),
1256            Parser {},
1257            OutputFormat::Xml,
1258            None,
1259            false,
1260            false,
1261            false,
1262            false,
1263            false,
1264        );
1265        let result = packer.pack(&[file_path]);
1266
1267        assert!(
1268            result.is_ok(),
1269            "pack() must not return Err when a file has been deleted; got: {:?}",
1270            result.err()
1271        );
1272
1273        let output = result.expect("pack should succeed even when file is deleted");
1274        // The output should still be a well-formed XML document.
1275        assert!(
1276            output.contains("<repository>"),
1277            "Output must start with <repository>"
1278        );
1279        assert!(
1280            output.trim_end().ends_with("</repository>"),
1281            "Output must end with </repository>"
1282        );
1283        // No content should be emitted for the missing file.
1284        assert!(
1285            !output.contains("will be deleted"),
1286            "Content of deleted file must not appear in output"
1287        );
1288    }
1289
1290    // =========================================================================
1291    // Metadata XML injection (Amber team gap #1)
1292    // =========================================================================
1293
1294    /// Plugin metadata keys and values containing XML-dangerous characters must be
1295    /// escaped before being written into the <metadata> element.
1296    ///
1297    /// This test describes CORRECT behavior. The current implementation does NOT
1298    /// escape metadata key/value strings — so this test is expected to FAIL until
1299    /// the implementation is fixed.
1300    #[test]
1301    fn test_plugin_metadata_xml_escaping() {
1302        struct XmlDangerousPlugin;
1303
1304        impl ContextPlugin for XmlDangerousPlugin {
1305            fn name(&self) -> &str {
1306                "xml_dangerous"
1307            }
1308
1309            fn detect(&self, _directory: &Path) -> bool {
1310                true
1311            }
1312
1313            fn enrich(&self, _file_path: &Path, base_bones: &mut Vec<Bone>) -> Result<()> {
1314                for bone in base_bones.iter_mut() {
1315                    // Key with XML-dangerous characters
1316                    bone.metadata.insert(
1317                        "key<with>&\"special".to_string(),
1318                        // Value that attempts XML injection: inject a sibling element
1319                        "</metadata><malicious>payload</malicious><metadata key=\"x\">".to_string(),
1320                    );
1321                }
1322                Ok(())
1323            }
1324        }
1325
1326        let (_dir, file_path) = make_temp_rs_file("fn main() {}\n");
1327        let mut packer = Packer::new(
1328            SqliteCache::new_in_memory().expect("failed to create test cache"),
1329            Parser {},
1330            OutputFormat::Xml,
1331            None,
1332            false,
1333            false,
1334            false,
1335            false,
1336            false,
1337        );
1338        packer.register_plugin(Box::new(XmlDangerousPlugin));
1339
1340        let output = packer.pack(&[file_path]).expect("pack should succeed");
1341
1342        // The raw injection string must NOT appear verbatim in the output.
1343        assert!(
1344            !output.contains("<malicious>"),
1345            "Bare <malicious> tag found in output — metadata value was not XML-escaped; got:\n{}",
1346            output
1347        );
1348        assert!(
1349            !output.contains("</malicious>"),
1350            "Bare </malicious> tag found in output — metadata value was not XML-escaped; got:\n{}",
1351            output
1352        );
1353
1354        // Escaped forms must be present instead.
1355        // The value contains '<' and '>' so at minimum &lt; and/or &gt; must appear.
1356        assert!(
1357            output.contains("&lt;") || output.contains("&gt;") || output.contains("&amp;"),
1358            "Expected XML-escaped entities (&lt;, &gt;, or &amp;) in metadata output; got:\n{}",
1359            output
1360        );
1361
1362        // The document must still be well-formed (closing tag present).
1363        assert!(
1364            output.contains("</repository>"),
1365            "Output must still contain </repository> after metadata injection; got:\n{}",
1366            output
1367        );
1368    }
1369}