Skip to main content

argyph_pack/
lib.rs

1#![forbid(unsafe_code)]
2
3mod priority;
4pub mod render;
5pub mod tokenize;
6pub mod truncate;
7
8use camino::{Utf8Path, Utf8PathBuf};
9use std::time::SystemTime;
10
11use priority::prioritize;
12use tokenize::TokenCounter;
13use truncate::truncate_file;
14
15/// Errors that can occur during repository packing.
16#[derive(Debug, thiserror::Error)]
17pub enum PackError {
18    /// The scope resolved to zero candidate files — nothing to pack.
19    #[error("empty scope: no files to pack")]
20    EmptyScope,
21    /// The token budget cannot even cover per-file overhead for the candidate
22    /// set. The value is the minimum budget required.
23    #[error("token budget too small: {0} bytes minimum required")]
24    BudgetTooSmall(usize),
25    /// An I/O or other infrastructure error prevented reading a file.
26    #[error("IO error: {0}")]
27    Io(String),
28}
29
30/// Convenience alias for [`std::result::Result`] with [`PackError`].
31pub type Result<T> = std::result::Result<T, PackError>;
32
33/// What subset of the repository to pack.
34///
35/// `All` includes every file the [`PackContext`] considers in scope. `Paths`
36/// restricts to explicit paths. `Symbol` packs files that contain a named
37/// symbol (lookup provided by the context).
38pub enum PackScope {
39    All,
40    Paths(Vec<Utf8PathBuf>),
41    Symbol(String),
42}
43
/// Output format for the packed representation.
///
/// The enum is a plain tag with no payload, so it is `Copy` and comparable;
/// callers can store or echo the requested format without matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PackFormat {
    /// Primary format — XML with `<file>` elements and CDATA-wrapped content.
    Xml,
    /// Human-friendly format — markdown with fenced code blocks.
    Markdown,
}
52
/// Flags controlling which categories of files are included in the pack.
///
/// The [`Default`] value has both flags `false`, i.e. tests and docs are
/// excluded unless explicitly requested.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct PackInclude {
    /// When `true`, test files (e.g. `*_test.rs`, `tests/` directory) are
    /// included.
    pub tests: bool,
    /// When `true`, documentation files (`.md`) are included.
    pub docs: bool,
}
61
62/// Describes a pack operation to be executed by a [`Packer`].
63pub struct PackRequest {
64    pub scope: PackScope,
65    pub format: PackFormat,
66    /// Soft token budget — the packer attempts to stay under this limit.
67    pub token_budget: usize,
68    pub include: PackInclude,
69}
70
71/// The result of a successful pack operation.
72#[derive(Debug)]
73pub struct PackResult {
74    pub format: PackFormat,
75    /// The rendered output string (XML or markdown).
76    pub content: String,
77    /// Token count of the final output as measured by the default tokenizer.
78    pub token_count: usize,
79    /// Files that were included in full.
80    pub files_included: Vec<Utf8PathBuf>,
81    /// Files that were included but truncated to fit the budget.
82    pub files_truncated: Vec<Utf8PathBuf>,
83    /// Files that could not be included at all under the budget.
84    pub files_omitted: Vec<Utf8PathBuf>,
85}
86
87/// Abstracts filesystem access and metadata needed by the packer.
88///
89/// All path arguments use [`Utf8Path`] from the `camino` crate. Implementors
90/// do not need to perform `std::path` conversions — the packer only operates
91/// on UTF-8 paths.
92pub trait PackContext {
93    /// Return every file path that matches the given scope.
94    fn list_files(&self, scope: &PackScope) -> Vec<Utf8PathBuf>;
95    /// Read the full contents of `file` as a string.
96    fn read(&self, file: &Utf8Path) -> Result<String>;
97    /// Return the last-modified timestamp of `file`, if available.
98    fn modified(&self, file: &Utf8Path) -> Option<SystemTime>;
99    /// Return the count of inbound edges (callers, importers) for `file` in
100    /// the symbol graph. Returns 0 if no graph data is available.
101    fn in_edges(&self, file: &Utf8Path) -> Result<usize>;
102}
103
104/// Produces a token-budgeted, flattened representation of a repository or
105/// subset.
106pub trait Packer {
107    /// Execute a pack operation given a request and a context for file access.
108    fn pack(&self, req: &PackRequest, ctx: &dyn PackContext) -> Result<PackResult>;
109}
110
111/// Default implementation of [`Packer`] using the `cl100k_base` tokenizer and
112/// a priority heuristic that sorts files by entry points, docs, recency, and
113/// symbol-graph centrality before lexicographic order.
114pub struct DefaultPacker {
115    counter: TokenCounter,
116}
117
118impl DefaultPacker {
119    /// Create a new `DefaultPacker`, initialising the tokenizer.
120    pub fn new() -> Result<Self> {
121        Ok(Self {
122            counter: TokenCounter::new()?,
123        })
124    }
125}
126
127impl Packer for DefaultPacker {
128    fn pack(&self, req: &PackRequest, ctx: &dyn PackContext) -> Result<PackResult> {
129        let mut files = ctx.list_files(&req.scope);
130
131        if !req.include.tests {
132            files.retain(|f| !is_test_file(f));
133        }
134        if !req.include.docs {
135            files.retain(|f| !is_doc_file(f));
136        }
137
138        if files.is_empty() {
139            return Err(PackError::EmptyScope);
140        }
141
142        let ordered = prioritize(&files, ctx);
143
144        let (file_entries, _budget_used) =
145            self.read_with_budget(&ordered, req.token_budget, &req.format, ctx)?;
146
147        let mut files_included = Vec::new();
148        let mut files_truncated = Vec::new();
149        for (path, _, is_truncated, _) in &file_entries {
150            if *is_truncated {
151                files_truncated.push(path.clone());
152            } else {
153                files_included.push(path.clone());
154            }
155        }
156
157        let included_set: std::collections::HashSet<_> =
158            file_entries.iter().map(|(p, _, _, _)| p).collect();
159        let files_omitted: Vec<_> = ordered
160            .iter()
161            .filter(|p| !included_set.contains(p))
162            .cloned()
163            .collect();
164
165        let file_refs: Vec<(Utf8PathBuf, &str, bool, usize)> = file_entries
166            .iter()
167            .map(|(p, c, t, n)| (p.clone(), c.as_str(), *t, *n))
168            .collect();
169
170        let content = match req.format {
171            PackFormat::Xml => render::xml::render_xml(&file_refs, "repository"),
172            PackFormat::Markdown => render::markdown::render_markdown(&file_refs, "repository"),
173        };
174
175        let token_count = self.counter.count(&content);
176
177        Ok(PackResult {
178            format: match req.format {
179                PackFormat::Xml => PackFormat::Xml,
180                PackFormat::Markdown => PackFormat::Markdown,
181            },
182            content,
183            token_count,
184            files_included,
185            files_truncated,
186            files_omitted,
187        })
188    }
189}
190
191// ── Private helpers ──────────────────────────────────────────────────────────
192
193/// Internal representation of a file packed into the result.
194type FileEntry = (Utf8PathBuf, String, bool, usize);
195
196impl DefaultPacker {
197    fn read_with_budget(
198        &self,
199        ordered: &[Utf8PathBuf],
200        budget: usize,
201        format: &PackFormat,
202        ctx: &dyn PackContext,
203    ) -> Result<(Vec<FileEntry>, usize)> {
204        let overhead_per_file: usize = match format {
205            PackFormat::Xml => 120,
206            PackFormat::Markdown => 80,
207        };
208
209        let total_overhead = overhead_per_file.saturating_mul(ordered.len());
210        if total_overhead >= budget {
211            return Err(PackError::BudgetTooSmall(total_overhead));
212        }
213
214        let mut remaining_budget = budget.saturating_sub(total_overhead);
215        let mut entries: Vec<(Utf8PathBuf, String, bool, usize)> = Vec::new();
216        let file_count = ordered.len();
217
218        for (idx, file) in ordered.iter().enumerate() {
219            if remaining_budget == 0 {
220                break;
221            }
222
223            let remaining_files = file_count.saturating_sub(entries.len());
224            let per_file = remaining_budget / remaining_files.max(1);
225            if per_file == 0 {
226                break;
227            }
228
229            let content = match ctx.read(file) {
230                Ok(c) => c,
231                Err(_) => continue,
232            };
233
234            let full_count = self.counter.count(&content);
235
236            if full_count <= per_file {
237                remaining_budget = remaining_budget.saturating_sub(full_count);
238                entries.push((file.clone(), content, false, full_count));
239            } else {
240                let (truncated, trunc_count) = truncate_file(&content, per_file, &self.counter);
241                remaining_budget = remaining_budget.saturating_sub(trunc_count);
242                // Only include if we got some content
243                if trunc_count > 0 {
244                    entries.push((file.clone(), truncated, true, trunc_count));
245                }
246                // If truncation produced nothing useful, the file is omitted
247            }
248
249            // Mark the file position as used (handled implicitly by loop)
250            let _ = idx;
251        }
252
253        let total_used = entries.iter().map(|(_, _, _, c)| c).sum::<usize>()
254            + overhead_per_file.saturating_mul(entries.len());
255
256        Ok((entries, total_used))
257    }
258}
259
260/// Returns `true` when `path` looks like a test file or lives inside a test
261/// directory.
262fn is_test_file(path: &Utf8Path) -> bool {
263    let file_name = path.file_name().unwrap_or("");
264    // Filename patterns
265    if file_name.ends_with("_test.rs")
266        || file_name.ends_with("_test.ts")
267        || file_name.ends_with("_test.tsx")
268        || file_name.ends_with("_test.js")
269        || file_name.ends_with("_test.jsx")
270        || file_name.ends_with("_test.py")
271        || file_name.ends_with("_spec.ts")
272        || file_name.ends_with("_spec.js")
273        || file_name.ends_with("test.py")
274    {
275        return true;
276    }
277    // Directory patterns
278    let path_str = path.as_str();
279    if path_str.contains("/test/")
280        || path_str.contains("/tests/")
281        || path_str.starts_with("test/")
282        || path_str.starts_with("tests/")
283        || path_str.contains("/__tests__/")
284        || path_str.contains("/spec/")
285    {
286        return true;
287    }
288    false
289}
290
291/// Returns `true` when `path` refers to a documentation file (currently only
292/// `.md` files).
293fn is_doc_file(path: &Utf8Path) -> bool {
294    path.extension() == Some("md")
295}
296
297// ── Tests ────────────────────────────────────────────────────────────────────
298
299#[cfg(test)]
300mod test_util;
301
302#[cfg(test)]
303mod snapshot_tests;
304
#[cfg(test)]
// Unwrapping is fine in tests: a failed unwrap is itself a test failure.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;
    use crate::test_util::*;
    use std::collections::HashMap;

    // ── helpers ──────────────────────────────────────────────────────────

    /// Build a [`TestContext`] holding the given `(path, content)` pairs.
    fn context(files: &[(&str, &str)]) -> TestContext {
        let map: HashMap<_, _> = files
            .iter()
            .map(|&(p, content)| (path(p), content.to_string()))
            .collect();
        TestContext::with_content_files(map)
    }

    /// Build a whole-repository request with the given format, budget and
    /// include flags.
    fn request(format: PackFormat, token_budget: usize, include: PackInclude) -> PackRequest {
        PackRequest {
            scope: PackScope::All,
            format,
            token_budget,
            include,
        }
    }

    /// Include flags that keep neither tests nor docs.
    fn source_only() -> PackInclude {
        PackInclude {
            tests: false,
            docs: false,
        }
    }

    /// Include flags that keep both tests and docs.
    fn everything() -> PackInclude {
        PackInclude {
            tests: true,
            docs: true,
        }
    }

    /// Index of `wanted` within `paths`; panics if it is absent.
    fn position_of(paths: &[Utf8PathBuf], wanted: &str) -> usize {
        paths
            .iter()
            .position(|p| p.as_str() == wanted)
            .unwrap_or_else(|| panic!("{wanted} is missing from {paths:?}"))
    }

    // ── types + errors ───────────────────────────────────────────────────

    #[test]
    fn empty_scope_errors() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[]);
        let req = request(PackFormat::Xml, 1000, source_only());
        match packer.pack(&req, &ctx) {
            Err(PackError::EmptyScope) => {}
            other => panic!("expected EmptyScope, got {other:?}"),
        }
    }

    #[test]
    fn budget_too_small_errors() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[("src/main.rs", "fn main() { println!(\"hello\"); }")]);
        // far too small for overhead
        let req = request(PackFormat::Xml, 10, source_only());
        match packer.pack(&req, &ctx) {
            Err(PackError::BudgetTooSmall(_)) => {}
            other => panic!("expected BudgetTooSmall, got {other:?}"),
        }
    }

    // ── basic packing ────────────────────────────────────────────────────

    #[test]
    fn pack_single_file_in_full() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[("src/main.rs", "fn main() {}")]);
        let req = request(PackFormat::Xml, 500, source_only());
        let result = packer.pack(&req, &ctx).unwrap();
        assert_eq!(result.files_included.len(), 1);
        assert_eq!(result.files_truncated.len(), 0);
        assert_eq!(result.files_omitted.len(), 0);
        assert!(result.token_count > 0);
        assert!(result.content.contains("fn main() {}"));
    }

    #[test]
    fn pack_multiple_files_orders_by_priority() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[
            ("src/utils.rs", "// utils"),
            ("src/lib.rs", "// lib"),
            ("README.md", "# Readme"),
        ]);
        let req = request(PackFormat::Xml, 2000, everything());
        let result = packer.pack(&req, &ctx).unwrap();
        // lib.rs (entry point) should appear before README.md (top-level doc)
        // which should appear before utils.rs
        let lib_pos = position_of(&result.files_included, "src/lib.rs");
        let readme_pos = position_of(&result.files_included, "README.md");
        let utils_pos = position_of(&result.files_included, "src/utils.rs");
        assert!(lib_pos < readme_pos);
        assert!(readme_pos < utils_pos);
    }

    #[test]
    fn pack_excludes_tests_when_flag_false() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[
            ("src/lib.rs", "// lib"),
            ("src/lib_test.rs", "// test"),
            ("tests/integration.rs", "// integration"),
        ]);
        let include = PackInclude {
            tests: false,
            docs: true,
        };
        let req = request(PackFormat::Xml, 2000, include);
        let result = packer.pack(&req, &ctx).unwrap();
        assert_eq!(result.files_included.len(), 1);
        assert_eq!(result.files_included[0].as_str(), "src/lib.rs");
    }

    #[test]
    fn pack_includes_tests_when_flag_true() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[("src/lib.rs", "// lib"), ("src/lib_test.rs", "// test")]);
        let req = request(PackFormat::Xml, 2000, everything());
        let result = packer.pack(&req, &ctx).unwrap();
        assert_eq!(result.files_included.len(), 2);
    }

    #[test]
    fn pack_truncates_when_budget_tight() {
        let packer = DefaultPacker::new().unwrap();
        // A large file that can't fit under a small budget: the same short
        // phrase repeated on 200 lines.
        let big_content: String = std::iter::repeat_n("fn unique_word_", 200)
            .collect::<Vec<_>>()
            .join("\n");
        let ctx = context(&[("src/big.rs", big_content.as_str())]);
        let req = request(PackFormat::Xml, 200, source_only());
        let result = packer.pack(&req, &ctx).unwrap();
        // The large file should not be included in full — it is either
        // truncated or omitted entirely depending on budget allocation.
        assert_eq!(result.files_included.len(), 0);
        assert_eq!(
            result.files_truncated.len() + result.files_omitted.len(),
            1
        );
        if !result.files_truncated.is_empty() {
            assert!(result.content.contains("[truncated"));
        }
    }

    #[test]
    fn markdown_format_output() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[(
            "src/lib.rs",
            "pub fn add(a: i32, b: i32) -> i32 { a + b }",
        )]);
        let req = request(PackFormat::Markdown, 2000, source_only());
        let result = packer.pack(&req, &ctx).unwrap();
        assert!(result.content.starts_with("# Repository:"));
        assert!(result.content.contains("```rust"));
        assert!(result.content.contains("## File:"));
    }

    #[test]
    fn pack_result_is_deterministic() {
        let packer = DefaultPacker::new().unwrap();
        let ctx = context(&[("a.rs", "// a"), ("b.rs", "// b")]);
        let req = request(PackFormat::Xml, 2000, source_only());
        let r1 = packer.pack(&req, &ctx).unwrap();
        let r2 = packer.pack(&req, &ctx).unwrap();
        assert_eq!(r1.content, r2.content);
        assert_eq!(r1.token_count, r2.token_count);
    }

    #[test]
    fn file_content_is_preserved_in_output() {
        let packer = DefaultPacker::new().unwrap();
        let content = "fn hello() -> &'static str { \"world\" }";
        let ctx = context(&[("src/greeting.rs", content)]);
        let req = request(PackFormat::Markdown, 2000, source_only());
        let result = packer.pack(&req, &ctx).unwrap();
        assert!(result.content.contains(content));
    }
}