Skip to main content

devboy_format_pipeline/
lib.rs

1//! Format pipeline for tool output transformation.
2//!
3//! Formats tool responses into a format optimized for LLM consumption:
4//!
5//! - **TOON** (default): Token-Oriented Object Notation -- saves 39-90% of tokens
6//! - **JSON**: for programmatic processing
7//! - **Budget trimming**: smart strategy-based trimming when output exceeds budget
8//!
9//! # Example
10//!
11//! ```ignore
12//! use devboy_format_pipeline::{Pipeline, PipelineConfig, OutputFormat};
13//! use devboy_core::Issue;
14//!
15//! let pipeline = Pipeline::with_config(PipelineConfig {
16//!     format: OutputFormat::Toon,
17//!     max_chars: 100_000,
18//!     ..Default::default()
19//! });
20//!
21//! let output = pipeline.transform_issues(issues)?;
22//! println!("{}", output.to_string_with_hints());
23//! ```
24
25#![deny(rustdoc::broken_intra_doc_links)]
26#![deny(rustdoc::private_intra_doc_links)]
27#![deny(rustdoc::invalid_html_tags)]
28pub mod adaptive_config;
29pub mod budget;
30pub mod dedup;
31pub(crate) mod dedup_util;
32pub mod enrichment;
33pub mod layered_pipeline;
34pub mod mckp_router;
35pub mod near_ref;
36pub mod page_index;
37pub mod pagination;
38pub mod projection;
39pub mod round_trip;
40pub mod shape;
41pub mod strategy;
42pub mod telemetry;
43pub mod templates;
44pub mod token_counter;
45pub mod tool_defaults;
46pub mod toon;
47pub mod tree;
48pub mod trim;
49pub mod truncation;
50
51pub use token_counter::{Tokenizer, estimate_tokens, tokens_to_chars};
52pub use truncation::TruncationPlugin;
53
54use devboy_core::{Comment, Discussion, FileDiff, Issue, MergeRequest, Result};
55
56use budget::BudgetConfig;
57use strategy::StrategyResolver;
58
/// Estimate the number of tokens that a character budget corresponds to.
///
/// Assumes a fixed ratio of 3.5 characters per token and rounds up, so any
/// non-zero character count maps to at least one token.
fn estimate_tokens_from_chars(chars: usize) -> usize {
    const CHARS_PER_TOKEN: f64 = 3.5;
    let tokens = chars as f64 / CHARS_PER_TOKEN;
    tokens.ceil() as usize
}
63
64/// Serialize a `Serialize` slice to JSON pretty, then route the JSON
65/// through the L2 MCKP shape dispatcher. Falls back to the pretty-printed
66/// JSON when no shape applies. The L0 dedup layer is host-side (per
67/// session) and is wired separately in P-203-04.
68fn encode_mckp<T: serde::Serialize>(items: &[T]) -> Result<String> {
69    let json = serde_json::to_string_pretty(items)?;
70    let cls = shape::classify(&json);
71    let cfg = adaptive_config::MckpConfig::default();
72    if let Some((_id, body)) = mckp_router::route(&cfg, &json, &cls) {
73        Ok(body)
74    } else {
75        Ok(json)
76    }
77}
78
79/// Output from a pipeline transformation.
80///
81/// Contains the transformed data and metadata about truncation/pagination.
82#[derive(Debug, Clone)]
83pub struct TransformOutput {
84    /// The transformed output (TOON or JSON string)
85    pub content: String,
86    /// Whether the output was truncated
87    pub truncated: bool,
88    /// Total count before truncation (if known)
89    pub total_count: Option<usize>,
90    /// Number of items actually included
91    pub included_count: usize,
92    /// Hint for the agent about hidden content
93    pub agent_hint: Option<String>,
94    /// Cursor for fetching the next page (if overflow exists)
95    pub page_cursor: Option<String>,
96    /// Page index for large results (when budget trimming is applied)
97    pub page_index: Option<page_index::PageIndex>,
98    /// Provider-level pagination metadata
99    pub provider_pagination: Option<devboy_core::Pagination>,
100    /// Provider-level sort metadata
101    pub provider_sort: Option<devboy_core::SortInfo>,
102    /// Size of raw input data before formatting (UTF-8 bytes)
103    pub raw_chars: usize,
104    /// Size of formatted output (UTF-8 bytes) — updated after apply_char_limit
105    pub output_chars: usize,
106    /// Size of output BEFORE budget trimming (UTF-8 bytes).
107    /// Set by apply_char_limit when truncation occurs.
108    pub pre_trim_chars: usize,
109}
110
111impl TransformOutput {
112    /// Create a new output with content.
113    pub fn new(content: String) -> Self {
114        let output_chars = content.len();
115        Self {
116            content,
117            truncated: false,
118            total_count: None,
119            included_count: 0,
120            agent_hint: None,
121            page_cursor: None,
122            page_index: None,
123            provider_pagination: None,
124            provider_sort: None,
125            raw_chars: 0,
126            output_chars,
127            pre_trim_chars: 0,
128        }
129    }
130
131    /// Set raw input size (before formatting).
132    pub fn with_raw_chars(mut self, raw_chars: usize) -> Self {
133        self.raw_chars = raw_chars;
134        self
135    }
136
137    /// Mark output as truncated with a hint.
138    pub fn with_truncation(mut self, total: usize, included: usize, hint: String) -> Self {
139        self.truncated = true;
140        self.total_count = Some(total);
141        self.included_count = included;
142        self.agent_hint = Some(hint);
143        self
144    }
145
146    /// Get the final output including page index and agent hints.
147    pub fn to_string_with_hints(&self) -> String {
148        let mut parts = Vec::new();
149
150        // Page index header (when budget trimming produced pages)
151        if let Some(index) = &self.page_index {
152            parts.push(index.to_toon());
153        }
154
155        // Main content
156        parts.push(self.content.clone());
157
158        // Agent hint footer
159        if let Some(hint) = &self.agent_hint {
160            parts.push(hint.clone());
161        }
162
163        parts.join("\n\n")
164    }
165}
166
167/// Configuration for pipeline transformations.
168#[derive(Debug, Clone)]
169pub struct PipelineConfig {
170    /// Maximum characters for the entire output (0 = no limit).
171    /// Used as budget ceiling — converted to tokens via `max_chars / 3.5`.
172    pub max_chars: usize,
173    /// Maximum characters per item (e.g., diff content)
174    pub max_chars_per_item: usize,
175    /// Maximum description/body length before truncation (only outliers get truncated)
176    pub max_description_len: usize,
177    pub format: OutputFormat,
178    /// Whether to include agent hints about truncation
179    pub include_hints: bool,
180    /// Page cursor from a previous request (for pagination)
181    pub page_cursor: Option<String>,
182    /// Tool name for strategy resolution (e.g., "get_issues", "get_merge_request_diffs")
183    pub tool_name: Option<String>,
184    /// Chunk number to return (1-based). When set, pipeline skips to that chunk
185    /// instead of returning chunk 1. Used for chunk index navigation.
186    pub chunk: Option<usize>,
187}
188
189impl Default for PipelineConfig {
190    fn default() -> Self {
191        Self {
192            max_chars: 100_000,
193            max_chars_per_item: 10_000,
194            max_description_len: 10_000,
195            format: OutputFormat::Toon,
196            include_hints: true,
197            page_cursor: None,
198            tool_name: None,
199            chunk: None,
200        }
201    }
202}
203
/// Formats a transformation can render into.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// TOON -- the token-optimized custom format. It wins on `cl100k_base`
    /// tokenizers but *loses* ~26% on `o200k_base` (the modern Anthropic /
    /// OpenAI family), so it is kept as a baseline rather than the
    /// recommended default. See Paper 2 §Savings Accounting.
    Toon,
    /// Pretty-printed JSON -- meant for programmatic processing.
    Json,
    /// MCKP v2 -- a format-adaptive encoder dispatched by structural shape.
    /// Object-wrapping-array shapes go through the union-of-keys table
    /// renderer (`deep_mckp_with_inner_table`); when no shape applies the
    /// encoder falls back to pretty-printed JSON. Tokenizer-agnostic — see
    /// Paper 2 §Encoder Bug Postmortem and §Savings Accounting.
    Mckp,
}
221
222/// Pipeline for chaining output transformations.
223pub struct Pipeline {
224    config: PipelineConfig,
225}
226
227impl Pipeline {
228    /// Create a new pipeline with default configuration.
229    pub fn new() -> Self {
230        Self {
231            config: PipelineConfig::default(),
232        }
233    }
234
235    /// Create a pipeline with custom configuration.
236    pub fn with_config(config: PipelineConfig) -> Self {
237        Self { config }
238    }
239
240    /// Transform a list of issues using budget pipeline.
241    pub fn transform_issues(&self, issues: Vec<Issue>) -> Result<TransformOutput> {
242        let total = issues.len();
243        let raw_json = serde_json::to_string(&issues)?;
244        let raw_chars = raw_json.len();
245
246        // First pass: check if all data fits in budget
247        let full_content = match self.config.format {
248            OutputFormat::Json => serde_json::to_string_pretty(&issues)?,
249            OutputFormat::Toon => toon::encode_issues(&issues, toon::TrimLevel::Full)?,
250            OutputFormat::Mckp => encode_mckp(&issues)?,
251        };
252
253        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
254            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
255            output.included_count = total;
256            return Ok(output);
257        }
258
259        // Budget pipeline: find how many items fit
260        let budget_config = self.budget_config();
261        let strategy_kind = self.resolve_strategy("get_issues");
262        let result = budget::process_issues(&issues, strategy_kind, &budget_config)?;
263        let chunk_size = result.included_items;
264
265        // Chunk navigation: if chunk > 1, slice to that chunk
266        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&issues, chunk_size);
267        if is_chunk_request {
268            let content = match self.config.format {
269                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
270                OutputFormat::Toon => toon::encode_issues(chunk_items, toon::TrimLevel::Full)?,
271                OutputFormat::Mckp => encode_mckp(chunk_items)?,
272            };
273            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
274            output.included_count = chunk_items.len();
275            output.total_count = Some(total);
276            return Ok(output);
277        }
278
279        // Chunk 1 (default): budget-trimmed best items + chunk index
280        let json_fallback = self.json_fallback(&full_content);
281        let index = page_index::build_issues_index(&issues, result.included_items);
282        self.build_budget_output(
283            result,
284            raw_chars,
285            total,
286            "issues",
287            Some(index),
288            json_fallback,
289        )
290    }
291
292    /// Transform a list of merge requests using budget pipeline.
293    pub fn transform_merge_requests(&self, mrs: Vec<MergeRequest>) -> Result<TransformOutput> {
294        let total = mrs.len();
295        let raw_json = serde_json::to_string(&mrs)?;
296        let raw_chars = raw_json.len();
297
298        let full_content = match self.config.format {
299            OutputFormat::Json => serde_json::to_string_pretty(&mrs)?,
300            OutputFormat::Toon => toon::encode_merge_requests(&mrs, toon::TrimLevel::Full)?,
301            OutputFormat::Mckp => encode_mckp(&mrs)?,
302        };
303
304        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
305            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
306            output.included_count = total;
307            return Ok(output);
308        }
309
310        let budget_config = self.budget_config();
311        let strategy_kind = self.resolve_strategy("get_merge_requests");
312        let result = budget::process_merge_requests(&mrs, strategy_kind, &budget_config)?;
313        let chunk_size = result.included_items;
314
315        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&mrs, chunk_size);
316        if is_chunk_request {
317            let content = match self.config.format {
318                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
319                OutputFormat::Toon => {
320                    toon::encode_merge_requests(chunk_items, toon::TrimLevel::Full)?
321                }
322                OutputFormat::Mckp => encode_mckp(chunk_items)?,
323            };
324            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
325            output.included_count = chunk_items.len();
326            output.total_count = Some(total);
327            return Ok(output);
328        }
329
330        let json_fallback = self.json_fallback(&full_content);
331        let index = page_index::build_merge_requests_index(&mrs, result.included_items);
332        self.build_budget_output(
333            result,
334            raw_chars,
335            total,
336            "merge_requests",
337            Some(index),
338            json_fallback,
339        )
340    }
341
342    /// Transform a list of file diffs using budget pipeline.
343    ///
344    /// Individual diff content is truncated per `max_chars_per_item` before
345    /// budget trimming to protect against giant lock/generated files.
346    pub fn transform_diffs(&self, diffs: Vec<FileDiff>) -> Result<TransformOutput> {
347        let total = diffs.len();
348
349        // Per-item truncation for individual diff content (protection against giant files)
350        let diffs: Vec<FileDiff> = diffs
351            .into_iter()
352            .map(|mut d| {
353                d.diff = truncation::truncate_string(&d.diff, self.config.max_chars_per_item);
354                d
355            })
356            .collect();
357
358        let raw_json = serde_json::to_string(&diffs)?;
359        let raw_chars = raw_json.len();
360
361        let full_content = match self.config.format {
362            OutputFormat::Json => serde_json::to_string_pretty(&diffs)?,
363            OutputFormat::Toon => toon::encode_diffs(&diffs)?,
364            OutputFormat::Mckp => encode_mckp(&diffs)?,
365        };
366
367        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
368            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
369            output.included_count = total;
370            return Ok(output);
371        }
372
373        let budget_config = self.budget_config();
374        let strategy_kind = self.resolve_strategy("get_merge_request_diffs");
375        let result = budget::process_diffs(&diffs, strategy_kind, &budget_config)?;
376        let chunk_size = result.included_items;
377
378        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&diffs, chunk_size);
379        if is_chunk_request {
380            let content = match self.config.format {
381                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
382                OutputFormat::Toon => toon::encode_diffs(chunk_items)?,
383                OutputFormat::Mckp => encode_mckp(chunk_items)?,
384            };
385            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
386            output.included_count = chunk_items.len();
387            output.total_count = Some(total);
388            return Ok(output);
389        }
390
391        let json_fallback = self.json_fallback(&full_content);
392        let index = page_index::build_diffs_index(&diffs, result.included_items);
393        self.build_budget_output(
394            result,
395            raw_chars,
396            total,
397            "diffs",
398            Some(index),
399            json_fallback,
400        )
401    }
402
403    /// Transform a list of comments using budget pipeline.
404    pub fn transform_comments(&self, comments: Vec<Comment>) -> Result<TransformOutput> {
405        let total = comments.len();
406        let raw_json = serde_json::to_string(&comments)?;
407        let raw_chars = raw_json.len();
408
409        let full_content = match self.config.format {
410            OutputFormat::Json => serde_json::to_string_pretty(&comments)?,
411            OutputFormat::Toon => toon::encode_comments(&comments)?,
412            OutputFormat::Mckp => encode_mckp(&comments)?,
413        };
414
415        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
416            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
417            output.included_count = total;
418            return Ok(output);
419        }
420
421        let budget_config = self.budget_config();
422        let strategy_kind = self.resolve_strategy("get_issue_comments");
423        let result = budget::process_comments(&comments, strategy_kind, &budget_config)?;
424        let chunk_size = result.included_items;
425
426        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&comments, chunk_size);
427        if is_chunk_request {
428            let content = match self.config.format {
429                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
430                OutputFormat::Toon => toon::encode_comments(chunk_items)?,
431                OutputFormat::Mckp => encode_mckp(chunk_items)?,
432            };
433            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
434            output.included_count = chunk_items.len();
435            output.total_count = Some(total);
436            return Ok(output);
437        }
438
439        let json_fallback = self.json_fallback(&full_content);
440        let index = page_index::build_comments_index(&comments, result.included_items);
441        self.build_budget_output(
442            result,
443            raw_chars,
444            total,
445            "comments",
446            Some(index),
447            json_fallback,
448        )
449    }
450
451    /// Transform a list of discussions using budget pipeline.
452    pub fn transform_discussions(&self, discussions: Vec<Discussion>) -> Result<TransformOutput> {
453        let total = discussions.len();
454        let raw_json = serde_json::to_string(&discussions)?;
455        let raw_chars = raw_json.len();
456
457        let full_content = match self.config.format {
458            OutputFormat::Json => serde_json::to_string_pretty(&discussions)?,
459            OutputFormat::Toon => toon::encode_discussions(&discussions)?,
460            OutputFormat::Mckp => encode_mckp(&discussions)?,
461        };
462
463        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
464            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
465            output.included_count = total;
466            return Ok(output);
467        }
468
469        let budget_config = self.budget_config();
470        let strategy_kind = self.resolve_strategy("get_merge_request_discussions");
471        let result = budget::process_discussions(&discussions, strategy_kind, &budget_config)?;
472        let chunk_size = result.included_items;
473
474        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&discussions, chunk_size);
475        if is_chunk_request {
476            let content = match self.config.format {
477                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
478                OutputFormat::Toon => toon::encode_discussions(chunk_items)?,
479                OutputFormat::Mckp => encode_mckp(chunk_items)?,
480            };
481            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
482            output.included_count = chunk_items.len();
483            output.total_count = Some(total);
484            return Ok(output);
485        }
486
487        let json_fallback = self.json_fallback(&full_content);
488        let index = page_index::build_discussions_index(&discussions, result.included_items);
489        self.build_budget_output(
490            result,
491            raw_chars,
492            total,
493            "discussions",
494            Some(index),
495            json_fallback,
496        )
497    }
498
499    /// When format is JSON, return the content for truncation fallback.
500    /// Budget pipeline always produces TOON, so for JSON we truncate the original JSON.
501    fn json_fallback(&self, content: &str) -> Option<String> {
502        if matches!(self.config.format, OutputFormat::Json) {
503            Some(content.to_string())
504        } else {
505            None
506        }
507    }
508
509    /// Slice items for a specific chunk number.
510    ///
511    /// When `config.chunk` is Some(n) with n > 1, we need to compute
512    /// the chunk boundaries and return only items for that chunk.
513    /// Returns (slice_items, is_chunk_request) — if not a chunk request,
514    /// returns all items.
515    fn slice_for_chunk<'a, T>(&self, items: &'a [T], chunk_size: usize) -> (&'a [T], bool) {
516        match self.config.chunk {
517            Some(n) if n > 1 && chunk_size > 0 => {
518                let offset = (n - 1) * chunk_size;
519                if offset >= items.len() {
520                    (&[], true) // chunk beyond data
521                } else {
522                    let end = (offset + chunk_size).min(items.len());
523                    (&items[offset..end], true)
524                }
525            }
526            _ => (items, false),
527        }
528    }
529
530    /// Convert max_chars to budget pipeline config.
531    fn budget_config(&self) -> BudgetConfig {
532        BudgetConfig {
533            budget_tokens: estimate_tokens_from_chars(self.config.max_chars),
534            ..Default::default()
535        }
536    }
537
538    /// Resolve trimming strategy for tool name.
539    fn resolve_strategy(&self, default_tool: &str) -> strategy::TrimStrategyKind {
540        let resolver = StrategyResolver::new();
541        let tool = self.config.tool_name.as_deref().unwrap_or(default_tool);
542        resolver.resolve(tool)
543    }
544
545    /// Build TransformOutput from BudgetResult with chunk index.
546    ///
547    /// Returns: chunk 1 (best items by strategy) + index of ALL chunks.
548    /// Agent can fetch remaining chunks via offset/limit in subsequent tool calls.
549    ///
550    /// Note: budget pipeline always produces TOON content. When format is JSON,
551    /// we fall back to simple character truncation of the JSON output instead.
552    fn build_budget_output(
553        &self,
554        result: budget::BudgetResult,
555        raw_chars: usize,
556        total: usize,
557        item_type: &str,
558        index: Option<page_index::PageIndex>,
559        json_fallback: Option<String>,
560    ) -> Result<TransformOutput> {
561        // Budget pipeline produces TOON. For JSON format, use truncated JSON instead.
562        let content = if matches!(self.config.format, OutputFormat::Json) {
563            if let Some(json) = json_fallback {
564                truncation::truncate_string(&json, self.config.max_chars)
565            } else {
566                result.content
567            }
568        } else {
569            result.content
570        };
571
572        let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
573        output.included_count = result.included_items;
574
575        // Always set truncation metadata when trimmed, regardless of include_hints
576        if result.trimmed {
577            output.truncated = true;
578            output.total_count = Some(total);
579
580            if self.config.include_hints {
581                if let Some(idx) = index {
582                    if idx.total_pages > 1 {
583                        let hint = format!(
584                            "Chunk 1/{}: {} most relevant {} (by priority). {} total items across {} chunks. \
585                            Use `chunk: N` parameter to fetch a specific chunk, or request all remaining data.",
586                            idx.total_pages,
587                            result.included_items,
588                            item_type,
589                            total,
590                            idx.total_pages
591                        );
592                        output.page_index = Some(idx);
593                        output.agent_hint = Some(hint);
594                    } else {
595                        let remaining = total.saturating_sub(result.included_items);
596                        output.agent_hint = Some(format!(
597                            "Showing {}/{} {}. {} items trimmed by budget.",
598                            result.included_items, total, item_type, remaining
599                        ));
600                    }
601                } else {
602                    let remaining = total.saturating_sub(result.included_items);
603                    output.agent_hint = Some(format!(
604                        "Showing {}/{} {}. {} items trimmed by budget. Use `chunk: N` parameter to fetch a specific chunk.",
605                        result.included_items, total, item_type, remaining
606                    ));
607                }
608            }
609        }
610
611        Ok(output)
612    }
613}
614
615impl Default for Pipeline {
616    fn default() -> Self {
617        Self::new()
618    }
619}
620
621#[cfg(test)]
622mod tests {
623    use super::*;
624    use devboy_core::User;
625
626    fn sample_issues() -> Vec<Issue> {
627        (1..=25)
628            .map(|i| Issue {
629                key: format!("gh#{}", i),
630                title: format!("Issue {}", i),
631                description: Some(format!("Description for issue {}", i)),
632                state: "open".to_string(),
633                source: "github".to_string(),
634                priority: None,
635                labels: vec!["bug".to_string()],
636                author: Some(User {
637                    id: "1".to_string(),
638                    username: "test".to_string(),
639                    name: None,
640                    email: None,
641                    avatar_url: None,
642                }),
643                assignees: vec![],
644                url: Some(format!("https://github.com/test/repo/issues/{}", i)),
645                created_at: Some("2024-01-01T00:00:00Z".to_string()),
646                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
647                attachments_count: None,
648                parent: None,
649                subtasks: vec![],
650            })
651            .collect()
652    }
653
654    fn sample_merge_requests() -> Vec<MergeRequest> {
655        (1..=5)
656            .map(|i| MergeRequest {
657                key: format!("mr#{}", i),
658                title: format!("MR {}", i),
659                description: Some(format!("MR description {}", i)),
660                state: "opened".to_string(),
661                source: "gitlab".to_string(),
662                source_branch: format!("feature-{}", i),
663                target_branch: "main".to_string(),
664                author: None,
665                assignees: vec![],
666                reviewers: vec![],
667                labels: vec![],
668                url: Some(format!(
669                    "https://gitlab.com/test/repo/-/merge_requests/{}",
670                    i
671                )),
672                created_at: Some("2024-01-01T00:00:00Z".to_string()),
673                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
674                draft: false,
675            })
676            .collect()
677    }
678
679    fn sample_diffs() -> Vec<FileDiff> {
680        (1..=5)
681            .map(|i| FileDiff {
682                file_path: format!("src/file_{}.rs", i),
683                old_path: None,
684                new_file: i == 1,
685                deleted_file: false,
686                renamed_file: false,
687                diff: format!("+added line {}\n-removed line {}", i, i),
688                additions: Some(1),
689                deletions: Some(1),
690            })
691            .collect()
692    }
693
694    fn sample_comments() -> Vec<Comment> {
695        (1..=5)
696            .map(|i| Comment {
697                id: format!("{}", i),
698                body: format!("Comment body {}", i),
699                author: None,
700                created_at: Some("2024-01-01T00:00:00Z".to_string()),
701                updated_at: None,
702                position: None,
703            })
704            .collect()
705    }
706
707    fn sample_discussions() -> Vec<Discussion> {
708        (1..=5)
709            .map(|i| Discussion {
710                id: format!("{}", i),
711                resolved: i % 2 == 0,
712                resolved_by: None,
713                comments: vec![Comment {
714                    id: format!("c{}", i),
715                    body: format!("Discussion comment {}", i),
716                    author: None,
717                    created_at: None,
718                    updated_at: None,
719                    position: None,
720                }],
721                position: None,
722            })
723            .collect()
724    }
725
726    // --- Pipeline truncation (budget-based) ---
727
728    #[test]
729    fn test_pipeline_truncates_items() {
730        // Use a small max_chars to force budget trimming
731        let pipeline = Pipeline::with_config(PipelineConfig {
732            max_chars: 200,
733            ..Default::default()
734        });
735
736        let issues = sample_issues();
737        let output = pipeline.transform_issues(issues).unwrap();
738
739        assert!(output.truncated);
740        assert_eq!(output.total_count, Some(25));
741        assert!(output.included_count < 25);
742        assert!(output.agent_hint.is_some());
743    }
744
745    #[test]
746    fn test_pipeline_no_truncation_when_under_limit() {
747        let pipeline = Pipeline::with_config(PipelineConfig {
748            max_chars: 100_000,
749            ..Default::default()
750        });
751
752        let issues: Vec<Issue> = sample_issues().into_iter().take(5).collect();
753        let output = pipeline.transform_issues(issues).unwrap();
754
755        assert!(!output.truncated);
756        assert!(output.agent_hint.is_none());
757    }
758
759    // --- Toon format ---
760
761    #[test]
762    fn test_toon_format_issues() {
763        let pipeline = Pipeline::with_config(PipelineConfig {
764            format: OutputFormat::Toon,
765            max_chars: 100_000,
766            ..Default::default()
767        });
768
769        let issues: Vec<Issue> = sample_issues().into_iter().take(3).collect();
770        let output = pipeline.transform_issues(issues).unwrap();
771
772        assert!(output.content.contains("gh#1"));
773        assert!(output.content.contains("Issue 1"));
774    }
775
776    #[test]
777    fn test_toon_format_merge_requests() {
778        // Use max_chars large enough to include some but not all MRs
779        let pipeline = Pipeline::with_config(PipelineConfig {
780            format: OutputFormat::Toon,
781            max_chars: 500,
782            ..Default::default()
783        });
784
785        let mrs = sample_merge_requests();
786        let output = pipeline.transform_merge_requests(mrs).unwrap();
787
788        assert!(output.content.contains("mr#1"));
789        assert!(output.content.contains("MR 1"));
790        assert!(output.truncated);
791        assert!(output.included_count < 5);
792    }
793
794    #[test]
795    fn test_toon_format_diffs() {
796        // Use max_chars small enough to force budget trimming of 5 diffs
797        let pipeline = Pipeline::with_config(PipelineConfig {
798            format: OutputFormat::Toon,
799            max_chars: 200,
800            ..Default::default()
801        });
802
803        let diffs = sample_diffs();
804        let output = pipeline.transform_diffs(diffs).unwrap();
805
806        assert!(output.content.contains("src/file_1.rs"));
807        assert!(output.truncated);
808        assert!(output.included_count < 5);
809    }
810
811    #[test]
812    fn test_toon_format_comments() {
813        // Use max_chars small enough to force budget trimming of 5 comments
814        // but large enough to include at least one comment with body text
815        let pipeline = Pipeline::with_config(PipelineConfig {
816            format: OutputFormat::Toon,
817            max_chars: 300,
818            ..Default::default()
819        });
820
821        let comments = sample_comments();
822        let output = pipeline.transform_comments(comments).unwrap();
823
824        // Budget trimming may drop early items; check that some comment body is present
825        assert!(output.content.contains("Comment body"));
826        assert!(output.truncated);
827        assert!(output.included_count < 5);
828    }
829
830    #[test]
831    fn test_toon_format_discussions() {
832        // Use max_chars large enough to include some but not all discussions
833        let pipeline = Pipeline::with_config(PipelineConfig {
834            format: OutputFormat::Toon,
835            max_chars: 500,
836            ..Default::default()
837        });
838
839        let discussions = sample_discussions();
840        let output = pipeline.transform_discussions(discussions).unwrap();
841
842        assert!(output.content.contains("Discussion comment 1"));
843        assert!(output.truncated);
844        assert!(output.included_count < 5);
845    }
846
847    // --- JSON format ---
848
849    #[test]
850    fn test_json_format_issues() {
851        let pipeline = Pipeline::with_config(PipelineConfig {
852            format: OutputFormat::Json,
853            max_chars: 100_000,
854            ..Default::default()
855        });
856
857        let issues: Vec<Issue> = sample_issues().into_iter().take(2).collect();
858        let output = pipeline.transform_issues(issues).unwrap();
859
860        let parsed: Vec<Issue> = serde_json::from_str(&output.content).unwrap();
861        assert_eq!(parsed.len(), 2);
862    }
863
864    #[test]
865    fn test_json_format_merge_requests() {
866        let pipeline = Pipeline::with_config(PipelineConfig {
867            format: OutputFormat::Json,
868            max_chars: 100_000,
869            ..Default::default()
870        });
871
872        let mrs: Vec<MergeRequest> = sample_merge_requests().into_iter().take(2).collect();
873        let output = pipeline.transform_merge_requests(mrs).unwrap();
874
875        let parsed: Vec<MergeRequest> = serde_json::from_str(&output.content).unwrap();
876        assert_eq!(parsed.len(), 2);
877    }
878
879    #[test]
880    fn test_json_format_diffs() {
881        let pipeline = Pipeline::with_config(PipelineConfig {
882            format: OutputFormat::Json,
883            max_chars: 100_000,
884            ..Default::default()
885        });
886
887        let diffs: Vec<FileDiff> = sample_diffs().into_iter().take(2).collect();
888        let output = pipeline.transform_diffs(diffs).unwrap();
889
890        let parsed: Vec<FileDiff> = serde_json::from_str(&output.content).unwrap();
891        assert_eq!(parsed.len(), 2);
892    }
893
894    #[test]
895    fn test_json_format_comments() {
896        let pipeline = Pipeline::with_config(PipelineConfig {
897            format: OutputFormat::Json,
898            max_chars: 100_000,
899            ..Default::default()
900        });
901
902        let comments: Vec<Comment> = sample_comments().into_iter().take(2).collect();
903        let output = pipeline.transform_comments(comments).unwrap();
904
905        let parsed: Vec<Comment> = serde_json::from_str(&output.content).unwrap();
906        assert_eq!(parsed.len(), 2);
907    }
908
909    #[test]
910    fn test_json_format_discussions() {
911        let pipeline = Pipeline::with_config(PipelineConfig {
912            format: OutputFormat::Json,
913            max_chars: 100_000,
914            ..Default::default()
915        });
916
917        let discussions: Vec<Discussion> = sample_discussions().into_iter().take(2).collect();
918        let output = pipeline.transform_discussions(discussions).unwrap();
919
920        let parsed: Vec<Discussion> = serde_json::from_str(&output.content).unwrap();
921        assert_eq!(parsed.len(), 2);
922    }
923
924    // --- TransformOutput ---
925
926    #[test]
927    fn test_transform_output_to_string_with_hints() {
928        let output = TransformOutput::new("content".to_string());
929        assert_eq!(output.to_string_with_hints(), "content");
930
931        let output = TransformOutput::new("content".to_string()).with_truncation(
932            10,
933            5,
934            "hint text".to_string(),
935        );
936        assert!(output.to_string_with_hints().contains("content"));
937        assert!(output.to_string_with_hints().contains("hint text"));
938    }
939
940    #[test]
941    fn test_transform_output_with_truncation() {
942        let output =
943            TransformOutput::new("data".into()).with_truncation(100, 10, "90 more items".into());
944        assert!(output.truncated);
945        assert_eq!(output.total_count, Some(100));
946        assert_eq!(output.included_count, 10);
947        assert_eq!(output.agent_hint.as_deref(), Some("90 more items"));
948    }
949
950    // --- PipelineConfig ---
951
952    #[test]
953    fn test_pipeline_config_default_values() {
954        let config = PipelineConfig::default();
955        assert_eq!(config.max_chars, 100_000);
956        assert_eq!(config.max_chars_per_item, 10_000);
957        assert_eq!(config.max_description_len, 10_000);
958        assert!(matches!(config.format, OutputFormat::Toon));
959        assert!(config.include_hints);
960    }
961
962    #[test]
963    fn test_pipeline_default() {
964        let pipeline = Pipeline::default();
965        let issues: Vec<Issue> = sample_issues().into_iter().take(1).collect();
966        let output = pipeline.transform_issues(issues).unwrap();
967        assert!(!output.content.is_empty());
968    }
969
970    #[test]
971    fn test_pipeline_hints_disabled() {
972        // Use small max_chars to trigger budget trimming, but with hints disabled
973        let pipeline = Pipeline::with_config(PipelineConfig {
974            max_chars: 200,
975            include_hints: false,
976            ..Default::default()
977        });
978
979        let issues = sample_issues();
980        let output = pipeline.transform_issues(issues).unwrap();
981
982        assert!(output.included_count < 25);
983        // truncated flag is always set when trimming occurs (for metadata consumers)
984        assert!(output.truncated);
985        // but agent_hint and page_index are suppressed when include_hints is false
986        assert!(output.agent_hint.is_none());
987        assert!(output.page_index.is_none());
988    }
989
990    // --- Character limit (budget-based) ---
991
992    #[test]
993    fn test_char_limit_applied() {
994        let pipeline = Pipeline::with_config(PipelineConfig {
995            max_chars: 100,
996            ..Default::default()
997        });
998
999        let issues = sample_issues();
1000        let output = pipeline.transform_issues(issues).unwrap();
1001
1002        assert!(output.truncated);
1003    }
1004
1005    #[test]
1006    fn test_char_limit_triggers_trimming() {
1007        let pipeline = Pipeline::with_config(PipelineConfig {
1008            max_chars: 50,
1009            ..Default::default()
1010        });
1011
1012        let issues: Vec<Issue> = sample_issues().into_iter().take(3).collect();
1013        let output = pipeline.transform_issues(issues).unwrap();
1014        assert!(output.truncated);
1015    }
1016
1017    // --- Empty collections ---
1018
1019    #[test]
1020    fn test_transform_empty_issues() {
1021        let pipeline = Pipeline::new();
1022        let output = pipeline.transform_issues(vec![]).unwrap();
1023        assert!(!output.truncated);
1024        assert_eq!(output.included_count, 0);
1025    }
1026
1027    #[test]
1028    fn test_transform_empty_merge_requests() {
1029        let pipeline = Pipeline::new();
1030        let output = pipeline.transform_merge_requests(vec![]).unwrap();
1031        assert!(!output.truncated);
1032        assert_eq!(output.included_count, 0);
1033    }
1034
1035    #[test]
1036    fn test_transform_empty_diffs() {
1037        let pipeline = Pipeline::new();
1038        let output = pipeline.transform_diffs(vec![]).unwrap();
1039        assert!(!output.truncated);
1040        assert_eq!(output.included_count, 0);
1041    }
1042
1043    #[test]
1044    fn test_transform_empty_comments() {
1045        let pipeline = Pipeline::new();
1046        let output = pipeline.transform_comments(vec![]).unwrap();
1047        assert!(!output.truncated);
1048        assert_eq!(output.included_count, 0);
1049    }
1050
1051    #[test]
1052    fn test_transform_empty_discussions() {
1053        let pipeline = Pipeline::new();
1054        let output = pipeline.transform_discussions(vec![]).unwrap();
1055        assert!(!output.truncated);
1056        assert_eq!(output.included_count, 0);
1057    }
1058
1059    // --- Diff truncation per item ---
1060
1061    #[test]
1062    fn test_diff_content_truncated_per_item() {
1063        let pipeline = Pipeline::with_config(PipelineConfig {
1064            max_chars_per_item: 10,
1065            max_chars: 100_000,
1066            ..Default::default()
1067        });
1068
1069        let diffs = vec![FileDiff {
1070            file_path: "big.rs".into(),
1071            old_path: None,
1072            new_file: false,
1073            deleted_file: false,
1074            renamed_file: false,
1075            diff: "x".repeat(1000),
1076            additions: Some(100),
1077            deletions: Some(0),
1078        }];
1079
1080        let output = pipeline.transform_diffs(diffs).unwrap();
1081        assert!(output.content.len() < 1000);
1082    }
1083
1084    // --- TOON smaller than JSON ---
1085
1086    // --- JSON format with budget trimming (triggers json_fallback) ---
1087
1088    #[test]
1089    fn test_json_format_with_budget_trimming_issues() {
1090        let pipeline = Pipeline::with_config(PipelineConfig {
1091            format: OutputFormat::Json,
1092            max_chars: 200,
1093            ..Default::default()
1094        });
1095
1096        let issues = sample_issues();
1097        let output = pipeline.transform_issues(issues).unwrap();
1098
1099        assert!(output.truncated);
1100        assert!(output.included_count < 25);
1101        // Content should be truncated JSON (not TOON)
1102        assert!(!output.content.is_empty());
1103    }
1104
1105    #[test]
1106    fn test_json_format_with_budget_trimming_merge_requests() {
1107        let pipeline = Pipeline::with_config(PipelineConfig {
1108            format: OutputFormat::Json,
1109            max_chars: 200,
1110            ..Default::default()
1111        });
1112
1113        let mrs = sample_merge_requests();
1114        let output = pipeline.transform_merge_requests(mrs).unwrap();
1115
1116        assert!(output.truncated);
1117        assert!(!output.content.is_empty());
1118    }
1119
1120    #[test]
1121    fn test_json_format_with_budget_trimming_diffs() {
1122        let pipeline = Pipeline::with_config(PipelineConfig {
1123            format: OutputFormat::Json,
1124            max_chars: 100,
1125            ..Default::default()
1126        });
1127
1128        let diffs = sample_diffs();
1129        let output = pipeline.transform_diffs(diffs).unwrap();
1130
1131        assert!(output.truncated);
1132        assert!(!output.content.is_empty());
1133    }
1134
1135    #[test]
1136    fn test_json_format_with_budget_trimming_comments() {
1137        let pipeline = Pipeline::with_config(PipelineConfig {
1138            format: OutputFormat::Json,
1139            max_chars: 100,
1140            ..Default::default()
1141        });
1142
1143        let comments = sample_comments();
1144        let output = pipeline.transform_comments(comments).unwrap();
1145
1146        assert!(output.truncated);
1147        assert!(!output.content.is_empty());
1148    }
1149
1150    #[test]
1151    fn test_json_format_with_budget_trimming_discussions() {
1152        let pipeline = Pipeline::with_config(PipelineConfig {
1153            format: OutputFormat::Json,
1154            max_chars: 100,
1155            ..Default::default()
1156        });
1157
1158        let discussions = sample_discussions();
1159        let output = pipeline.transform_discussions(discussions).unwrap();
1160
1161        assert!(output.truncated);
1162        assert!(!output.content.is_empty());
1163    }
1164
1165    // --- Chunk index hints (total_pages > 1) ---
1166
1167    #[test]
1168    fn test_pipeline_chunk_index_with_many_issues() {
1169        // Use enough issues and small budget to trigger multi-page chunk index
1170        let issues: Vec<Issue> = (1..=50)
1171            .map(|i| Issue {
1172                key: format!("gh#{}", i),
1173                title: format!("Issue {} with a moderately long title for sizing", i),
1174                description: Some(format!(
1175                    "Description for issue {} with substantial content to inflate token count significantly beyond budget",
1176                    i
1177                )),
1178                state: "open".to_string(),
1179                source: "github".to_string(),
1180                priority: None,
1181                labels: vec!["bug".to_string(), "critical".to_string()],
1182                author: Some(User {
1183                    id: "1".to_string(),
1184                    username: "test".to_string(),
1185                    name: None,
1186                    email: None,
1187                    avatar_url: None,
1188                }),
1189                assignees: vec![],
1190                url: Some(format!("https://github.com/test/repo/issues/{}", i)),
1191                created_at: Some("2024-01-01T00:00:00Z".to_string()),
1192                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
1193                attachments_count: None,
1194            parent: None,
1195                subtasks: vec![],
1196            })
1197            .collect();
1198
1199        let pipeline = Pipeline::with_config(PipelineConfig {
1200            max_chars: 500,
1201            include_hints: true,
1202            ..Default::default()
1203        });
1204
1205        let output = pipeline.transform_issues(issues).unwrap();
1206
1207        assert!(output.truncated);
1208        assert!(output.included_count < 50);
1209        // When many items are trimmed, we expect page_index and chunk hint
1210        if let Some(ref hint) = output.agent_hint {
1211            assert!(
1212                hint.contains("Chunk") || hint.contains("Showing"),
1213                "Expected chunk or showing hint, got: {}",
1214                hint
1215            );
1216        }
1217    }
1218
1219    #[test]
1220    fn test_toon_smaller_than_json_for_issues() {
1221        let issues: Vec<Issue> = sample_issues().into_iter().take(10).collect();
1222
1223        let json_pipeline = Pipeline::with_config(PipelineConfig {
1224            format: OutputFormat::Json,
1225            max_chars: 1_000_000,
1226            ..Default::default()
1227        });
1228        let toon_pipeline = Pipeline::with_config(PipelineConfig {
1229            format: OutputFormat::Toon,
1230            max_chars: 1_000_000,
1231            ..Default::default()
1232        });
1233
1234        let json_output = json_pipeline.transform_issues(issues.clone()).unwrap();
1235        let toon_output = toon_pipeline.transform_issues(issues).unwrap();
1236
1237        assert!(
1238            toon_output.content.len() < json_output.content.len(),
1239            "TOON ({}) should be smaller than JSON ({})",
1240            toon_output.content.len(),
1241            json_output.content.len()
1242        );
1243    }
1244
1245    #[test]
1246    fn test_mckp_routes_issues_through_inner_table() {
1247        let issues: Vec<Issue> = sample_issues().into_iter().take(10).collect();
1248
1249        let mckp_pipeline = Pipeline::with_config(PipelineConfig {
1250            format: OutputFormat::Mckp,
1251            max_chars: 1_000_000,
1252            ..Default::default()
1253        });
1254        let json_pipeline = Pipeline::with_config(PipelineConfig {
1255            format: OutputFormat::Json,
1256            max_chars: 1_000_000,
1257            ..Default::default()
1258        });
1259
1260        let mckp_out = mckp_pipeline.transform_issues(issues.clone()).unwrap();
1261        let json_out = json_pipeline.transform_issues(issues).unwrap();
1262
1263        // MCKP must beat the pretty-printed JSON baseline on this shape
1264        // (array of objects → routes to `csv` via try_array_csv).
1265        assert!(
1266            mckp_out.content.len() < json_out.content.len(),
1267            "MCKP ({}) should be smaller than JSON ({})",
1268            mckp_out.content.len(),
1269            json_out.content.len(),
1270        );
1271        // Round-trip key parity: every Issue field still appears in the
1272        // output (the encoder bug regression).
1273        for k in ["key", "title", "state", "source"] {
1274            assert!(
1275                mckp_out.content.contains(k),
1276                "MCKP output is missing field `{k}`: {}",
1277                &mckp_out.content[..mckp_out.content.len().min(200)]
1278            );
1279        }
1280    }
1281
1282    #[test]
1283    fn test_mckp_falls_back_to_pretty_json_on_unstable_keys() {
1284        // Single issue → array length 1, below the min_items threshold for
1285        // try_array_csv. encode_mckp must not crash; it should fall back
1286        // to the pretty JSON.
1287        let issues: Vec<Issue> = sample_issues().into_iter().take(1).collect();
1288        let mckp_pipeline = Pipeline::with_config(PipelineConfig {
1289            format: OutputFormat::Mckp,
1290            max_chars: 1_000_000,
1291            ..Default::default()
1292        });
1293        let out = mckp_pipeline.transform_issues(issues).unwrap();
1294        assert!(out.content.contains("gh#1"));
1295    }
1296}