Skip to main content

devboy_format_pipeline/
lib.rs

//! Format pipeline for tool output transformation.
//!
//! Formats tool responses into an optimal format for LLM consumption:
//!
//! - **TOON** (default): Token-Oriented Object Notation -- saves 39-90% of tokens
//! - **JSON**: for programmatic processing
//! - **MCKP**: format-adaptive encoder dispatched by structural shape
//! - **Budget trimming**: smart strategy-based trimming when output exceeds budget
//!
//! # Example
//!
//! ```ignore
//! use devboy_format_pipeline::{Pipeline, PipelineConfig, OutputFormat};
//! use devboy_core::Issue;
//!
//! let pipeline = Pipeline::with_config(PipelineConfig {
//!     format: OutputFormat::Toon,
//!     max_chars: 100_000,
//!     ..Default::default()
//! });
//!
//! let output = pipeline.transform_issues(issues)?;
//! println!("{}", output.to_string_with_hints());
//! ```
24
25#![deny(rustdoc::broken_intra_doc_links)]
26#![deny(rustdoc::private_intra_doc_links)]
27#![deny(rustdoc::invalid_html_tags)]
28pub mod adaptive_config;
29pub mod budget;
30pub mod dedup;
31pub(crate) mod dedup_util;
32pub mod enrichment;
33pub mod layered_pipeline;
34pub mod mckp_router;
35pub mod near_ref;
36pub mod page_index;
37pub mod pagination;
38pub mod projection;
39pub mod round_trip;
40pub mod shape;
41pub mod strategy;
42pub mod telemetry;
43pub mod templates;
44pub mod token_counter;
45pub mod tool_defaults;
46pub mod toon;
47pub mod tree;
48pub mod trim;
49pub mod truncation;
50
51pub use token_counter::{Tokenizer, estimate_tokens, tokens_to_chars};
52pub use truncation::TruncationPlugin;
53
54use devboy_core::{Comment, Discussion, FileDiff, Issue, MergeRequest, Result};
55
56use budget::BudgetConfig;
57use strategy::StrategyResolver;
58
/// Estimate how many tokens a character budget corresponds to.
///
/// Divides by the fixed ratio of 3.5 characters per token and rounds up,
/// so every non-zero character count maps to at least one token.
fn estimate_tokens_from_chars(chars: usize) -> usize {
    const CHARS_PER_TOKEN: f64 = 3.5;
    let tokens = chars as f64 / CHARS_PER_TOKEN;
    tokens.ceil() as usize
}
63
64/// Serialize a `Serialize` slice to JSON pretty, then route the JSON
65/// through the L2 MCKP shape dispatcher. Falls back to the pretty-printed
66/// JSON when no shape applies. The L0 dedup layer is host-side (per
67/// session) and is wired separately in P-203-04.
68fn encode_mckp<T: serde::Serialize>(items: &[T]) -> Result<String> {
69    let json = serde_json::to_string_pretty(items)?;
70    let cls = shape::classify(&json);
71    let cfg = adaptive_config::MckpConfig::default();
72    if let Some((_id, body)) = mckp_router::route(&cfg, &json, &cls) {
73        Ok(body)
74    } else {
75        Ok(json)
76    }
77}
78
79/// Output from a pipeline transformation.
80///
81/// Contains the transformed data and metadata about truncation/pagination.
82#[derive(Debug, Clone)]
83pub struct TransformOutput {
84    /// The transformed output (TOON or JSON string)
85    pub content: String,
86    /// Whether the output was truncated
87    pub truncated: bool,
88    /// Total count before truncation (if known)
89    pub total_count: Option<usize>,
90    /// Number of items actually included
91    pub included_count: usize,
92    /// Hint for the agent about hidden content
93    pub agent_hint: Option<String>,
94    /// Cursor for fetching the next page (if overflow exists)
95    pub page_cursor: Option<String>,
96    /// Page index for large results (when budget trimming is applied)
97    pub page_index: Option<page_index::PageIndex>,
98    /// Provider-level pagination metadata
99    pub provider_pagination: Option<devboy_core::Pagination>,
100    /// Provider-level sort metadata
101    pub provider_sort: Option<devboy_core::SortInfo>,
102    /// Size of raw input data before formatting (UTF-8 bytes)
103    pub raw_chars: usize,
104    /// Size of formatted output (UTF-8 bytes) — updated after apply_char_limit
105    pub output_chars: usize,
106    /// Size of output BEFORE budget trimming (UTF-8 bytes).
107    /// Set by apply_char_limit when truncation occurs.
108    pub pre_trim_chars: usize,
109}
110
111impl TransformOutput {
112    /// Create a new output with content.
113    pub fn new(content: String) -> Self {
114        let output_chars = content.len();
115        Self {
116            content,
117            truncated: false,
118            total_count: None,
119            included_count: 0,
120            agent_hint: None,
121            page_cursor: None,
122            page_index: None,
123            provider_pagination: None,
124            provider_sort: None,
125            raw_chars: 0,
126            output_chars,
127            pre_trim_chars: 0,
128        }
129    }
130
131    /// Set raw input size (before formatting).
132    pub fn with_raw_chars(mut self, raw_chars: usize) -> Self {
133        self.raw_chars = raw_chars;
134        self
135    }
136
137    /// Mark output as truncated with a hint.
138    pub fn with_truncation(mut self, total: usize, included: usize, hint: String) -> Self {
139        self.truncated = true;
140        self.total_count = Some(total);
141        self.included_count = included;
142        self.agent_hint = Some(hint);
143        self
144    }
145
146    /// Get the final output including page index and agent hints.
147    pub fn to_string_with_hints(&self) -> String {
148        let mut parts = Vec::new();
149
150        // Page index header (when budget trimming produced pages)
151        if let Some(index) = &self.page_index {
152            parts.push(index.to_toon());
153        }
154
155        // Main content
156        parts.push(self.content.clone());
157
158        // Agent hint footer
159        if let Some(hint) = &self.agent_hint {
160            parts.push(hint.clone());
161        }
162
163        parts.join("\n\n")
164    }
165}
166
167/// Configuration for pipeline transformations.
168#[derive(Debug, Clone)]
169pub struct PipelineConfig {
170    /// Maximum characters for the entire output (0 = no limit).
171    /// Used as budget ceiling — converted to tokens via `max_chars / 3.5`.
172    pub max_chars: usize,
173    /// Maximum characters per item (e.g., diff content)
174    pub max_chars_per_item: usize,
175    /// Maximum description/body length before truncation (only outliers get truncated)
176    pub max_description_len: usize,
177    pub format: OutputFormat,
178    /// Whether to include agent hints about truncation
179    pub include_hints: bool,
180    /// Page cursor from a previous request (for pagination)
181    pub page_cursor: Option<String>,
182    /// Tool name for strategy resolution (e.g., "get_issues", "get_merge_request_diffs")
183    pub tool_name: Option<String>,
184    /// Chunk number to return (1-based). When set, pipeline skips to that chunk
185    /// instead of returning chunk 1. Used for chunk index navigation.
186    pub chunk: Option<usize>,
187}
188
189impl Default for PipelineConfig {
190    fn default() -> Self {
191        Self {
192            max_chars: 100_000,
193            max_chars_per_item: 10_000,
194            max_description_len: 10_000,
195            format: OutputFormat::Toon,
196            include_hints: true,
197            page_cursor: None,
198            tool_name: None,
199            chunk: None,
200        }
201    }
202}
203
/// Encodings a pipeline can render its output in.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OutputFormat {
    /// TOON -- the token-optimized custom format. Wins on `cl100k_base`
    /// tokenizers but *loses* ~26% on `o200k_base` (the modern Anthropic /
    /// OpenAI family). Kept as a baseline; not the recommended default.
    /// See Paper 2 §Savings Accounting.
    Toon,
    /// Pretty-printed JSON -- for programmatic processing.
    Json,
    /// MCKP v2 -- format-adaptive encoder dispatched by structural shape.
    /// Routes object-wrapping-array shapes through the union-of-keys table
    /// renderer (`deep_mckp_with_inner_table`) and falls back to
    /// pretty-printed JSON when the router finds no applicable shape.
    /// Tokenizer-agnostic — see Paper 2 §Encoder Bug Postmortem and
    /// §Savings Accounting.
    Mckp,
}
221
222/// Pipeline for chaining output transformations.
223pub struct Pipeline {
224    config: PipelineConfig,
225}
226
227impl Pipeline {
228    /// Create a new pipeline with default configuration.
229    pub fn new() -> Self {
230        Self {
231            config: PipelineConfig::default(),
232        }
233    }
234
235    /// Create a pipeline with custom configuration.
236    pub fn with_config(config: PipelineConfig) -> Self {
237        Self { config }
238    }
239
240    /// Transform a list of issues using budget pipeline.
241    pub fn transform_issues(&self, issues: Vec<Issue>) -> Result<TransformOutput> {
242        let total = issues.len();
243        let raw_json = serde_json::to_string(&issues)?;
244        let raw_chars = raw_json.len();
245
246        // First pass: check if all data fits in budget
247        let full_content = match self.config.format {
248            OutputFormat::Json => serde_json::to_string_pretty(&issues)?,
249            OutputFormat::Toon => toon::encode_issues(&issues, toon::TrimLevel::Full)?,
250            OutputFormat::Mckp => encode_mckp(&issues)?,
251        };
252
253        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
254            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
255            output.included_count = total;
256            return Ok(output);
257        }
258
259        // Budget pipeline: find how many items fit
260        let budget_config = self.budget_config();
261        let strategy_kind = self.resolve_strategy("get_issues");
262        let result = budget::process_issues(&issues, strategy_kind, &budget_config)?;
263        let chunk_size = result.included_items;
264
265        // Chunk navigation: if chunk > 1, slice to that chunk
266        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&issues, chunk_size);
267        if is_chunk_request {
268            let content = match self.config.format {
269                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
270                OutputFormat::Toon => toon::encode_issues(chunk_items, toon::TrimLevel::Full)?,
271                OutputFormat::Mckp => encode_mckp(chunk_items)?,
272            };
273            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
274            output.included_count = chunk_items.len();
275            output.total_count = Some(total);
276            return Ok(output);
277        }
278
279        // Chunk 1 (default): budget-trimmed best items + chunk index
280        let json_fallback = self.json_fallback(&full_content);
281        let index = page_index::build_issues_index(&issues, result.included_items);
282        self.build_budget_output(
283            result,
284            raw_chars,
285            total,
286            "issues",
287            Some(index),
288            json_fallback,
289        )
290    }
291
292    /// Transform a list of merge requests using budget pipeline.
293    pub fn transform_merge_requests(&self, mrs: Vec<MergeRequest>) -> Result<TransformOutput> {
294        let total = mrs.len();
295        let raw_json = serde_json::to_string(&mrs)?;
296        let raw_chars = raw_json.len();
297
298        let full_content = match self.config.format {
299            OutputFormat::Json => serde_json::to_string_pretty(&mrs)?,
300            OutputFormat::Toon => toon::encode_merge_requests(&mrs, toon::TrimLevel::Full)?,
301            OutputFormat::Mckp => encode_mckp(&mrs)?,
302        };
303
304        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
305            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
306            output.included_count = total;
307            return Ok(output);
308        }
309
310        let budget_config = self.budget_config();
311        let strategy_kind = self.resolve_strategy("get_merge_requests");
312        let result = budget::process_merge_requests(&mrs, strategy_kind, &budget_config)?;
313        let chunk_size = result.included_items;
314
315        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&mrs, chunk_size);
316        if is_chunk_request {
317            let content = match self.config.format {
318                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
319                OutputFormat::Toon => {
320                    toon::encode_merge_requests(chunk_items, toon::TrimLevel::Full)?
321                }
322                OutputFormat::Mckp => encode_mckp(chunk_items)?,
323            };
324            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
325            output.included_count = chunk_items.len();
326            output.total_count = Some(total);
327            return Ok(output);
328        }
329
330        let json_fallback = self.json_fallback(&full_content);
331        let index = page_index::build_merge_requests_index(&mrs, result.included_items);
332        self.build_budget_output(
333            result,
334            raw_chars,
335            total,
336            "merge_requests",
337            Some(index),
338            json_fallback,
339        )
340    }
341
342    /// Transform a list of file diffs using budget pipeline.
343    ///
344    /// Individual diff content is truncated per `max_chars_per_item` before
345    /// budget trimming to protect against giant lock/generated files.
346    pub fn transform_diffs(&self, diffs: Vec<FileDiff>) -> Result<TransformOutput> {
347        let total = diffs.len();
348
349        // Per-item truncation for individual diff content (protection against giant files)
350        let diffs: Vec<FileDiff> = diffs
351            .into_iter()
352            .map(|mut d| {
353                d.diff = truncation::truncate_string(&d.diff, self.config.max_chars_per_item);
354                d
355            })
356            .collect();
357
358        let raw_json = serde_json::to_string(&diffs)?;
359        let raw_chars = raw_json.len();
360
361        let full_content = match self.config.format {
362            OutputFormat::Json => serde_json::to_string_pretty(&diffs)?,
363            OutputFormat::Toon => toon::encode_diffs(&diffs)?,
364            OutputFormat::Mckp => encode_mckp(&diffs)?,
365        };
366
367        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
368            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
369            output.included_count = total;
370            return Ok(output);
371        }
372
373        let budget_config = self.budget_config();
374        let strategy_kind = self.resolve_strategy("get_merge_request_diffs");
375        let result = budget::process_diffs(&diffs, strategy_kind, &budget_config)?;
376        let chunk_size = result.included_items;
377
378        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&diffs, chunk_size);
379        if is_chunk_request {
380            let content = match self.config.format {
381                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
382                OutputFormat::Toon => toon::encode_diffs(chunk_items)?,
383                OutputFormat::Mckp => encode_mckp(chunk_items)?,
384            };
385            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
386            output.included_count = chunk_items.len();
387            output.total_count = Some(total);
388            return Ok(output);
389        }
390
391        let json_fallback = self.json_fallback(&full_content);
392        let index = page_index::build_diffs_index(&diffs, result.included_items);
393        self.build_budget_output(
394            result,
395            raw_chars,
396            total,
397            "diffs",
398            Some(index),
399            json_fallback,
400        )
401    }
402
403    /// Transform a list of comments using budget pipeline.
404    pub fn transform_comments(&self, comments: Vec<Comment>) -> Result<TransformOutput> {
405        let total = comments.len();
406        let raw_json = serde_json::to_string(&comments)?;
407        let raw_chars = raw_json.len();
408
409        let full_content = match self.config.format {
410            OutputFormat::Json => serde_json::to_string_pretty(&comments)?,
411            OutputFormat::Toon => toon::encode_comments(&comments)?,
412            OutputFormat::Mckp => encode_mckp(&comments)?,
413        };
414
415        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
416            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
417            output.included_count = total;
418            return Ok(output);
419        }
420
421        let budget_config = self.budget_config();
422        let strategy_kind = self.resolve_strategy("get_issue_comments");
423        let result = budget::process_comments(&comments, strategy_kind, &budget_config)?;
424        let chunk_size = result.included_items;
425
426        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&comments, chunk_size);
427        if is_chunk_request {
428            let content = match self.config.format {
429                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
430                OutputFormat::Toon => toon::encode_comments(chunk_items)?,
431                OutputFormat::Mckp => encode_mckp(chunk_items)?,
432            };
433            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
434            output.included_count = chunk_items.len();
435            output.total_count = Some(total);
436            return Ok(output);
437        }
438
439        let json_fallback = self.json_fallback(&full_content);
440        let index = page_index::build_comments_index(&comments, result.included_items);
441        self.build_budget_output(
442            result,
443            raw_chars,
444            total,
445            "comments",
446            Some(index),
447            json_fallback,
448        )
449    }
450
451    /// Transform a list of discussions using budget pipeline.
452    pub fn transform_discussions(&self, discussions: Vec<Discussion>) -> Result<TransformOutput> {
453        let total = discussions.len();
454        let raw_json = serde_json::to_string(&discussions)?;
455        let raw_chars = raw_json.len();
456
457        let full_content = match self.config.format {
458            OutputFormat::Json => serde_json::to_string_pretty(&discussions)?,
459            OutputFormat::Toon => toon::encode_discussions(&discussions)?,
460            OutputFormat::Mckp => encode_mckp(&discussions)?,
461        };
462
463        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
464            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
465            output.included_count = total;
466            return Ok(output);
467        }
468
469        let budget_config = self.budget_config();
470        let strategy_kind = self.resolve_strategy("get_merge_request_discussions");
471        let result = budget::process_discussions(&discussions, strategy_kind, &budget_config)?;
472        let chunk_size = result.included_items;
473
474        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&discussions, chunk_size);
475        if is_chunk_request {
476            let content = match self.config.format {
477                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
478                OutputFormat::Toon => toon::encode_discussions(chunk_items)?,
479                OutputFormat::Mckp => encode_mckp(chunk_items)?,
480            };
481            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
482            output.included_count = chunk_items.len();
483            output.total_count = Some(total);
484            return Ok(output);
485        }
486
487        let json_fallback = self.json_fallback(&full_content);
488        let index = page_index::build_discussions_index(&discussions, result.included_items);
489        self.build_budget_output(
490            result,
491            raw_chars,
492            total,
493            "discussions",
494            Some(index),
495            json_fallback,
496        )
497    }
498
499    /// When format is JSON, return the content for truncation fallback.
500    /// Budget pipeline always produces TOON, so for JSON we truncate the original JSON.
501    fn json_fallback(&self, content: &str) -> Option<String> {
502        if matches!(self.config.format, OutputFormat::Json) {
503            Some(content.to_string())
504        } else {
505            None
506        }
507    }
508
509    /// Slice items for a specific chunk number.
510    ///
511    /// When `config.chunk` is Some(n) with n > 1, we need to compute
512    /// the chunk boundaries and return only items for that chunk.
513    /// Returns (slice_items, is_chunk_request) — if not a chunk request,
514    /// returns all items.
515    fn slice_for_chunk<'a, T>(&self, items: &'a [T], chunk_size: usize) -> (&'a [T], bool) {
516        match self.config.chunk {
517            Some(n) if n > 1 && chunk_size > 0 => {
518                let offset = (n - 1) * chunk_size;
519                if offset >= items.len() {
520                    (&[], true) // chunk beyond data
521                } else {
522                    let end = (offset + chunk_size).min(items.len());
523                    (&items[offset..end], true)
524                }
525            }
526            _ => (items, false),
527        }
528    }
529
530    /// Convert max_chars to budget pipeline config.
531    fn budget_config(&self) -> BudgetConfig {
532        BudgetConfig {
533            budget_tokens: estimate_tokens_from_chars(self.config.max_chars),
534            ..Default::default()
535        }
536    }
537
538    /// Resolve trimming strategy for tool name.
539    fn resolve_strategy(&self, default_tool: &str) -> strategy::TrimStrategyKind {
540        let resolver = StrategyResolver::new();
541        let tool = self.config.tool_name.as_deref().unwrap_or(default_tool);
542        resolver.resolve(tool)
543    }
544
545    /// Build TransformOutput from BudgetResult with chunk index.
546    ///
547    /// Returns: chunk 1 (best items by strategy) + index of ALL chunks.
548    /// Agent can fetch remaining chunks via offset/limit in subsequent tool calls.
549    ///
550    /// Note: budget pipeline always produces TOON content. When format is JSON,
551    /// we fall back to simple character truncation of the JSON output instead.
552    fn build_budget_output(
553        &self,
554        result: budget::BudgetResult,
555        raw_chars: usize,
556        total: usize,
557        item_type: &str,
558        index: Option<page_index::PageIndex>,
559        json_fallback: Option<String>,
560    ) -> Result<TransformOutput> {
561        // Budget pipeline produces TOON. For JSON format, use truncated JSON instead.
562        let content = if matches!(self.config.format, OutputFormat::Json) {
563            if let Some(json) = json_fallback {
564                truncation::truncate_string(&json, self.config.max_chars)
565            } else {
566                result.content
567            }
568        } else {
569            result.content
570        };
571
572        let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
573        output.included_count = result.included_items;
574
575        // Always set truncation metadata when trimmed, regardless of include_hints
576        if result.trimmed {
577            output.truncated = true;
578            output.total_count = Some(total);
579
580            if self.config.include_hints {
581                if let Some(idx) = index {
582                    if idx.total_pages > 1 {
583                        let hint = format!(
584                            "Chunk 1/{}: {} most relevant {} (by priority). {} total items across {} chunks. \
585                            Use `chunk: N` parameter to fetch a specific chunk, or request all remaining data.",
586                            idx.total_pages,
587                            result.included_items,
588                            item_type,
589                            total,
590                            idx.total_pages
591                        );
592                        output.page_index = Some(idx);
593                        output.agent_hint = Some(hint);
594                    } else {
595                        let remaining = total.saturating_sub(result.included_items);
596                        output.agent_hint = Some(format!(
597                            "Showing {}/{} {}. {} items trimmed by budget.",
598                            result.included_items, total, item_type, remaining
599                        ));
600                    }
601                } else {
602                    let remaining = total.saturating_sub(result.included_items);
603                    output.agent_hint = Some(format!(
604                        "Showing {}/{} {}. {} items trimmed by budget. Use `chunk: N` parameter to fetch a specific chunk.",
605                        result.included_items, total, item_type, remaining
606                    ));
607                }
608            }
609        }
610
611        Ok(output)
612    }
613}
614
615impl Default for Pipeline {
616    fn default() -> Self {
617        Self::new()
618    }
619}
620
621#[cfg(test)]
622mod tests {
623    use super::*;
624    use devboy_core::User;
625
626    fn sample_issues() -> Vec<Issue> {
627        (1..=25)
628            .map(|i| Issue {
629                key: format!("gh#{}", i),
630                title: format!("Issue {}", i),
631                description: Some(format!("Description for issue {}", i)),
632                state: "open".to_string(),
633                source: "github".to_string(),
634                priority: None,
635                labels: vec!["bug".to_string()],
636                author: Some(User {
637                    id: "1".to_string(),
638                    username: "test".to_string(),
639                    name: None,
640                    email: None,
641                    avatar_url: None,
642                }),
643                assignees: vec![],
644                url: Some(format!("https://github.com/test/repo/issues/{}", i)),
645                created_at: Some("2024-01-01T00:00:00Z".to_string()),
646                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
647                attachments_count: None,
648                parent: None,
649                subtasks: vec![],
650                custom_fields: std::collections::HashMap::new(),
651                ..Default::default()
652            })
653            .collect()
654    }
655
656    fn sample_merge_requests() -> Vec<MergeRequest> {
657        (1..=5)
658            .map(|i| MergeRequest {
659                key: format!("mr#{}", i),
660                title: format!("MR {}", i),
661                description: Some(format!("MR description {}", i)),
662                state: "opened".to_string(),
663                source: "gitlab".to_string(),
664                source_branch: format!("feature-{}", i),
665                target_branch: "main".to_string(),
666                author: None,
667                assignees: vec![],
668                reviewers: vec![],
669                labels: vec![],
670                url: Some(format!(
671                    "https://gitlab.com/test/repo/-/merge_requests/{}",
672                    i
673                )),
674                created_at: Some("2024-01-01T00:00:00Z".to_string()),
675                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
676                draft: false,
677            })
678            .collect()
679    }
680
681    fn sample_diffs() -> Vec<FileDiff> {
682        (1..=5)
683            .map(|i| FileDiff {
684                file_path: format!("src/file_{}.rs", i),
685                old_path: None,
686                new_file: i == 1,
687                deleted_file: false,
688                renamed_file: false,
689                diff: format!("+added line {}\n-removed line {}", i, i),
690                additions: Some(1),
691                deletions: Some(1),
692            })
693            .collect()
694    }
695
696    fn sample_comments() -> Vec<Comment> {
697        (1..=5)
698            .map(|i| Comment {
699                id: format!("{}", i),
700                body: format!("Comment body {}", i),
701                author: None,
702                created_at: Some("2024-01-01T00:00:00Z".to_string()),
703                updated_at: None,
704                position: None,
705            })
706            .collect()
707    }
708
709    fn sample_discussions() -> Vec<Discussion> {
710        (1..=5)
711            .map(|i| Discussion {
712                id: format!("{}", i),
713                resolved: i % 2 == 0,
714                resolved_by: None,
715                comments: vec![Comment {
716                    id: format!("c{}", i),
717                    body: format!("Discussion comment {}", i),
718                    author: None,
719                    created_at: None,
720                    updated_at: None,
721                    position: None,
722                }],
723                position: None,
724            })
725            .collect()
726    }
727
728    // --- Pipeline truncation (budget-based) ---
729
730    #[test]
731    fn test_pipeline_truncates_items() {
732        // Use a small max_chars to force budget trimming
733        let pipeline = Pipeline::with_config(PipelineConfig {
734            max_chars: 200,
735            ..Default::default()
736        });
737
738        let issues = sample_issues();
739        let output = pipeline.transform_issues(issues).unwrap();
740
741        assert!(output.truncated);
742        assert_eq!(output.total_count, Some(25));
743        assert!(output.included_count < 25);
744        assert!(output.agent_hint.is_some());
745    }
746
747    #[test]
748    fn test_pipeline_no_truncation_when_under_limit() {
749        let pipeline = Pipeline::with_config(PipelineConfig {
750            max_chars: 100_000,
751            ..Default::default()
752        });
753
754        let issues: Vec<Issue> = sample_issues().into_iter().take(5).collect();
755        let output = pipeline.transform_issues(issues).unwrap();
756
757        assert!(!output.truncated);
758        assert!(output.agent_hint.is_none());
759    }
760
761    // --- Toon format ---
762
763    #[test]
764    fn test_toon_format_issues() {
765        let pipeline = Pipeline::with_config(PipelineConfig {
766            format: OutputFormat::Toon,
767            max_chars: 100_000,
768            ..Default::default()
769        });
770
771        let issues: Vec<Issue> = sample_issues().into_iter().take(3).collect();
772        let output = pipeline.transform_issues(issues).unwrap();
773
774        assert!(output.content.contains("gh#1"));
775        assert!(output.content.contains("Issue 1"));
776    }
777
778    #[test]
779    fn test_toon_format_merge_requests() {
780        // Use max_chars large enough to include some but not all MRs
781        let pipeline = Pipeline::with_config(PipelineConfig {
782            format: OutputFormat::Toon,
783            max_chars: 500,
784            ..Default::default()
785        });
786
787        let mrs = sample_merge_requests();
788        let output = pipeline.transform_merge_requests(mrs).unwrap();
789
790        assert!(output.content.contains("mr#1"));
791        assert!(output.content.contains("MR 1"));
792        assert!(output.truncated);
793        assert!(output.included_count < 5);
794    }
795
796    #[test]
797    fn test_toon_format_diffs() {
798        // Use max_chars small enough to force budget trimming of 5 diffs
799        let pipeline = Pipeline::with_config(PipelineConfig {
800            format: OutputFormat::Toon,
801            max_chars: 200,
802            ..Default::default()
803        });
804
805        let diffs = sample_diffs();
806        let output = pipeline.transform_diffs(diffs).unwrap();
807
808        assert!(output.content.contains("src/file_1.rs"));
809        assert!(output.truncated);
810        assert!(output.included_count < 5);
811    }
812
813    #[test]
814    fn test_toon_format_comments() {
815        // Use max_chars small enough to force budget trimming of 5 comments
816        // but large enough to include at least one comment with body text
817        let pipeline = Pipeline::with_config(PipelineConfig {
818            format: OutputFormat::Toon,
819            max_chars: 300,
820            ..Default::default()
821        });
822
823        let comments = sample_comments();
824        let output = pipeline.transform_comments(comments).unwrap();
825
826        // Budget trimming may drop early items; check that some comment body is present
827        assert!(output.content.contains("Comment body"));
828        assert!(output.truncated);
829        assert!(output.included_count < 5);
830    }
831
832    #[test]
833    fn test_toon_format_discussions() {
834        // Use max_chars large enough to include some but not all discussions
835        let pipeline = Pipeline::with_config(PipelineConfig {
836            format: OutputFormat::Toon,
837            max_chars: 500,
838            ..Default::default()
839        });
840
841        let discussions = sample_discussions();
842        let output = pipeline.transform_discussions(discussions).unwrap();
843
844        assert!(output.content.contains("Discussion comment 1"));
845        assert!(output.truncated);
846        assert!(output.included_count < 5);
847    }
848
849    // --- JSON format ---
850
851    #[test]
852    fn test_json_format_issues() {
853        let pipeline = Pipeline::with_config(PipelineConfig {
854            format: OutputFormat::Json,
855            max_chars: 100_000,
856            ..Default::default()
857        });
858
859        let issues: Vec<Issue> = sample_issues().into_iter().take(2).collect();
860        let output = pipeline.transform_issues(issues).unwrap();
861
862        let parsed: Vec<Issue> = serde_json::from_str(&output.content).unwrap();
863        assert_eq!(parsed.len(), 2);
864    }
865
866    #[test]
867    fn test_json_format_merge_requests() {
868        let pipeline = Pipeline::with_config(PipelineConfig {
869            format: OutputFormat::Json,
870            max_chars: 100_000,
871            ..Default::default()
872        });
873
874        let mrs: Vec<MergeRequest> = sample_merge_requests().into_iter().take(2).collect();
875        let output = pipeline.transform_merge_requests(mrs).unwrap();
876
877        let parsed: Vec<MergeRequest> = serde_json::from_str(&output.content).unwrap();
878        assert_eq!(parsed.len(), 2);
879    }
880
881    #[test]
882    fn test_json_format_diffs() {
883        let pipeline = Pipeline::with_config(PipelineConfig {
884            format: OutputFormat::Json,
885            max_chars: 100_000,
886            ..Default::default()
887        });
888
889        let diffs: Vec<FileDiff> = sample_diffs().into_iter().take(2).collect();
890        let output = pipeline.transform_diffs(diffs).unwrap();
891
892        let parsed: Vec<FileDiff> = serde_json::from_str(&output.content).unwrap();
893        assert_eq!(parsed.len(), 2);
894    }
895
896    #[test]
897    fn test_json_format_comments() {
898        let pipeline = Pipeline::with_config(PipelineConfig {
899            format: OutputFormat::Json,
900            max_chars: 100_000,
901            ..Default::default()
902        });
903
904        let comments: Vec<Comment> = sample_comments().into_iter().take(2).collect();
905        let output = pipeline.transform_comments(comments).unwrap();
906
907        let parsed: Vec<Comment> = serde_json::from_str(&output.content).unwrap();
908        assert_eq!(parsed.len(), 2);
909    }
910
911    #[test]
912    fn test_json_format_discussions() {
913        let pipeline = Pipeline::with_config(PipelineConfig {
914            format: OutputFormat::Json,
915            max_chars: 100_000,
916            ..Default::default()
917        });
918
919        let discussions: Vec<Discussion> = sample_discussions().into_iter().take(2).collect();
920        let output = pipeline.transform_discussions(discussions).unwrap();
921
922        let parsed: Vec<Discussion> = serde_json::from_str(&output.content).unwrap();
923        assert_eq!(parsed.len(), 2);
924    }
925
926    // --- TransformOutput ---
927
928    #[test]
929    fn test_transform_output_to_string_with_hints() {
930        let output = TransformOutput::new("content".to_string());
931        assert_eq!(output.to_string_with_hints(), "content");
932
933        let output = TransformOutput::new("content".to_string()).with_truncation(
934            10,
935            5,
936            "hint text".to_string(),
937        );
938        assert!(output.to_string_with_hints().contains("content"));
939        assert!(output.to_string_with_hints().contains("hint text"));
940    }
941
942    #[test]
943    fn test_transform_output_with_truncation() {
944        let output =
945            TransformOutput::new("data".into()).with_truncation(100, 10, "90 more items".into());
946        assert!(output.truncated);
947        assert_eq!(output.total_count, Some(100));
948        assert_eq!(output.included_count, 10);
949        assert_eq!(output.agent_hint.as_deref(), Some("90 more items"));
950    }
951
952    // --- PipelineConfig ---
953
954    #[test]
955    fn test_pipeline_config_default_values() {
956        let config = PipelineConfig::default();
957        assert_eq!(config.max_chars, 100_000);
958        assert_eq!(config.max_chars_per_item, 10_000);
959        assert_eq!(config.max_description_len, 10_000);
960        assert!(matches!(config.format, OutputFormat::Toon));
961        assert!(config.include_hints);
962    }
963
964    #[test]
965    fn test_pipeline_default() {
966        let pipeline = Pipeline::default();
967        let issues: Vec<Issue> = sample_issues().into_iter().take(1).collect();
968        let output = pipeline.transform_issues(issues).unwrap();
969        assert!(!output.content.is_empty());
970    }
971
972    #[test]
973    fn test_pipeline_hints_disabled() {
974        // Use small max_chars to trigger budget trimming, but with hints disabled
975        let pipeline = Pipeline::with_config(PipelineConfig {
976            max_chars: 200,
977            include_hints: false,
978            ..Default::default()
979        });
980
981        let issues = sample_issues();
982        let output = pipeline.transform_issues(issues).unwrap();
983
984        assert!(output.included_count < 25);
985        // truncated flag is always set when trimming occurs (for metadata consumers)
986        assert!(output.truncated);
987        // but agent_hint and page_index are suppressed when include_hints is false
988        assert!(output.agent_hint.is_none());
989        assert!(output.page_index.is_none());
990    }
991
992    // --- Character limit (budget-based) ---
993
994    #[test]
995    fn test_char_limit_applied() {
996        let pipeline = Pipeline::with_config(PipelineConfig {
997            max_chars: 100,
998            ..Default::default()
999        });
1000
1001        let issues = sample_issues();
1002        let output = pipeline.transform_issues(issues).unwrap();
1003
1004        assert!(output.truncated);
1005    }
1006
1007    #[test]
1008    fn test_char_limit_triggers_trimming() {
1009        let pipeline = Pipeline::with_config(PipelineConfig {
1010            max_chars: 50,
1011            ..Default::default()
1012        });
1013
1014        let issues: Vec<Issue> = sample_issues().into_iter().take(3).collect();
1015        let output = pipeline.transform_issues(issues).unwrap();
1016        assert!(output.truncated);
1017    }
1018
1019    // --- Empty collections ---
1020
1021    #[test]
1022    fn test_transform_empty_issues() {
1023        let pipeline = Pipeline::new();
1024        let output = pipeline.transform_issues(vec![]).unwrap();
1025        assert!(!output.truncated);
1026        assert_eq!(output.included_count, 0);
1027    }
1028
1029    #[test]
1030    fn test_transform_empty_merge_requests() {
1031        let pipeline = Pipeline::new();
1032        let output = pipeline.transform_merge_requests(vec![]).unwrap();
1033        assert!(!output.truncated);
1034        assert_eq!(output.included_count, 0);
1035    }
1036
1037    #[test]
1038    fn test_transform_empty_diffs() {
1039        let pipeline = Pipeline::new();
1040        let output = pipeline.transform_diffs(vec![]).unwrap();
1041        assert!(!output.truncated);
1042        assert_eq!(output.included_count, 0);
1043    }
1044
1045    #[test]
1046    fn test_transform_empty_comments() {
1047        let pipeline = Pipeline::new();
1048        let output = pipeline.transform_comments(vec![]).unwrap();
1049        assert!(!output.truncated);
1050        assert_eq!(output.included_count, 0);
1051    }
1052
1053    #[test]
1054    fn test_transform_empty_discussions() {
1055        let pipeline = Pipeline::new();
1056        let output = pipeline.transform_discussions(vec![]).unwrap();
1057        assert!(!output.truncated);
1058        assert_eq!(output.included_count, 0);
1059    }
1060
1061    // --- Diff truncation per item ---
1062
1063    #[test]
1064    fn test_diff_content_truncated_per_item() {
1065        let pipeline = Pipeline::with_config(PipelineConfig {
1066            max_chars_per_item: 10,
1067            max_chars: 100_000,
1068            ..Default::default()
1069        });
1070
1071        let diffs = vec![FileDiff {
1072            file_path: "big.rs".into(),
1073            old_path: None,
1074            new_file: false,
1075            deleted_file: false,
1076            renamed_file: false,
1077            diff: "x".repeat(1000),
1078            additions: Some(100),
1079            deletions: Some(0),
1080        }];
1081
1082        let output = pipeline.transform_diffs(diffs).unwrap();
1083        assert!(output.content.len() < 1000);
1084    }
1085
1086    // --- TOON smaller than JSON ---
1087
1088    // --- JSON format with budget trimming (triggers json_fallback) ---
1089
1090    #[test]
1091    fn test_json_format_with_budget_trimming_issues() {
1092        let pipeline = Pipeline::with_config(PipelineConfig {
1093            format: OutputFormat::Json,
1094            max_chars: 200,
1095            ..Default::default()
1096        });
1097
1098        let issues = sample_issues();
1099        let output = pipeline.transform_issues(issues).unwrap();
1100
1101        assert!(output.truncated);
1102        assert!(output.included_count < 25);
1103        // Content should be truncated JSON (not TOON)
1104        assert!(!output.content.is_empty());
1105    }
1106
1107    #[test]
1108    fn test_json_format_with_budget_trimming_merge_requests() {
1109        let pipeline = Pipeline::with_config(PipelineConfig {
1110            format: OutputFormat::Json,
1111            max_chars: 200,
1112            ..Default::default()
1113        });
1114
1115        let mrs = sample_merge_requests();
1116        let output = pipeline.transform_merge_requests(mrs).unwrap();
1117
1118        assert!(output.truncated);
1119        assert!(!output.content.is_empty());
1120    }
1121
1122    #[test]
1123    fn test_json_format_with_budget_trimming_diffs() {
1124        let pipeline = Pipeline::with_config(PipelineConfig {
1125            format: OutputFormat::Json,
1126            max_chars: 100,
1127            ..Default::default()
1128        });
1129
1130        let diffs = sample_diffs();
1131        let output = pipeline.transform_diffs(diffs).unwrap();
1132
1133        assert!(output.truncated);
1134        assert!(!output.content.is_empty());
1135    }
1136
1137    #[test]
1138    fn test_json_format_with_budget_trimming_comments() {
1139        let pipeline = Pipeline::with_config(PipelineConfig {
1140            format: OutputFormat::Json,
1141            max_chars: 100,
1142            ..Default::default()
1143        });
1144
1145        let comments = sample_comments();
1146        let output = pipeline.transform_comments(comments).unwrap();
1147
1148        assert!(output.truncated);
1149        assert!(!output.content.is_empty());
1150    }
1151
1152    #[test]
1153    fn test_json_format_with_budget_trimming_discussions() {
1154        let pipeline = Pipeline::with_config(PipelineConfig {
1155            format: OutputFormat::Json,
1156            max_chars: 100,
1157            ..Default::default()
1158        });
1159
1160        let discussions = sample_discussions();
1161        let output = pipeline.transform_discussions(discussions).unwrap();
1162
1163        assert!(output.truncated);
1164        assert!(!output.content.is_empty());
1165    }
1166
1167    // --- Chunk index hints (total_pages > 1) ---
1168
1169    #[test]
1170    fn test_pipeline_chunk_index_with_many_issues() {
1171        // Use enough issues and small budget to trigger multi-page chunk index
1172        let issues: Vec<Issue> = (1..=50)
1173            .map(|i| Issue {
1174                key: format!("gh#{}", i),
1175                title: format!("Issue {} with a moderately long title for sizing", i),
1176                description: Some(format!(
1177                    "Description for issue {} with substantial content to inflate token count significantly beyond budget",
1178                    i
1179                )),
1180                state: "open".to_string(),
1181                source: "github".to_string(),
1182                priority: None,
1183                labels: vec!["bug".to_string(), "critical".to_string()],
1184                author: Some(User {
1185                    id: "1".to_string(),
1186                    username: "test".to_string(),
1187                    name: None,
1188                    email: None,
1189                    avatar_url: None,
1190                }),
1191                assignees: vec![],
1192                url: Some(format!("https://github.com/test/repo/issues/{}", i)),
1193                created_at: Some("2024-01-01T00:00:00Z".to_string()),
1194                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
1195                attachments_count: None,
1196            parent: None,
1197                subtasks: vec![],
1198                custom_fields: std::collections::HashMap::new(),
1199                ..Default::default()
1200            })
1201            .collect();
1202
1203        let pipeline = Pipeline::with_config(PipelineConfig {
1204            max_chars: 500,
1205            include_hints: true,
1206            ..Default::default()
1207        });
1208
1209        let output = pipeline.transform_issues(issues).unwrap();
1210
1211        assert!(output.truncated);
1212        assert!(output.included_count < 50);
1213        // When many items are trimmed, we expect page_index and chunk hint
1214        if let Some(ref hint) = output.agent_hint {
1215            assert!(
1216                hint.contains("Chunk") || hint.contains("Showing"),
1217                "Expected chunk or showing hint, got: {}",
1218                hint
1219            );
1220        }
1221    }
1222
1223    #[test]
1224    fn test_toon_smaller_than_json_for_issues() {
1225        let issues: Vec<Issue> = sample_issues().into_iter().take(10).collect();
1226
1227        let json_pipeline = Pipeline::with_config(PipelineConfig {
1228            format: OutputFormat::Json,
1229            max_chars: 1_000_000,
1230            ..Default::default()
1231        });
1232        let toon_pipeline = Pipeline::with_config(PipelineConfig {
1233            format: OutputFormat::Toon,
1234            max_chars: 1_000_000,
1235            ..Default::default()
1236        });
1237
1238        let json_output = json_pipeline.transform_issues(issues.clone()).unwrap();
1239        let toon_output = toon_pipeline.transform_issues(issues).unwrap();
1240
1241        assert!(
1242            toon_output.content.len() < json_output.content.len(),
1243            "TOON ({}) should be smaller than JSON ({})",
1244            toon_output.content.len(),
1245            json_output.content.len()
1246        );
1247    }
1248
1249    #[test]
1250    fn test_mckp_routes_issues_through_inner_table() {
1251        let issues: Vec<Issue> = sample_issues().into_iter().take(10).collect();
1252
1253        let mckp_pipeline = Pipeline::with_config(PipelineConfig {
1254            format: OutputFormat::Mckp,
1255            max_chars: 1_000_000,
1256            ..Default::default()
1257        });
1258        let json_pipeline = Pipeline::with_config(PipelineConfig {
1259            format: OutputFormat::Json,
1260            max_chars: 1_000_000,
1261            ..Default::default()
1262        });
1263
1264        let mckp_out = mckp_pipeline.transform_issues(issues.clone()).unwrap();
1265        let json_out = json_pipeline.transform_issues(issues).unwrap();
1266
1267        // MCKP must beat the pretty-printed JSON baseline on this shape
1268        // (array of objects → routes to `csv` via try_array_csv).
1269        assert!(
1270            mckp_out.content.len() < json_out.content.len(),
1271            "MCKP ({}) should be smaller than JSON ({})",
1272            mckp_out.content.len(),
1273            json_out.content.len(),
1274        );
1275        // Round-trip key parity: every Issue field still appears in the
1276        // output (the encoder bug regression).
1277        for k in ["key", "title", "state", "source"] {
1278            assert!(
1279                mckp_out.content.contains(k),
1280                "MCKP output is missing field `{k}`: {}",
1281                &mckp_out.content[..mckp_out.content.len().min(200)]
1282            );
1283        }
1284    }
1285
1286    #[test]
1287    fn test_mckp_falls_back_to_pretty_json_on_unstable_keys() {
1288        // Single issue → array length 1, below the min_items threshold for
1289        // try_array_csv. encode_mckp must not crash; it should fall back
1290        // to the pretty JSON.
1291        let issues: Vec<Issue> = sample_issues().into_iter().take(1).collect();
1292        let mckp_pipeline = Pipeline::with_config(PipelineConfig {
1293            format: OutputFormat::Mckp,
1294            max_chars: 1_000_000,
1295            ..Default::default()
1296        });
1297        let out = mckp_pipeline.transform_issues(issues).unwrap();
1298        assert!(out.content.contains("gh#1"));
1299    }
1300}