devboy_format_pipeline/
lib.rs

1//! Format pipeline for tool output transformation.
2//!
//! Formats tool responses into a compact representation suited to LLM consumption:
//!
//! - **TOON** (default): Token-Oriented Object Notation -- saves 39-90% of tokens
//! - **JSON**: for programmatic processing
//! - **MCKP**: shape-adaptive encoder that falls back to JSON when no shape applies
//! - **Budget trimming**: smart strategy-based trimming when output exceeds budget
8//!
9//! # Example
10//!
11//! ```ignore
12//! use devboy_format_pipeline::{Pipeline, PipelineConfig, OutputFormat};
13//! use devboy_core::Issue;
14//!
15//! let pipeline = Pipeline::with_config(PipelineConfig {
16//!     format: OutputFormat::Toon,
17//!     max_chars: 100_000,
18//!     ..Default::default()
19//! });
20//!
21//! let output = pipeline.transform_issues(issues)?;
22//! println!("{}", output.to_string_with_hints());
23//! ```
24
25#![deny(rustdoc::broken_intra_doc_links)]
26#![deny(rustdoc::private_intra_doc_links)]
27#![deny(rustdoc::invalid_html_tags)]
28pub mod adaptive_config;
29pub mod budget;
30pub mod dedup;
31pub(crate) mod dedup_util;
32pub mod enrichment;
33pub mod layered_pipeline;
34pub mod mckp_router;
35pub mod near_ref;
36pub mod page_index;
37pub mod pagination;
38pub mod projection;
39pub mod round_trip;
40pub mod shape;
41pub mod strategy;
42pub mod telemetry;
43pub mod templates;
44pub mod token_counter;
45pub mod tool_defaults;
46pub mod toon;
47pub mod tree;
48pub mod trim;
49pub mod truncation;
50
51pub use token_counter::{Tokenizer, estimate_tokens, tokens_to_chars};
52pub use truncation::TruncationPlugin;
53
54use devboy_core::{Comment, Discussion, FileDiff, Issue, MergeRequest, Result};
55
56use budget::BudgetConfig;
57use strategy::StrategyResolver;
58
/// Estimate how many tokens a character budget corresponds to.
///
/// Uses a fixed ratio of 3.5 characters per token and rounds up, so every
/// non-zero character count maps to at least one token.
fn estimate_tokens_from_chars(chars: usize) -> usize {
    const CHARS_PER_TOKEN: f64 = 3.5;
    let tokens = chars as f64 / CHARS_PER_TOKEN;
    tokens.ceil() as usize
}
63
64/// Serialize a `Serialize` slice to JSON pretty, then route the JSON
65/// through the L2 MCKP shape dispatcher. Falls back to the pretty-printed
66/// JSON when no shape applies. The L0 dedup layer is host-side (per
67/// session) and is wired separately in P-203-04.
68fn encode_mckp<T: serde::Serialize>(items: &[T]) -> Result<String> {
69    let json = serde_json::to_string_pretty(items)?;
70    let cls = shape::classify(&json);
71    let cfg = adaptive_config::MckpConfig::default();
72    if let Some((_id, body)) = mckp_router::route(&cfg, &json, &cls) {
73        Ok(body)
74    } else {
75        Ok(json)
76    }
77}
78
79/// Output from a pipeline transformation.
80///
81/// Contains the transformed data and metadata about truncation/pagination.
82#[derive(Debug, Clone)]
83pub struct TransformOutput {
84    /// The transformed output (TOON or JSON string)
85    pub content: String,
86    /// Whether the output was truncated
87    pub truncated: bool,
88    /// Total count before truncation (if known)
89    pub total_count: Option<usize>,
90    /// Number of items actually included
91    pub included_count: usize,
92    /// Hint for the agent about hidden content
93    pub agent_hint: Option<String>,
94    /// Cursor for fetching the next page (if overflow exists)
95    pub page_cursor: Option<String>,
96    /// Page index for large results (when budget trimming is applied)
97    pub page_index: Option<page_index::PageIndex>,
98    /// Provider-level pagination metadata
99    pub provider_pagination: Option<devboy_core::Pagination>,
100    /// Provider-level sort metadata
101    pub provider_sort: Option<devboy_core::SortInfo>,
102    /// Size of raw input data before formatting (UTF-8 bytes)
103    pub raw_chars: usize,
104    /// Size of formatted output (UTF-8 bytes) — updated after apply_char_limit
105    pub output_chars: usize,
106    /// Size of output BEFORE budget trimming (UTF-8 bytes).
107    /// Set by apply_char_limit when truncation occurs.
108    pub pre_trim_chars: usize,
109}
110
111impl TransformOutput {
112    /// Create a new output with content.
113    pub fn new(content: String) -> Self {
114        let output_chars = content.len();
115        Self {
116            content,
117            truncated: false,
118            total_count: None,
119            included_count: 0,
120            agent_hint: None,
121            page_cursor: None,
122            page_index: None,
123            provider_pagination: None,
124            provider_sort: None,
125            raw_chars: 0,
126            output_chars,
127            pre_trim_chars: 0,
128        }
129    }
130
131    /// Set raw input size (before formatting).
132    pub fn with_raw_chars(mut self, raw_chars: usize) -> Self {
133        self.raw_chars = raw_chars;
134        self
135    }
136
137    /// Mark output as truncated with a hint.
138    pub fn with_truncation(mut self, total: usize, included: usize, hint: String) -> Self {
139        self.truncated = true;
140        self.total_count = Some(total);
141        self.included_count = included;
142        self.agent_hint = Some(hint);
143        self
144    }
145
146    /// Get the final output including page index and agent hints.
147    pub fn to_string_with_hints(&self) -> String {
148        let mut parts = Vec::new();
149
150        // Page index header (when budget trimming produced pages)
151        if let Some(index) = &self.page_index {
152            parts.push(index.to_toon());
153        }
154
155        // Main content
156        parts.push(self.content.clone());
157
158        // Agent hint footer
159        if let Some(hint) = &self.agent_hint {
160            parts.push(hint.clone());
161        }
162
163        parts.join("\n\n")
164    }
165}
166
167/// Configuration for pipeline transformations.
168#[derive(Debug, Clone)]
169pub struct PipelineConfig {
170    /// Maximum characters for the entire output (0 = no limit).
171    /// Used as budget ceiling — converted to tokens via `max_chars / 3.5`.
172    pub max_chars: usize,
173    /// Maximum characters per item (e.g., diff content)
174    pub max_chars_per_item: usize,
175    /// Maximum description/body length before truncation (only outliers get truncated)
176    pub max_description_len: usize,
177    pub format: OutputFormat,
178    /// Whether to include agent hints about truncation
179    pub include_hints: bool,
180    /// Page cursor from a previous request (for pagination)
181    pub page_cursor: Option<String>,
182    /// Tool name for strategy resolution (e.g., "get_issues", "get_merge_request_diffs")
183    pub tool_name: Option<String>,
184    /// Chunk number to return (1-based). When set, pipeline skips to that chunk
185    /// instead of returning chunk 1. Used for chunk index navigation.
186    pub chunk: Option<usize>,
187}
188
189impl Default for PipelineConfig {
190    fn default() -> Self {
191        Self {
192            max_chars: 100_000,
193            max_chars_per_item: 10_000,
194            max_description_len: 10_000,
195            format: OutputFormat::Toon,
196            include_hints: true,
197            page_cursor: None,
198            tool_name: None,
199            chunk: None,
200        }
201    }
202}
203
/// Rendering format selected for a transformation.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum OutputFormat {
    /// TOON -- token-optimized custom format. Wins on `cl100k_base`
    /// tokenizers but *loses* ~26% on `o200k_base` (the modern Anthropic /
    /// OpenAI family). Kept as a baseline; not the recommended default.
    /// See Paper 2 §Savings Accounting.
    Toon,
    /// Pretty-printed JSON -- for programmatic processing.
    Json,
    /// MCKP v2 -- format-adaptive encoder dispatched by structural shape.
    /// Object-wrapping-array shapes go through the union-of-keys table
    /// renderer (`deep_mckp_with_inner_table`); when no shape applies the
    /// pipeline returns the pretty-printed JSON serialization instead.
    /// Tokenizer-agnostic — see Paper 2 §Encoder Bug Postmortem and
    /// §Savings Accounting.
    Mckp,
}
221
222/// Pipeline for chaining output transformations.
223pub struct Pipeline {
224    config: PipelineConfig,
225}
226
227impl Pipeline {
228    /// Create a new pipeline with default configuration.
229    pub fn new() -> Self {
230        Self {
231            config: PipelineConfig::default(),
232        }
233    }
234
235    /// Create a pipeline with custom configuration.
236    pub fn with_config(config: PipelineConfig) -> Self {
237        Self { config }
238    }
239
240    /// Transform a list of issues using budget pipeline.
241    pub fn transform_issues(&self, issues: Vec<Issue>) -> Result<TransformOutput> {
242        let total = issues.len();
243        let raw_json = serde_json::to_string(&issues)?;
244        let raw_chars = raw_json.len();
245
246        // First pass: check if all data fits in budget
247        let full_content = match self.config.format {
248            OutputFormat::Json => serde_json::to_string_pretty(&issues)?,
249            OutputFormat::Toon => toon::encode_issues(&issues, toon::TrimLevel::Full)?,
250            OutputFormat::Mckp => encode_mckp(&issues)?,
251        };
252
253        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
254            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
255            output.included_count = total;
256            return Ok(output);
257        }
258
259        // Budget pipeline: find how many items fit
260        let budget_config = self.budget_config();
261        let strategy_kind = self.resolve_strategy("get_issues");
262        let result = budget::process_issues(&issues, strategy_kind, &budget_config)?;
263        let chunk_size = result.included_items;
264
265        // Chunk navigation: if chunk > 1, slice to that chunk
266        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&issues, chunk_size);
267        if is_chunk_request {
268            let content = match self.config.format {
269                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
270                OutputFormat::Toon => toon::encode_issues(chunk_items, toon::TrimLevel::Full)?,
271                OutputFormat::Mckp => encode_mckp(chunk_items)?,
272            };
273            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
274            output.included_count = chunk_items.len();
275            output.total_count = Some(total);
276            return Ok(output);
277        }
278
279        // Chunk 1 (default): budget-trimmed best items + chunk index
280        let json_fallback = self.json_fallback(&full_content);
281        let index = page_index::build_issues_index(&issues, result.included_items);
282        self.build_budget_output(
283            result,
284            raw_chars,
285            total,
286            "issues",
287            Some(index),
288            json_fallback,
289        )
290    }
291
292    /// Transform a list of merge requests using budget pipeline.
293    pub fn transform_merge_requests(&self, mrs: Vec<MergeRequest>) -> Result<TransformOutput> {
294        let total = mrs.len();
295        let raw_json = serde_json::to_string(&mrs)?;
296        let raw_chars = raw_json.len();
297
298        let full_content = match self.config.format {
299            OutputFormat::Json => serde_json::to_string_pretty(&mrs)?,
300            OutputFormat::Toon => toon::encode_merge_requests(&mrs, toon::TrimLevel::Full)?,
301            OutputFormat::Mckp => encode_mckp(&mrs)?,
302        };
303
304        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
305            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
306            output.included_count = total;
307            return Ok(output);
308        }
309
310        let budget_config = self.budget_config();
311        let strategy_kind = self.resolve_strategy("get_merge_requests");
312        let result = budget::process_merge_requests(&mrs, strategy_kind, &budget_config)?;
313        let chunk_size = result.included_items;
314
315        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&mrs, chunk_size);
316        if is_chunk_request {
317            let content = match self.config.format {
318                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
319                OutputFormat::Toon => {
320                    toon::encode_merge_requests(chunk_items, toon::TrimLevel::Full)?
321                }
322                OutputFormat::Mckp => encode_mckp(chunk_items)?,
323            };
324            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
325            output.included_count = chunk_items.len();
326            output.total_count = Some(total);
327            return Ok(output);
328        }
329
330        let json_fallback = self.json_fallback(&full_content);
331        let index = page_index::build_merge_requests_index(&mrs, result.included_items);
332        self.build_budget_output(
333            result,
334            raw_chars,
335            total,
336            "merge_requests",
337            Some(index),
338            json_fallback,
339        )
340    }
341
342    /// Transform a list of file diffs using budget pipeline.
343    ///
344    /// Individual diff content is truncated per `max_chars_per_item` before
345    /// budget trimming to protect against giant lock/generated files.
346    pub fn transform_diffs(&self, diffs: Vec<FileDiff>) -> Result<TransformOutput> {
347        let total = diffs.len();
348
349        // Per-item truncation for individual diff content (protection against giant files)
350        let diffs: Vec<FileDiff> = diffs
351            .into_iter()
352            .map(|mut d| {
353                d.diff = truncation::truncate_string(&d.diff, self.config.max_chars_per_item);
354                d
355            })
356            .collect();
357
358        let raw_json = serde_json::to_string(&diffs)?;
359        let raw_chars = raw_json.len();
360
361        let full_content = match self.config.format {
362            OutputFormat::Json => serde_json::to_string_pretty(&diffs)?,
363            OutputFormat::Toon => toon::encode_diffs(&diffs)?,
364            OutputFormat::Mckp => encode_mckp(&diffs)?,
365        };
366
367        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
368            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
369            output.included_count = total;
370            return Ok(output);
371        }
372
373        let budget_config = self.budget_config();
374        let strategy_kind = self.resolve_strategy("get_merge_request_diffs");
375        let result = budget::process_diffs(&diffs, strategy_kind, &budget_config)?;
376        let chunk_size = result.included_items;
377
378        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&diffs, chunk_size);
379        if is_chunk_request {
380            let content = match self.config.format {
381                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
382                OutputFormat::Toon => toon::encode_diffs(chunk_items)?,
383                OutputFormat::Mckp => encode_mckp(chunk_items)?,
384            };
385            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
386            output.included_count = chunk_items.len();
387            output.total_count = Some(total);
388            return Ok(output);
389        }
390
391        let json_fallback = self.json_fallback(&full_content);
392        let index = page_index::build_diffs_index(&diffs, result.included_items);
393        self.build_budget_output(
394            result,
395            raw_chars,
396            total,
397            "diffs",
398            Some(index),
399            json_fallback,
400        )
401    }
402
403    /// Transform a list of comments using budget pipeline.
404    pub fn transform_comments(&self, comments: Vec<Comment>) -> Result<TransformOutput> {
405        let total = comments.len();
406        let raw_json = serde_json::to_string(&comments)?;
407        let raw_chars = raw_json.len();
408
409        let full_content = match self.config.format {
410            OutputFormat::Json => serde_json::to_string_pretty(&comments)?,
411            OutputFormat::Toon => toon::encode_comments(&comments)?,
412            OutputFormat::Mckp => encode_mckp(&comments)?,
413        };
414
415        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
416            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
417            output.included_count = total;
418            return Ok(output);
419        }
420
421        let budget_config = self.budget_config();
422        let strategy_kind = self.resolve_strategy("get_issue_comments");
423        let result = budget::process_comments(&comments, strategy_kind, &budget_config)?;
424        let chunk_size = result.included_items;
425
426        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&comments, chunk_size);
427        if is_chunk_request {
428            let content = match self.config.format {
429                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
430                OutputFormat::Toon => toon::encode_comments(chunk_items)?,
431                OutputFormat::Mckp => encode_mckp(chunk_items)?,
432            };
433            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
434            output.included_count = chunk_items.len();
435            output.total_count = Some(total);
436            return Ok(output);
437        }
438
439        let json_fallback = self.json_fallback(&full_content);
440        let index = page_index::build_comments_index(&comments, result.included_items);
441        self.build_budget_output(
442            result,
443            raw_chars,
444            total,
445            "comments",
446            Some(index),
447            json_fallback,
448        )
449    }
450
451    /// Transform a list of discussions using budget pipeline.
452    pub fn transform_discussions(&self, discussions: Vec<Discussion>) -> Result<TransformOutput> {
453        let total = discussions.len();
454        let raw_json = serde_json::to_string(&discussions)?;
455        let raw_chars = raw_json.len();
456
457        let full_content = match self.config.format {
458            OutputFormat::Json => serde_json::to_string_pretty(&discussions)?,
459            OutputFormat::Toon => toon::encode_discussions(&discussions)?,
460            OutputFormat::Mckp => encode_mckp(&discussions)?,
461        };
462
463        if self.config.max_chars == 0 || full_content.len() <= self.config.max_chars {
464            let mut output = TransformOutput::new(full_content).with_raw_chars(raw_chars);
465            output.included_count = total;
466            return Ok(output);
467        }
468
469        let budget_config = self.budget_config();
470        let strategy_kind = self.resolve_strategy("get_merge_request_discussions");
471        let result = budget::process_discussions(&discussions, strategy_kind, &budget_config)?;
472        let chunk_size = result.included_items;
473
474        let (chunk_items, is_chunk_request) = self.slice_for_chunk(&discussions, chunk_size);
475        if is_chunk_request {
476            let content = match self.config.format {
477                OutputFormat::Json => serde_json::to_string_pretty(chunk_items)?,
478                OutputFormat::Toon => toon::encode_discussions(chunk_items)?,
479                OutputFormat::Mckp => encode_mckp(chunk_items)?,
480            };
481            let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
482            output.included_count = chunk_items.len();
483            output.total_count = Some(total);
484            return Ok(output);
485        }
486
487        let json_fallback = self.json_fallback(&full_content);
488        let index = page_index::build_discussions_index(&discussions, result.included_items);
489        self.build_budget_output(
490            result,
491            raw_chars,
492            total,
493            "discussions",
494            Some(index),
495            json_fallback,
496        )
497    }
498
499    /// When format is JSON, return the content for truncation fallback.
500    /// Budget pipeline always produces TOON, so for JSON we truncate the original JSON.
501    fn json_fallback(&self, content: &str) -> Option<String> {
502        if matches!(self.config.format, OutputFormat::Json) {
503            Some(content.to_string())
504        } else {
505            None
506        }
507    }
508
509    /// Slice items for a specific chunk number.
510    ///
511    /// When `config.chunk` is Some(n) with n > 1, we need to compute
512    /// the chunk boundaries and return only items for that chunk.
513    /// Returns (slice_items, is_chunk_request) — if not a chunk request,
514    /// returns all items.
515    fn slice_for_chunk<'a, T>(&self, items: &'a [T], chunk_size: usize) -> (&'a [T], bool) {
516        match self.config.chunk {
517            Some(n) if n > 1 && chunk_size > 0 => {
518                let offset = (n - 1) * chunk_size;
519                if offset >= items.len() {
520                    (&[], true) // chunk beyond data
521                } else {
522                    let end = (offset + chunk_size).min(items.len());
523                    (&items[offset..end], true)
524                }
525            }
526            _ => (items, false),
527        }
528    }
529
530    /// Convert max_chars to budget pipeline config.
531    fn budget_config(&self) -> BudgetConfig {
532        BudgetConfig {
533            budget_tokens: estimate_tokens_from_chars(self.config.max_chars),
534            ..Default::default()
535        }
536    }
537
538    /// Resolve trimming strategy for tool name.
539    fn resolve_strategy(&self, default_tool: &str) -> strategy::TrimStrategyKind {
540        let resolver = StrategyResolver::new();
541        let tool = self.config.tool_name.as_deref().unwrap_or(default_tool);
542        resolver.resolve(tool)
543    }
544
545    /// Build TransformOutput from BudgetResult with chunk index.
546    ///
547    /// Returns: chunk 1 (best items by strategy) + index of ALL chunks.
548    /// Agent can fetch remaining chunks via offset/limit in subsequent tool calls.
549    ///
550    /// Note: budget pipeline always produces TOON content. When format is JSON,
551    /// we fall back to simple character truncation of the JSON output instead.
552    fn build_budget_output(
553        &self,
554        result: budget::BudgetResult,
555        raw_chars: usize,
556        total: usize,
557        item_type: &str,
558        index: Option<page_index::PageIndex>,
559        json_fallback: Option<String>,
560    ) -> Result<TransformOutput> {
561        // Budget pipeline produces TOON. For JSON format, use truncated JSON instead.
562        let content = if matches!(self.config.format, OutputFormat::Json) {
563            if let Some(json) = json_fallback {
564                truncation::truncate_string(&json, self.config.max_chars)
565            } else {
566                result.content
567            }
568        } else {
569            result.content
570        };
571
572        let mut output = TransformOutput::new(content).with_raw_chars(raw_chars);
573        output.included_count = result.included_items;
574
575        // Always set truncation metadata when trimmed, regardless of include_hints
576        if result.trimmed {
577            output.truncated = true;
578            output.total_count = Some(total);
579
580            if self.config.include_hints {
581                if let Some(idx) = index {
582                    if idx.total_pages > 1 {
583                        let hint = format!(
584                            "Chunk 1/{}: {} most relevant {} (by priority). {} total items across {} chunks. \
585                            Use `chunk: N` parameter to fetch a specific chunk, or request all remaining data.",
586                            idx.total_pages,
587                            result.included_items,
588                            item_type,
589                            total,
590                            idx.total_pages
591                        );
592                        output.page_index = Some(idx);
593                        output.agent_hint = Some(hint);
594                    } else {
595                        let remaining = total.saturating_sub(result.included_items);
596                        output.agent_hint = Some(format!(
597                            "Showing {}/{} {}. {} items trimmed by budget.",
598                            result.included_items, total, item_type, remaining
599                        ));
600                    }
601                } else {
602                    let remaining = total.saturating_sub(result.included_items);
603                    output.agent_hint = Some(format!(
604                        "Showing {}/{} {}. {} items trimmed by budget. Use `chunk: N` parameter to fetch a specific chunk.",
605                        result.included_items, total, item_type, remaining
606                    ));
607                }
608            }
609        }
610
611        Ok(output)
612    }
613}
614
615impl Default for Pipeline {
616    fn default() -> Self {
617        Self::new()
618    }
619}
620
621#[cfg(test)]
622mod tests {
623    use super::*;
624    use devboy_core::User;
625
626    fn sample_issues() -> Vec<Issue> {
627        (1..=25)
628            .map(|i| Issue {
629                key: format!("gh#{}", i),
630                title: format!("Issue {}", i),
631                description: Some(format!("Description for issue {}", i)),
632                state: "open".to_string(),
633                source: "github".to_string(),
634                priority: None,
635                labels: vec!["bug".to_string()],
636                author: Some(User {
637                    id: "1".to_string(),
638                    username: "test".to_string(),
639                    name: None,
640                    email: None,
641                    avatar_url: None,
642                }),
643                assignees: vec![],
644                url: Some(format!("https://github.com/test/repo/issues/{}", i)),
645                created_at: Some("2024-01-01T00:00:00Z".to_string()),
646                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
647                attachments_count: None,
648                parent: None,
649                subtasks: vec![],
650                custom_fields: std::collections::HashMap::new(),
651            })
652            .collect()
653    }
654
655    fn sample_merge_requests() -> Vec<MergeRequest> {
656        (1..=5)
657            .map(|i| MergeRequest {
658                key: format!("mr#{}", i),
659                title: format!("MR {}", i),
660                description: Some(format!("MR description {}", i)),
661                state: "opened".to_string(),
662                source: "gitlab".to_string(),
663                source_branch: format!("feature-{}", i),
664                target_branch: "main".to_string(),
665                author: None,
666                assignees: vec![],
667                reviewers: vec![],
668                labels: vec![],
669                url: Some(format!(
670                    "https://gitlab.com/test/repo/-/merge_requests/{}",
671                    i
672                )),
673                created_at: Some("2024-01-01T00:00:00Z".to_string()),
674                updated_at: Some("2024-01-02T00:00:00Z".to_string()),
675                draft: false,
676            })
677            .collect()
678    }
679
680    fn sample_diffs() -> Vec<FileDiff> {
681        (1..=5)
682            .map(|i| FileDiff {
683                file_path: format!("src/file_{}.rs", i),
684                old_path: None,
685                new_file: i == 1,
686                deleted_file: false,
687                renamed_file: false,
688                diff: format!("+added line {}\n-removed line {}", i, i),
689                additions: Some(1),
690                deletions: Some(1),
691            })
692            .collect()
693    }
694
695    fn sample_comments() -> Vec<Comment> {
696        (1..=5)
697            .map(|i| Comment {
698                id: format!("{}", i),
699                body: format!("Comment body {}", i),
700                author: None,
701                created_at: Some("2024-01-01T00:00:00Z".to_string()),
702                updated_at: None,
703                position: None,
704            })
705            .collect()
706    }
707
708    fn sample_discussions() -> Vec<Discussion> {
709        (1..=5)
710            .map(|i| Discussion {
711                id: format!("{}", i),
712                resolved: i % 2 == 0,
713                resolved_by: None,
714                comments: vec![Comment {
715                    id: format!("c{}", i),
716                    body: format!("Discussion comment {}", i),
717                    author: None,
718                    created_at: None,
719                    updated_at: None,
720                    position: None,
721                }],
722                position: None,
723            })
724            .collect()
725    }
726
727    // --- Pipeline truncation (budget-based) ---
728
729    #[test]
730    fn test_pipeline_truncates_items() {
731        // Use a small max_chars to force budget trimming
732        let pipeline = Pipeline::with_config(PipelineConfig {
733            max_chars: 200,
734            ..Default::default()
735        });
736
737        let issues = sample_issues();
738        let output = pipeline.transform_issues(issues).unwrap();
739
740        assert!(output.truncated);
741        assert_eq!(output.total_count, Some(25));
742        assert!(output.included_count < 25);
743        assert!(output.agent_hint.is_some());
744    }
745
746    #[test]
747    fn test_pipeline_no_truncation_when_under_limit() {
748        let pipeline = Pipeline::with_config(PipelineConfig {
749            max_chars: 100_000,
750            ..Default::default()
751        });
752
753        let issues: Vec<Issue> = sample_issues().into_iter().take(5).collect();
754        let output = pipeline.transform_issues(issues).unwrap();
755
756        assert!(!output.truncated);
757        assert!(output.agent_hint.is_none());
758    }
759
760    // --- Toon format ---
761
762    #[test]
763    fn test_toon_format_issues() {
764        let pipeline = Pipeline::with_config(PipelineConfig {
765            format: OutputFormat::Toon,
766            max_chars: 100_000,
767            ..Default::default()
768        });
769
770        let issues: Vec<Issue> = sample_issues().into_iter().take(3).collect();
771        let output = pipeline.transform_issues(issues).unwrap();
772
773        assert!(output.content.contains("gh#1"));
774        assert!(output.content.contains("Issue 1"));
775    }
776
777    #[test]
778    fn test_toon_format_merge_requests() {
779        // Use max_chars large enough to include some but not all MRs
780        let pipeline = Pipeline::with_config(PipelineConfig {
781            format: OutputFormat::Toon,
782            max_chars: 500,
783            ..Default::default()
784        });
785
786        let mrs = sample_merge_requests();
787        let output = pipeline.transform_merge_requests(mrs).unwrap();
788
789        assert!(output.content.contains("mr#1"));
790        assert!(output.content.contains("MR 1"));
791        assert!(output.truncated);
792        assert!(output.included_count < 5);
793    }
794
/// TOON rendering of the sample diffs under a 200-character budget, chosen
/// small enough to force budget trimming.
#[test]
fn test_toon_format_diffs() {
    let tight_config = PipelineConfig {
        format: OutputFormat::Toon,
        max_chars: 200,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_config);

    let rendered = pipeline.transform_diffs(sample_diffs()).unwrap();

    assert!(rendered.content.contains("src/file_1.rs"));
    assert!(rendered.truncated);
    assert!(rendered.included_count < 5);
}
811
/// TOON rendering of the sample comments under a 300-character budget: small
/// enough to force trimming, large enough to keep at least one comment body.
#[test]
fn test_toon_format_comments() {
    let tight_config = PipelineConfig {
        format: OutputFormat::Toon,
        max_chars: 300,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_config);

    let rendered = pipeline.transform_comments(sample_comments()).unwrap();

    // Trimming may drop early items, so only look for *some* comment body
    // rather than a specific one.
    assert!(rendered.content.contains("Comment body"));
    assert!(rendered.truncated);
    assert!(rendered.included_count < 5);
}
830
/// TOON rendering of the sample discussions under a 500-character budget:
/// the budget is meant to admit some, but not all, of the fixtures.
#[test]
fn test_toon_format_discussions() {
    let tight_config = PipelineConfig {
        format: OutputFormat::Toon,
        max_chars: 500,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_config);

    let rendered = pipeline
        .transform_discussions(sample_discussions())
        .unwrap();

    assert!(rendered.content.contains("Discussion comment 1"));
    assert!(rendered.truncated);
    assert!(rendered.included_count < 5);
}
847
848    // --- JSON format ---
849
/// JSON output with an ample budget must deserialize back into the same
/// number of `Issue` values that went in (two).
#[test]
fn test_json_format_issues() {
    let json_config = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(json_config);

    let pair = sample_issues().into_iter().take(2).collect::<Vec<Issue>>();
    let rendered = pipeline.transform_issues(pair).unwrap();

    let round_tripped = serde_json::from_str::<Vec<Issue>>(&rendered.content).unwrap();
    assert_eq!(round_tripped.len(), 2);
}
864
/// JSON output with an ample budget must deserialize back into the same
/// number of `MergeRequest` values that went in (two).
#[test]
fn test_json_format_merge_requests() {
    let json_config = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(json_config);

    let pair = sample_merge_requests()
        .into_iter()
        .take(2)
        .collect::<Vec<MergeRequest>>();
    let rendered = pipeline.transform_merge_requests(pair).unwrap();

    let round_tripped = serde_json::from_str::<Vec<MergeRequest>>(&rendered.content).unwrap();
    assert_eq!(round_tripped.len(), 2);
}
879
/// JSON output with an ample budget must deserialize back into the same
/// number of `FileDiff` values that went in (two).
#[test]
fn test_json_format_diffs() {
    let json_config = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(json_config);

    let pair = sample_diffs().into_iter().take(2).collect::<Vec<FileDiff>>();
    let rendered = pipeline.transform_diffs(pair).unwrap();

    let round_tripped = serde_json::from_str::<Vec<FileDiff>>(&rendered.content).unwrap();
    assert_eq!(round_tripped.len(), 2);
}
894
/// JSON output with an ample budget must deserialize back into the same
/// number of `Comment` values that went in (two).
#[test]
fn test_json_format_comments() {
    let json_config = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(json_config);

    let pair = sample_comments().into_iter().take(2).collect::<Vec<Comment>>();
    let rendered = pipeline.transform_comments(pair).unwrap();

    let round_tripped = serde_json::from_str::<Vec<Comment>>(&rendered.content).unwrap();
    assert_eq!(round_tripped.len(), 2);
}
909
/// JSON output with an ample budget must deserialize back into the same
/// number of `Discussion` values that went in (two).
#[test]
fn test_json_format_discussions() {
    let json_config = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(json_config);

    let pair = sample_discussions()
        .into_iter()
        .take(2)
        .collect::<Vec<Discussion>>();
    let rendered = pipeline.transform_discussions(pair).unwrap();

    let round_tripped = serde_json::from_str::<Vec<Discussion>>(&rendered.content).unwrap();
    assert_eq!(round_tripped.len(), 2);
}
924
925    // --- TransformOutput ---
926
/// `to_string_with_hints` returns the bare content when no truncation was
/// recorded, and includes both content and hint text once it was.
#[test]
fn test_transform_output_to_string_with_hints() {
    // No truncation recorded: the rendering is exactly the content.
    let plain = TransformOutput::new(String::from("content"));
    assert_eq!(plain.to_string_with_hints(), "content");

    // Truncation recorded: content and hint must both be present.
    let hinted = TransformOutput::new(String::from("content")).with_truncation(
        10,
        5,
        String::from("hint text"),
    );
    assert!(hinted.to_string_with_hints().contains("content"));
    assert!(hinted.to_string_with_hints().contains("hint text"));
}
940
/// `with_truncation(100, 10, hint)` sets the truncated flag, the total count,
/// the included count, and the agent hint.
#[test]
fn test_transform_output_with_truncation() {
    let base = TransformOutput::new(String::from("data"));
    let trimmed = base.with_truncation(100, 10, String::from("90 more items"));

    assert!(trimmed.truncated);
    assert_eq!(trimmed.total_count, Some(100));
    assert_eq!(trimmed.included_count, 10);
    assert_eq!(trimmed.agent_hint.as_deref(), Some("90 more items"));
}
950
951    // --- PipelineConfig ---
952
/// Pins the values produced by `PipelineConfig::default()`: the three size
/// limits, TOON as the output format, and hints enabled.
#[test]
fn test_pipeline_config_default_values() {
    let defaults = PipelineConfig::default();

    // Size limits.
    assert_eq!(defaults.max_chars, 100_000);
    assert_eq!(defaults.max_chars_per_item, 10_000);
    assert_eq!(defaults.max_description_len, 10_000);
    // Format and hint behaviour.
    assert!(matches!(defaults.format, OutputFormat::Toon));
    assert!(defaults.include_hints);
}
962
/// A `Pipeline` built through `Default` renders a single issue into
/// non-empty content.
#[test]
fn test_pipeline_default() {
    let single = sample_issues().into_iter().take(1).collect::<Vec<Issue>>();

    let rendered = Pipeline::default().transform_issues(single).unwrap();

    assert!(!rendered.content.is_empty());
}
970
/// With `include_hints: false`, a 200-character budget still trims the
/// output and sets `truncated`, but no agent hint or page index is attached.
#[test]
fn test_pipeline_hints_disabled() {
    // Small budget so trimming happens, hints switched off.
    let quiet_config = PipelineConfig {
        max_chars: 200,
        include_hints: false,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(quiet_config);

    let result = pipeline.transform_issues(sample_issues()).unwrap();

    assert!(result.included_count < 25);
    // The flag stays set whenever trimming occurs (for metadata consumers).
    assert!(result.truncated);
    // The hint and page index, however, are suppressed.
    assert!(result.agent_hint.is_none());
    assert!(result.page_index.is_none());
}
990
991    // --- Character limit (budget-based) ---
992
/// A 100-character budget applied to the full sample issue list must mark
/// the output as truncated.
#[test]
fn test_char_limit_applied() {
    let tiny_config = PipelineConfig {
        max_chars: 100,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tiny_config);

    let result = pipeline.transform_issues(sample_issues()).unwrap();

    assert!(result.truncated);
}
1005
/// Even three issues exceed a 50-character budget, so the output must be
/// marked as truncated.
#[test]
fn test_char_limit_triggers_trimming() {
    let tiny_config = PipelineConfig {
        max_chars: 50,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tiny_config);

    let first_three = sample_issues().into_iter().take(3).collect::<Vec<Issue>>();
    let result = pipeline.transform_issues(first_three).unwrap();

    assert!(result.truncated);
}
1017
1018    // --- Empty collections ---
1019
/// An empty issue list yields an untruncated output with zero included items.
#[test]
fn test_transform_empty_issues() {
    let result = Pipeline::new().transform_issues(Vec::new()).unwrap();

    assert!(!result.truncated);
    assert_eq!(result.included_count, 0);
}
1027
/// An empty merge-request list yields an untruncated output with zero
/// included items.
#[test]
fn test_transform_empty_merge_requests() {
    let result = Pipeline::new()
        .transform_merge_requests(Vec::new())
        .unwrap();

    assert!(!result.truncated);
    assert_eq!(result.included_count, 0);
}
1035
/// An empty diff list yields an untruncated output with zero included items.
#[test]
fn test_transform_empty_diffs() {
    let result = Pipeline::new().transform_diffs(Vec::new()).unwrap();

    assert!(!result.truncated);
    assert_eq!(result.included_count, 0);
}
1043
/// An empty comment list yields an untruncated output with zero included
/// items.
#[test]
fn test_transform_empty_comments() {
    let result = Pipeline::new().transform_comments(Vec::new()).unwrap();

    assert!(!result.truncated);
    assert_eq!(result.included_count, 0);
}
1051
/// An empty discussion list yields an untruncated output with zero included
/// items.
#[test]
fn test_transform_empty_discussions() {
    let result = Pipeline::new()
        .transform_discussions(Vec::new())
        .unwrap();

    assert!(!result.truncated);
    assert_eq!(result.included_count, 0);
}
1059
1060    // --- Diff truncation per item ---
1061
/// A single diff whose body is 1000 characters long, rendered with
/// `max_chars_per_item: 10`, must produce content shorter than 1000 bytes
/// even though the overall budget is ample.
#[test]
fn test_diff_content_truncated_per_item() {
    // One oversized diff: 1000 repetitions of "x".
    let oversized = FileDiff {
        file_path: "big.rs".into(),
        old_path: None,
        new_file: false,
        deleted_file: false,
        renamed_file: false,
        diff: "x".repeat(1000),
        additions: Some(100),
        deletions: Some(0),
    };

    let per_item_config = PipelineConfig {
        max_chars_per_item: 10,
        max_chars: 100_000,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(per_item_config);

    let rendered = pipeline.transform_diffs(vec![oversized]).unwrap();

    assert!(rendered.content.len() < 1000);
}
1084
1085    // --- TOON smaller than JSON ---
1086
1087    // --- JSON format with budget trimming (triggers json_fallback) ---
1088
/// JSON format with a 200-character budget on the full sample issue list:
/// the output is truncated, holds fewer than 25 items, and is non-empty.
#[test]
fn test_json_format_with_budget_trimming_issues() {
    let tight_json = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 200,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_json);

    let result = pipeline.transform_issues(sample_issues()).unwrap();

    assert!(result.truncated);
    assert!(result.included_count < 25);
    // The content is expected to be trimmed JSON rather than TOON; only
    // non-emptiness is asserted here.
    assert!(!result.content.is_empty());
}
1105
/// JSON format with a 200-character budget on the sample merge requests:
/// the output is truncated and non-empty.
#[test]
fn test_json_format_with_budget_trimming_merge_requests() {
    let tight_json = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 200,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_json);

    let result = pipeline
        .transform_merge_requests(sample_merge_requests())
        .unwrap();

    assert!(result.truncated);
    assert!(!result.content.is_empty());
}
1120
/// JSON format with a 100-character budget on the sample diffs: the output
/// is truncated and non-empty.
#[test]
fn test_json_format_with_budget_trimming_diffs() {
    let tight_json = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_json);

    let result = pipeline.transform_diffs(sample_diffs()).unwrap();

    assert!(result.truncated);
    assert!(!result.content.is_empty());
}
1135
/// JSON format with a 100-character budget on the sample comments: the
/// output is truncated and non-empty.
#[test]
fn test_json_format_with_budget_trimming_comments() {
    let tight_json = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_json);

    let result = pipeline.transform_comments(sample_comments()).unwrap();

    assert!(result.truncated);
    assert!(!result.content.is_empty());
}
1150
/// JSON format with a 100-character budget on the sample discussions: the
/// output is truncated and non-empty.
#[test]
fn test_json_format_with_budget_trimming_discussions() {
    let tight_json = PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 100,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(tight_json);

    let result = pipeline
        .transform_discussions(sample_discussions())
        .unwrap();

    assert!(result.truncated);
    assert!(!result.content.is_empty());
}
1165
1166    // --- Chunk index hints (total_pages > 1) ---
1167
/// Fifty sizeable issues against a 500-character budget with hints enabled:
/// the output must be truncated and hold fewer than 50 items. If an agent
/// hint is attached, it must mention either "Chunk" or "Showing".
#[test]
fn test_pipeline_chunk_index_with_many_issues() {
    // Builds one deliberately verbose issue so that fifty of them far exceed
    // the budget and trimming spans multiple pages.
    let build_issue = |n: i32| Issue {
        key: format!("gh#{}", n),
        title: format!("Issue {} with a moderately long title for sizing", n),
        description: Some(format!(
            "Description for issue {} with substantial content to inflate token count significantly beyond budget",
            n
        )),
        state: "open".to_string(),
        source: "github".to_string(),
        priority: None,
        labels: vec!["bug".to_string(), "critical".to_string()],
        author: Some(User {
            id: "1".to_string(),
            username: "test".to_string(),
            name: None,
            email: None,
            avatar_url: None,
        }),
        assignees: vec![],
        url: Some(format!("https://github.com/test/repo/issues/{}", n)),
        created_at: Some("2024-01-01T00:00:00Z".to_string()),
        updated_at: Some("2024-01-02T00:00:00Z".to_string()),
        attachments_count: None,
        parent: None,
        subtasks: vec![],
        custom_fields: std::collections::HashMap::new(),
    };
    let many_issues = (1..=50).map(build_issue).collect::<Vec<Issue>>();

    let hinting_config = PipelineConfig {
        max_chars: 500,
        include_hints: true,
        ..PipelineConfig::default()
    };
    let pipeline = Pipeline::with_config(hinting_config);

    let result = pipeline.transform_issues(many_issues).unwrap();

    assert!(result.truncated);
    assert!(result.included_count < 50);
    // The hint check is conditional: a missing hint does not fail this test,
    // only a hint with unexpected wording does.
    if let Some(hint) = result.agent_hint.as_deref() {
        let mentions_paging = hint.contains("Chunk") || hint.contains("Showing");
        assert!(
            mentions_paging,
            "Expected chunk or showing hint, got: {}",
            hint
        );
    }
}
1220
/// For the same ten issues and an effectively unlimited budget, the TOON
/// rendering must be strictly shorter (in bytes) than the JSON rendering.
#[test]
fn test_toon_smaller_than_json_for_issues() {
    let ten_issues = sample_issues().into_iter().take(10).collect::<Vec<Issue>>();

    // Same configuration for both pipelines apart from the output format.
    let pipeline_for = |format: OutputFormat| {
        Pipeline::with_config(PipelineConfig {
            format,
            max_chars: 1_000_000,
            ..PipelineConfig::default()
        })
    };
    let json_pipeline = pipeline_for(OutputFormat::Json);
    let toon_pipeline = pipeline_for(OutputFormat::Toon);

    let json_len = json_pipeline
        .transform_issues(ten_issues.clone())
        .unwrap()
        .content
        .len();
    let toon_len = toon_pipeline
        .transform_issues(ten_issues)
        .unwrap()
        .content
        .len();

    assert!(
        toon_len < json_len,
        "TOON ({}) should be smaller than JSON ({})",
        toon_len,
        json_len
    );
}
1246
/// For ten sample issues and an effectively unlimited budget, the MCKP
/// rendering must be strictly shorter than the pretty-printed JSON rendering
/// and must still mention the core `Issue` field names.
#[test]
fn test_mckp_routes_issues_through_inner_table() {
    let issues: Vec<Issue> = sample_issues().into_iter().take(10).collect();

    let mckp_pipeline = Pipeline::with_config(PipelineConfig {
        format: OutputFormat::Mckp,
        max_chars: 1_000_000,
        ..Default::default()
    });
    let json_pipeline = Pipeline::with_config(PipelineConfig {
        format: OutputFormat::Json,
        max_chars: 1_000_000,
        ..Default::default()
    });

    let mckp_out = mckp_pipeline.transform_issues(issues.clone()).unwrap();
    let json_out = json_pipeline.transform_issues(issues).unwrap();

    // MCKP must beat the pretty-printed JSON baseline on this shape
    // (array of objects → routes to `csv` via try_array_csv).
    assert!(
        mckp_out.content.len() < json_out.content.len(),
        "MCKP ({}) should be smaller than JSON ({})",
        mckp_out.content.len(),
        json_out.content.len(),
    );
    // Key parity spot-check: the core Issue field names listed below must
    // still appear in the output (the encoder bug regression).
    for k in ["key", "title", "state", "source"] {
        assert!(
            mckp_out.content.contains(k),
            "MCKP output is missing field `{k}`: {}",
            // Preview by characters, not bytes: a byte-range slice panics
            // when the cut is not on a UTF-8 char boundary, which would
            // hide the real assertion failure.
            mckp_out.content.chars().take(200).collect::<String>()
        );
    }
}
1283
/// MCKP format on a single issue must succeed and still contain `gh#1`.
///
/// Per the original author's note, a one-element array is below the
/// `min_items` threshold of `try_array_csv`, so `encode_mckp` is expected to
/// fall back to pretty JSON instead of crashing. Only the presence of `gh#1`
/// is asserted here, not the fallback format itself.
#[test]
fn test_mckp_falls_back_to_pretty_json_on_unstable_keys() {
    let single = sample_issues().into_iter().take(1).collect::<Vec<Issue>>();

    let mckp_config = PipelineConfig {
        format: OutputFormat::Mckp,
        max_chars: 1_000_000,
        ..PipelineConfig::default()
    };
    let mckp_pipeline = Pipeline::with_config(mckp_config);

    let rendered = mckp_pipeline.transform_issues(single).unwrap();
    assert!(rendered.content.contains("gh#1"));
}
1298}
devboy_format_pipeline/lib.rs

devboy_format_pipeline/
lib.rs