bcp_encoder/encoder.rs
1use std::sync::Arc;
2
3use bcp_types::BlockContent;
4use bcp_types::annotation::AnnotationBlock;
5use bcp_types::code::CodeBlock;
6use bcp_types::content_store::ContentStore;
7use bcp_types::conversation::ConversationBlock;
8use bcp_types::diff::{DiffBlock, DiffHunk};
9use bcp_types::embedding_ref::EmbeddingRefBlock;
10use bcp_types::document::DocumentBlock;
11use bcp_types::enums::{
12 AnnotationKind, DataFormat, FormatHint, Lang, MediaType, Priority, Role, Status,
13};
14use bcp_types::extension::ExtensionBlock;
15use bcp_types::file_tree::{FileEntry, FileTreeBlock};
16use bcp_types::image::ImageBlock;
17use bcp_types::structured_data::StructuredDataBlock;
18use bcp_types::summary::Summary;
19use bcp_types::tool_result::ToolResultBlock;
20use bcp_wire::block_frame::{BlockFlags, BlockFrame, block_type};
21use bcp_wire::header::{HEADER_SIZE, HeaderFlags, BcpHeader};
22
23use crate::compression::{self, COMPRESSION_THRESHOLD};
24use crate::error::EncodeError;
25
/// Maximum block body size (16 MiB). Blocks exceeding this limit produce
/// an [`EncodeError::BlockTooLarge`] during `.encode()`.
///
/// The limit is checked after the optional summary is prepended, so the
/// summary bytes count toward the cap alongside the TLV body.
const MAX_BLOCK_BODY_SIZE: usize = 16 * 1024 * 1024;
29
/// BCP encoder — constructs a binary payload from structured blocks.
///
/// The encoder is the tool-facing API that allows agents, MCP servers,
/// and other producers to build BCP payloads. It follows the builder
/// pattern defined in RFC §5.6: methods like [`add_code`](Self::add_code),
/// [`add_conversation`](Self::add_conversation), etc. append typed blocks
/// to an internal list, and chainable modifiers like
/// [`with_summary`](Self::with_summary) and
/// [`with_priority`](Self::with_priority) annotate the most recently
/// added block.
///
/// # Compression (RFC §4.6)
///
/// Two compression modes are supported, both opt-in:
///
/// - **Per-block**: call [`with_compression`](Self::with_compression) after
///   adding a block, or [`compress_blocks`](Self::compress_blocks) to
///   enable compression for all blocks (those already added and all
///   subsequent ones). Each block body is independently zstd-compressed
///   if it is at least
///   [`COMPRESSION_THRESHOLD`](crate::compression::COMPRESSION_THRESHOLD)
///   bytes long and compression yields a size reduction. The block's
///   `COMPRESSED` flag (bit 1) is set when compression is applied.
///
/// - **Whole-payload**: call [`compress_payload`](Self::compress_payload)
///   to zstd-compress all bytes after the 8-byte header. When enabled,
///   per-block compression is skipped (whole-payload subsumes it). The
///   header's `COMPRESSED` flag (bit 0) is set.
///
/// # Content Addressing (RFC §4.7)
///
/// When a [`ContentStore`] is configured via
/// [`set_content_store`](Self::set_content_store), blocks can be stored
/// by their BLAKE3 hash rather than inline:
///
/// - **Per-block**: call [`with_content_addressing`](Self::with_content_addressing)
///   after adding a block. The body is hashed, stored in the content store,
///   and replaced with the 32-byte hash on the wire. The block's
///   `IS_REFERENCE` flag (bit 2) is set.
///
/// - **Auto-dedup**: call [`auto_dedup`](Self::auto_dedup) to automatically
///   content-address any block whose body has been seen before. First
///   occurrence is stored inline and registered in the store; subsequent
///   identical blocks become references.
///
/// Content addressing runs before compression — a 32-byte hash reference
/// is always below the compression threshold, so reference blocks are
/// never compressed.
///
/// # Usage
///
/// ```rust
/// use bcp_encoder::BcpEncoder;
/// use bcp_types::enums::{Lang, Role, Status, Priority};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let payload = BcpEncoder::new()
///     .add_code(Lang::Rust, "src/main.rs", b"fn main() {}")
///     .with_summary("Entry point: CLI setup and server startup.")?
///     .with_priority(Priority::High)?
///     .add_conversation(Role::User, b"Fix the timeout bug.")
///     .add_conversation(Role::Assistant, b"I'll examine the pool config...")
///     .add_tool_result("ripgrep", Status::Ok, b"3 matches found.")
///     .encode()?;
/// # Ok(())
/// # }
/// ```
///
/// # Output layout
///
/// The `.encode()` method serializes all accumulated blocks into a
/// self-contained byte sequence:
///
/// ```text
/// ┌──────────────┬──────────────────────────────────────────┐
/// │ [8 bytes]    │ File header (magic, version, flags, rsv) │
/// │ [N bytes]    │ Block 0 frame (type + flags + len + body)│
/// │ [N bytes]    │ Block 1 frame ...                        │
/// │ ...          │                                          │
/// │ [4 bytes]    │ END sentinel (type=0xFF, flags=0, len=0) │
/// └──────────────┴──────────────────────────────────────────┘
/// ```
///
/// (The END sentinel's block type `0xFF` encodes as the two-byte varint
/// `0xFF 0x01`, followed by one flags byte and one zero-length varint.)
///
/// When whole-payload compression is enabled, the layout becomes:
///
/// ```text
/// ┌──────────────┬──────────────────────────────────────────┐
/// │ [8 bytes]    │ Header (flags bit 0 = COMPRESSED)        │
/// │ [N bytes]    │ zstd(Block 0 + Block 1 + ... + END)      │
/// └──────────────┴──────────────────────────────────────────┘
/// ```
///
/// The payload is ready for storage or transmission — no further
/// framing is required.
pub struct BcpEncoder {
    /// Blocks accumulated via the `add_*` methods, in stream order.
    blocks: Vec<PendingBlock>,
    /// Base header flags; `COMPRESSED` (bit 0) may be OR'd in during
    /// `.encode()` when whole-payload compression succeeds.
    flags: HeaderFlags,
    /// When `true`, the entire payload after the header is zstd-compressed.
    compress_payload: bool,
    /// When `true`, all blocks are individually compressed (unless
    /// `compress_payload` is also set, which takes precedence).
    compress_all_blocks: bool,
    /// Content store for BLAKE3 content-addressed deduplication.
    /// Required when any block has `content_address = true` or
    /// when `auto_dedup` is enabled.
    content_store: Option<Arc<dyn ContentStore>>,
    /// When `true`, automatically content-address any block whose body
    /// has been seen before (hash already exists in the store).
    auto_dedup: bool,
}
139
/// Internal representation of a block awaiting serialization.
///
/// Captures the block type tag, the typed content (which knows how to
/// serialize its own TLV body via [`BlockContent::encode_body`]), an
/// optional summary to prepend, and per-block compression / content
/// addressing flags.
///
/// `PendingBlock` is never exposed publicly. The encoder builds these
/// internally as the caller chains `.add_*()` and `.with_*()` methods,
/// then consumes them during `.encode()`.
struct PendingBlock {
    /// Wire type tag from [`block_type`] (e.g. `block_type::CODE`).
    block_type: u8,
    /// Typed payload; serialized via [`BlockContent::encode_body`].
    content: BlockContent,
    /// Optional summary prepended to the body (sets `HAS_SUMMARY`).
    summary: Option<String>,
    /// When `true`, this block's body should be zstd-compressed if it
    /// is at least [`COMPRESSION_THRESHOLD`] bytes long and compression
    /// yields savings.
    compress: bool,
    /// When `true`, this block's body should be replaced with its
    /// 32-byte BLAKE3 hash and stored in the content store.
    content_address: bool,
}
161
162impl BcpEncoder {
163 /// Create a new encoder with default settings (version 1.0, no flags).
164 ///
165 /// The encoder starts with an empty block list, no compression, and
166 /// no content store. At least one block must be added before calling
167 /// `.encode()`, otherwise it returns [`EncodeError::EmptyPayload`].
168 #[must_use]
169 pub fn new() -> Self {
170 Self {
171 blocks: Vec::new(),
172 flags: HeaderFlags::NONE,
173 compress_payload: false,
174 compress_all_blocks: false,
175 content_store: None,
176 auto_dedup: false,
177 }
178 }
179
180 // ── Block addition methods ──────────────────────────────────────────
181 //
182 // Each method constructs the appropriate `BlockContent` variant from
183 // `bcp-types`, wraps it in a `PendingBlock`, pushes it onto the
184 // internal list, and returns `&mut Self` for chaining.
185
186 /// Add a CODE block.
187 ///
188 /// Encodes a source code file or fragment. The `lang` enum identifies
189 /// the programming language (used by the decoder for syntax-aware
190 /// rendering), `path` is the file path (UTF-8), and `content` is the
191 /// raw source bytes.
192 ///
193 /// For partial files, use [`add_code_range`](Self::add_code_range)
194 /// to include line range metadata.
195 pub fn add_code(&mut self, lang: Lang, path: &str, content: &[u8]) -> &mut Self {
196 self.push_block(
197 block_type::CODE,
198 BlockContent::Code(CodeBlock {
199 lang,
200 path: path.to_string(),
201 content: content.to_vec(),
202 line_range: None,
203 }),
204 )
205 }
206
207 /// Add a CODE block with a line range.
208 ///
209 /// Same as [`add_code`](Self::add_code) but includes `line_start` and
210 /// `line_end` metadata (1-based, inclusive). The decoder can use this
211 /// to display line numbers or to correlate with diagnostics.
212 pub fn add_code_range(
213 &mut self,
214 lang: Lang,
215 path: &str,
216 content: &[u8],
217 line_start: u32,
218 line_end: u32,
219 ) -> &mut Self {
220 self.push_block(
221 block_type::CODE,
222 BlockContent::Code(CodeBlock {
223 lang,
224 path: path.to_string(),
225 content: content.to_vec(),
226 line_range: Some((line_start, line_end)),
227 }),
228 )
229 }
230
231 /// Add a CONVERSATION block.
232 ///
233 /// Represents a single chat turn. The `role` identifies the speaker
234 /// (system, user, assistant, or tool) and `content` is the message
235 /// body as raw bytes.
236 pub fn add_conversation(&mut self, role: Role, content: &[u8]) -> &mut Self {
237 self.push_block(
238 block_type::CONVERSATION,
239 BlockContent::Conversation(ConversationBlock {
240 role,
241 content: content.to_vec(),
242 tool_call_id: None,
243 }),
244 )
245 }
246
247 /// Add a CONVERSATION block with a tool call ID.
248 ///
249 /// Used for tool-role messages that reference a specific tool
250 /// invocation. The `tool_call_id` links this response back to the
251 /// tool call that produced it.
252 pub fn add_conversation_tool(
253 &mut self,
254 role: Role,
255 content: &[u8],
256 tool_call_id: &str,
257 ) -> &mut Self {
258 self.push_block(
259 block_type::CONVERSATION,
260 BlockContent::Conversation(ConversationBlock {
261 role,
262 content: content.to_vec(),
263 tool_call_id: Some(tool_call_id.to_string()),
264 }),
265 )
266 }
267
268 /// Add a `FILE_TREE` block.
269 ///
270 /// Represents a directory structure rooted at `root`. Each entry
271 /// contains a name, kind (file or directory), size, and optional
272 /// nested children for recursive directory trees.
273 pub fn add_file_tree(&mut self, root: &str, entries: Vec<FileEntry>) -> &mut Self {
274 self.push_block(
275 block_type::FILE_TREE,
276 BlockContent::FileTree(FileTreeBlock {
277 root_path: root.to_string(),
278 entries,
279 }),
280 )
281 }
282
283 /// Add a `TOOL_RESULT` block.
284 ///
285 /// Captures the output of an external tool invocation (e.g. ripgrep,
286 /// LSP diagnostics, test runner). The `status` indicates whether the
287 /// tool succeeded, failed, or timed out.
288 pub fn add_tool_result(&mut self, name: &str, status: Status, content: &[u8]) -> &mut Self {
289 self.push_block(
290 block_type::TOOL_RESULT,
291 BlockContent::ToolResult(ToolResultBlock {
292 tool_name: name.to_string(),
293 status,
294 content: content.to_vec(),
295 schema_hint: None,
296 }),
297 )
298 }
299
300 /// Add a DOCUMENT block.
301 ///
302 /// Represents prose content — README files, documentation, wiki pages.
303 /// The `format_hint` tells the decoder how to render the body
304 /// (markdown, plain text, or HTML).
305 pub fn add_document(
306 &mut self,
307 title: &str,
308 content: &[u8],
309 format_hint: FormatHint,
310 ) -> &mut Self {
311 self.push_block(
312 block_type::DOCUMENT,
313 BlockContent::Document(DocumentBlock {
314 title: title.to_string(),
315 content: content.to_vec(),
316 format_hint,
317 }),
318 )
319 }
320
321 /// Add a `STRUCTURED_DATA` block.
322 ///
323 /// Encodes tabular or structured content — JSON configs, YAML
324 /// manifests, TOML files, CSV data. The `format` identifies the
325 /// serialization format so the decoder can syntax-highlight or
326 /// parse appropriately.
327 pub fn add_structured_data(&mut self, format: DataFormat, content: &[u8]) -> &mut Self {
328 self.push_block(
329 block_type::STRUCTURED_DATA,
330 BlockContent::StructuredData(StructuredDataBlock {
331 format,
332 content: content.to_vec(),
333 schema: None,
334 }),
335 )
336 }
337
338 /// Add a DIFF block.
339 ///
340 /// Represents code changes for a single file — from git diffs, editor
341 /// changes, or patch files. Each hunk captures a contiguous range of
342 /// modifications in unified diff format.
343 pub fn add_diff(&mut self, path: &str, hunks: Vec<DiffHunk>) -> &mut Self {
344 self.push_block(
345 block_type::DIFF,
346 BlockContent::Diff(DiffBlock {
347 path: path.to_string(),
348 hunks,
349 }),
350 )
351 }
352
353 /// Add an ANNOTATION block.
354 ///
355 /// Annotations are metadata overlays that target another block by its
356 /// zero-based index in the stream. The `kind` determines how the
357 /// `value` payload is interpreted (priority level, summary text, or
358 /// tag label).
359 ///
360 /// For the common case of attaching a priority to the most recent
361 /// block, prefer [`with_priority`](Self::with_priority).
362 pub fn add_annotation(
363 &mut self,
364 target_block_id: u32,
365 kind: AnnotationKind,
366 value: &[u8],
367 ) -> &mut Self {
368 self.push_block(
369 block_type::ANNOTATION,
370 BlockContent::Annotation(AnnotationBlock {
371 target_block_id,
372 kind,
373 value: value.to_vec(),
374 }),
375 )
376 }
377
378 /// Add an EMBEDDING_REF block.
379 ///
380 /// Points to a pre-computed vector embedding stored externally (e.g.
381 /// in a vector database). The `vector_id` is an opaque byte identifier
382 /// for the vector in the external store, `source_hash` is the BLAKE3
383 /// hash of the content that was embedded (32 bytes), and `model` is
384 /// the name of the embedding model (e.g. `"text-embedding-3-small"`).
385 ///
386 /// # Wire type
387 ///
388 /// Block type `0x09` (`EMBEDDING_REF`). See RFC §4.4.
389 pub fn add_embedding_ref(
390 &mut self,
391 vector_id: &[u8],
392 source_hash: &[u8],
393 model: &str,
394 ) -> &mut Self {
395 self.push_block(
396 block_type::EMBEDDING_REF,
397 BlockContent::EmbeddingRef(EmbeddingRefBlock {
398 vector_id: vector_id.to_vec(),
399 source_hash: source_hash.to_vec(),
400 model: model.to_string(),
401 }),
402 )
403 }
404
405 /// Add an IMAGE block.
406 ///
407 /// Encodes an image as inline binary data. The `media_type` identifies
408 /// the image format (PNG, JPEG, etc.), `alt_text` provides a textual
409 /// description for accessibility, and `data` is the raw image bytes.
410 pub fn add_image(&mut self, media_type: MediaType, alt_text: &str, data: &[u8]) -> &mut Self {
411 self.push_block(
412 block_type::IMAGE,
413 BlockContent::Image(ImageBlock {
414 media_type,
415 alt_text: alt_text.to_string(),
416 data: data.to_vec(),
417 }),
418 )
419 }
420
421 /// Add an EXTENSION block.
422 ///
423 /// User-defined block type for custom payloads. The `namespace` and
424 /// `type_name` together form a unique identifier for the extension
425 /// type, preventing collisions across different tools and vendors.
426 pub fn add_extension(&mut self, namespace: &str, type_name: &str, content: &[u8]) -> &mut Self {
427 self.push_block(
428 block_type::EXTENSION,
429 BlockContent::Extension(ExtensionBlock {
430 namespace: namespace.to_string(),
431 type_name: type_name.to_string(),
432 content: content.to_vec(),
433 }),
434 )
435 }
436
437 // ── Modifier methods ────────────────────────────────────────────────
438 //
439 // Modifiers act on the most recently added block. They set metadata
440 // that affects how the block is serialized (summary prefix, flags)
441 // or append related blocks (priority annotation).
442
443 /// Attach a summary to the most recently added block.
444 ///
445 /// Sets the `HAS_SUMMARY` flag on the block and prepends the summary
446 /// sub-block to the body during serialization. The summary is a
447 /// compact UTF-8 description that the token budget engine can use as
448 /// a stand-in when the full block content would exceed the budget.
449 ///
450 /// # Errors
451 ///
452 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
453 /// added yet. Use this immediately after an `.add_*()` call.
454 pub fn with_summary(&mut self, summary: &str) -> Result<&mut Self, EncodeError> {
455 let block = self
456 .blocks
457 .last_mut()
458 .ok_or(EncodeError::NoBlockTarget { method: "with_summary" })?;
459 block.summary = Some(summary.to_string());
460 Ok(self)
461 }
462
463 /// Attach a priority annotation to the most recently added block.
464 ///
465 /// This is a convenience method that appends an ANNOTATION block
466 /// with `kind=Priority` targeting the last added block's index.
467 /// The annotation's value is the priority byte (e.g. `0x02` for
468 /// `Priority::High`).
469 ///
470 /// # Errors
471 ///
472 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
473 /// added yet.
474 pub fn with_priority(&mut self, priority: Priority) -> Result<&mut Self, EncodeError> {
475 let target_index = self
476 .blocks
477 .len()
478 .checked_sub(1)
479 .ok_or(EncodeError::NoBlockTarget { method: "with_priority" })?;
480
481 #[allow(clippy::cast_possible_truncation)]
482 let target_id = target_index as u32;
483
484 self.push_block(
485 block_type::ANNOTATION,
486 BlockContent::Annotation(AnnotationBlock {
487 target_block_id: target_id,
488 kind: AnnotationKind::Priority,
489 value: vec![priority.to_wire_byte()],
490 }),
491 );
492 Ok(self)
493 }
494
495 // ── Compression modifiers ────────────────────────────────────────────
496 //
497 // These methods control per-block and whole-payload zstd compression.
498 // Per-block compression is skipped when whole-payload compression is
499 // enabled — the outer zstd frame subsumes individual block compression.
500
501 /// Enable zstd compression for the most recently added block.
502 ///
503 /// During `.encode()`, the block body is compressed with zstd if it
504 /// exceeds [`COMPRESSION_THRESHOLD`] bytes and compression yields a
505 /// size reduction. If compression doesn't help (output >= input), the
506 /// body is stored uncompressed and the `COMPRESSED` flag is not set.
507 ///
508 /// Has no effect if [`compress_payload`](Self::compress_payload) is
509 /// also enabled — whole-payload compression takes precedence.
510 ///
511 /// # Errors
512 ///
513 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
514 /// added yet.
515 pub fn with_compression(&mut self) -> Result<&mut Self, EncodeError> {
516 let block = self
517 .blocks
518 .last_mut()
519 .ok_or(EncodeError::NoBlockTarget { method: "with_compression" })?;
520 block.compress = true;
521 Ok(self)
522 }
523
524 /// Enable zstd compression for all blocks added so far and all
525 /// future blocks.
526 ///
527 /// Equivalent to calling [`with_compression`](Self::with_compression)
528 /// on every block. Individual blocks still respect the size threshold
529 /// and no-savings guard.
530 pub fn compress_blocks(&mut self) -> &mut Self {
531 self.compress_all_blocks = true;
532 for block in &mut self.blocks {
533 block.compress = true;
534 }
535 self
536 }
537
538 /// Enable whole-payload zstd compression.
539 ///
540 /// When set, the entire block stream (all frames + END sentinel) is
541 /// compressed as a single zstd frame. The 8-byte header is written
542 /// uncompressed with `HeaderFlags::COMPRESSED` set so the decoder
543 /// can detect compression before reading further.
544 ///
545 /// When whole-payload compression is enabled, per-block compression
546 /// is skipped — compressing within a compressed stream adds overhead
547 /// without benefit.
548 ///
549 /// If compression doesn't reduce the total size, the payload is
550 /// stored uncompressed and the header flag is not set.
551 ///
552 /// **Tradeoff**: Whole-payload compression disables incremental
553 /// streaming in `StreamingDecoder` — the decoder must buffer and
554 /// decompress the entire payload before yielding any blocks. If
555 /// streaming is important, use [`compress_blocks`](Self::compress_blocks)
556 /// instead.
557 pub fn compress_payload(&mut self) -> &mut Self {
558 self.compress_payload = true;
559 self
560 }
561
562 // ── Content addressing modifiers ────────────────────────────────────
563 //
564 // These methods control BLAKE3 content-addressed deduplication.
565 // A content store must be configured before blocks can be
566 // content-addressed.
567
568 /// Set the content store used for BLAKE3 content addressing.
569 ///
570 /// The store is shared via `Arc` so the same store can be passed to
571 /// both the encoder and decoder for roundtrip workflows. The encoder
572 /// calls `store.put()` for each content-addressed block; the decoder
573 /// calls `store.get()` to resolve references.
574 ///
575 /// Must be called before `.encode()` if any block has content
576 /// addressing enabled or if [`auto_dedup`](Self::auto_dedup) is set.
577 pub fn set_content_store(&mut self, store: Arc<dyn ContentStore>) -> &mut Self {
578 self.content_store = Some(store);
579 self
580 }
581
582 /// Enable content addressing for the most recently added block.
583 ///
584 /// During `.encode()`, the block body is hashed with BLAKE3,
585 /// stored in the content store, and replaced with the 32-byte hash
586 /// on the wire. The block's `IS_REFERENCE` flag (bit 2) is set.
587 ///
588 /// Requires a content store — call
589 /// [`set_content_store`](Self::set_content_store) before `.encode()`.
590 ///
591 /// Content addressing runs before compression. Since a 32-byte
592 /// hash reference is always below [`COMPRESSION_THRESHOLD`],
593 /// reference blocks are never per-block compressed.
594 ///
595 /// # Errors
596 ///
597 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
598 /// added yet.
599 pub fn with_content_addressing(&mut self) -> Result<&mut Self, EncodeError> {
600 let block = self
601 .blocks
602 .last_mut()
603 .ok_or(EncodeError::NoBlockTarget { method: "with_content_addressing" })?;
604 block.content_address = true;
605 Ok(self)
606 }
607
608 /// Enable automatic deduplication across all blocks.
609 ///
610 /// When set, the encoder hashes every block body with BLAKE3 during
611 /// `.encode()`. If the hash already exists in the content store
612 /// (i.e. a previous block in this or a prior encoding had the same
613 /// content), the block is automatically replaced with a hash
614 /// reference. First-occurrence blocks are stored inline and
615 /// registered in the store for future dedup.
616 ///
617 /// Requires a content store — call
618 /// [`set_content_store`](Self::set_content_store) before `.encode()`.
619 pub fn auto_dedup(&mut self) -> &mut Self {
620 self.auto_dedup = true;
621 self
622 }
623
624 // ── Encode ──────────────────────────────────────────────────────────
625
    /// Serialize all accumulated blocks into a complete BCP payload.
    ///
    /// The encode pipeline processes each `PendingBlock` through up to
    /// three stages:
    ///
    /// 1. **Serialize** — calls [`BlockContent::encode_body`] to get
    ///    the TLV-encoded body bytes. If a summary is present, it is
    ///    prepended and the `HAS_SUMMARY` flag is set.
    ///
    /// 2. **Content address** (optional) — if the block has
    ///    `content_address = true` or auto-dedup detects a duplicate,
    ///    the body is hashed with BLAKE3, stored in the content store,
    ///    and replaced with the 32-byte hash. The `IS_REFERENCE` flag
    ///    (bit 2) is set.
    ///
    /// 3. **Per-block compress** (optional) — if compression is enabled
    ///    for this block, whole-payload compression is NOT active, and
    ///    the body is not a reference, the body is zstd-compressed if
    ///    it is at least [`COMPRESSION_THRESHOLD`] bytes long and
    ///    compression yields savings. The `COMPRESSED` flag (bit 1) is
    ///    set.
    ///
    /// After all blocks, the END sentinel is appended. If whole-payload
    /// compression is enabled, everything after the 8-byte header is
    /// compressed as a single zstd frame and the header's `COMPRESSED`
    /// flag is set.
    ///
    /// # Errors
    ///
    /// - [`EncodeError::EmptyPayload`] if no blocks have been added.
    /// - [`EncodeError::BlockTooLarge`] if any block body exceeds 16 MiB.
    /// - [`EncodeError::MissingContentStore`] if content addressing is
    ///   requested but no store has been configured.
    /// - [`EncodeError::Wire`] if the underlying wire serialization fails.
    /// - [`EncodeError::Io`] if writing to the output buffer fails.
    pub fn encode(&self) -> Result<Vec<u8>, EncodeError> {
        if self.blocks.is_empty() {
            return Err(EncodeError::EmptyPayload);
        }

        // Validate up front: if any block needs content addressing or
        // auto_dedup is enabled, a store must be present. Failing here
        // avoids a partially-built payload.
        let needs_store = self.auto_dedup || self.blocks.iter().any(|b| b.content_address);
        if needs_store && self.content_store.is_none() {
            return Err(EncodeError::MissingContentStore);
        }

        // Pre-allocate: 8 bytes header + estimated block data + END sentinel.
        // (256 bytes per block is only a capacity heuristic; the Vec grows
        // as needed and the estimate never affects correctness.)
        let estimated_size = HEADER_SIZE + self.blocks.len() * 256 + 3;
        let mut output = Vec::with_capacity(estimated_size);

        // 1. Write a placeholder header (flags may be updated for whole-payload).
        //    The real header is written over these zero bytes in step 5,
        //    once the final flag state is known.
        output.resize(HEADER_SIZE, 0);

        // 2. Serialize each pending block through the encode pipeline.
        for pending in &self.blocks {
            let mut body = Self::serialize_block_body(pending)?;
            let mut flags_raw = 0u8;

            if pending.summary.is_some() {
                flags_raw |= BlockFlags::HAS_SUMMARY.raw();
            }

            // Stage 2: Content addressing (runs before compression).
            let is_reference = self.apply_content_addressing(pending, &mut body)?;
            if is_reference {
                flags_raw |= BlockFlags::IS_REFERENCE.raw();
            }

            // Stage 3: Per-block compression (skipped for references and
            // when whole-payload compression is active). `compress`
            // returns None when compression yields no size reduction,
            // in which case the body stays uncompressed and the flag
            // stays clear.
            if !is_reference && !self.compress_payload {
                let should_compress = pending.compress || self.compress_all_blocks;
                if should_compress && body.len() >= COMPRESSION_THRESHOLD {
                    if let Some(compressed) = compression::compress(&body) {
                        body = compressed;
                        flags_raw |= BlockFlags::COMPRESSED.raw();
                    }
                }
            }

            let frame = BlockFrame {
                block_type: pending.block_type,
                flags: BlockFlags::from_raw(flags_raw),
                body,
            };
            frame.write_to(&mut output)?;
        }

        // 3. Write the END sentinel (zero-length frame of type 0xFF).
        let end_frame = BlockFrame {
            block_type: block_type::END,
            flags: BlockFlags::NONE,
            body: Vec::new(),
        };
        end_frame.write_to(&mut output)?;

        // 4. Whole-payload compression: compress everything after the header.
        //    On a `None` result (no savings) the payload is left as-is and
        //    the header flag is not set, so decoders see a plain stream.
        let header_flags = if self.compress_payload {
            let block_data = &output[HEADER_SIZE..];
            match compression::compress(block_data) {
                Some(compressed) => {
                    output.truncate(HEADER_SIZE);
                    output.extend_from_slice(&compressed);
                    HeaderFlags::from_raw(self.flags.raw() | HeaderFlags::COMPRESSED.raw())
                }
                None => self.flags,
            }
        } else {
            self.flags
        };

        // 5. Overwrite the placeholder with the final header and flags.
        let header = BcpHeader::new(header_flags);
        header.write_to(&mut output[..HEADER_SIZE])?;

        Ok(output)
    }
743
744 // ── Internal helpers ────────────────────────────────────────────────
745
746 /// Push a new `PendingBlock` onto the internal list.
747 ///
748 /// If `compress_all_blocks` is set, the new block inherits
749 /// `compress = true` automatically.
750 ///
751 /// Returns `&mut Self` so callers can chain additional methods.
752 fn push_block(&mut self, block_type: u8, content: BlockContent) -> &mut Self {
753 self.blocks.push(PendingBlock {
754 block_type,
755 content,
756 summary: None,
757 compress: self.compress_all_blocks,
758 content_address: false,
759 });
760 self
761 }
762
763 /// Apply content addressing to a block body if requested.
764 ///
765 /// Returns `true` if the body was replaced with a 32-byte hash
766 /// reference, `false` if the body is unchanged (inline).
767 ///
768 /// Two paths trigger content addressing:
769 /// 1. `pending.content_address == true` — always replace with hash.
770 /// 2. `self.auto_dedup == true` — replace only if the hash already
771 /// exists in the store (i.e. a duplicate). First occurrence is
772 /// stored inline and registered for future dedup.
773 fn apply_content_addressing(
774 &self,
775 pending: &PendingBlock,
776 body: &mut Vec<u8>,
777 ) -> Result<bool, EncodeError> {
778 let store = match &self.content_store {
779 Some(s) => s,
780 None => return Ok(false),
781 };
782
783 if pending.content_address {
784 // Explicit content addressing: always replace with hash.
785 let hash = store.put(body);
786 *body = hash.to_vec();
787 return Ok(true);
788 }
789
790 if self.auto_dedup {
791 // Auto-dedup: check if this body was seen before.
792 let hash: [u8; 32] = blake3::hash(body).into();
793 if store.contains(&hash) {
794 // Duplicate — replace with reference.
795 *body = hash.to_vec();
796 return Ok(true);
797 }
798 // First occurrence — store for future dedup, keep inline.
799 store.put(body);
800 }
801
802 Ok(false)
803 }
804
805 /// Serialize a `PendingBlock` into its final body bytes.
806 ///
807 /// If the block has a summary, the summary is encoded first (as a
808 /// length-prefixed UTF-8 string) followed by the TLV body fields.
809 /// This matches the wire convention: when `HAS_SUMMARY` is set, the
810 /// summary occupies the front of the body, before any TLV fields.
811 fn serialize_block_body(pending: &PendingBlock) -> Result<Vec<u8>, EncodeError> {
812 let tlv_body = pending.content.encode_body();
813 let mut body = Vec::new();
814
815 if let Some(ref summary_text) = pending.summary {
816 let summary = Summary {
817 text: summary_text.clone(),
818 };
819 summary.encode(&mut body);
820 }
821
822 body.extend_from_slice(&tlv_body);
823
824 if body.len() > MAX_BLOCK_BODY_SIZE {
825 return Err(EncodeError::BlockTooLarge {
826 size: body.len(),
827 limit: MAX_BLOCK_BODY_SIZE,
828 });
829 }
830
831 Ok(body)
832 }
833}
834
835impl Default for BcpEncoder {
836 fn default() -> Self {
837 Self::new()
838 }
839}
840
841#[cfg(test)]
842mod tests {
843 use super::*;
844 use bcp_types::file_tree::FileEntryKind;
845 use bcp_wire::header::BCP_MAGIC;
846
847 // ── Helper ──────────────────────────────────────────────────────────
848
849 /// Verify that a payload starts with the BCP magic number.
850 fn assert_starts_with_magic(payload: &[u8]) {
851 assert!(payload.len() >= HEADER_SIZE, "payload too short for header");
852 assert_eq!(&payload[..4], &BCP_MAGIC, "missing BCP magic");
853 }
854
855 /// Verify that a payload ends with a valid END sentinel.
856 ///
857 /// The END sentinel is: block_type=0xFF as varint (2 bytes: 0xFF 0x01),
858 /// flags=0x00, content_len=0 as varint (1 byte: 0x00).
859 fn assert_ends_with_end_sentinel(payload: &[u8]) {
860 // The END block type 0xFF encodes as varint [0xFF, 0x01],
861 // followed by flags byte 0x00, followed by content_len varint 0x00.
862 let tail = &payload[payload.len() - 4..];
863 assert_eq!(tail, &[0xFF, 0x01, 0x00, 0x00], "missing END sentinel");
864 }
865
866 // ── Acceptance criteria tests ───────────────────────────────────────
867
868 #[test]
869 fn encode_single_code_block_produces_valid_magic() {
870 let payload = BcpEncoder::new()
871 .add_code(Lang::Rust, "src/main.rs", b"fn main() {}")
872 .encode()
873 .unwrap();
874
875 assert_starts_with_magic(&payload);
876 }
877
878 #[test]
879 fn builder_methods_are_chainable() {
880 let payload = BcpEncoder::new()
881 .add_code(Lang::Rust, "src/lib.rs", b"pub fn hello() {}")
882 .with_summary("Hello function.").unwrap()
883 .add_conversation(Role::User, b"What does this do?")
884 .encode()
885 .unwrap();
886
887 assert_starts_with_magic(&payload);
888 assert_ends_with_end_sentinel(&payload);
889 }
890
    #[test]
    fn with_summary_sets_has_summary_flag() {
        // with_summary must set the frame-level HAS_SUMMARY flag on the
        // most recently added block.
        let payload = BcpEncoder::new()
            .add_code(Lang::Python, "main.py", b"print('hi')")
            .with_summary("Prints a greeting.").unwrap()
            .encode()
            .unwrap();

        // Parse: skip the 8-byte header, read the first block frame.
        let frame_buf = &payload[HEADER_SIZE..];
        let (frame, _) = BlockFrame::read_from(frame_buf).unwrap().unwrap();
        assert!(
            frame.flags.has_summary(),
            "HAS_SUMMARY flag should be set on the code block"
        );
    }
907
    #[test]
    fn with_priority_appends_annotation_block() {
        // with_priority emits a separate ANNOTATION block that targets the
        // previously added block by id.
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "lib.rs", b"// code")
            .with_priority(Priority::High).unwrap()
            .encode()
            .unwrap();

        // Parse: header + first block (CODE) + second block (ANNOTATION) + END
        let mut cursor = HEADER_SIZE;

        // Block 0: CODE
        let (frame0, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert_eq!(frame0.block_type, block_type::CODE);
        cursor += n;

        // Block 1: ANNOTATION (priority)
        let (frame1, _) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert_eq!(frame1.block_type, block_type::ANNOTATION);

        // Decode the annotation body and verify it targets block 0
        let annotation = AnnotationBlock::decode_body(&frame1.body).unwrap();
        assert_eq!(annotation.target_block_id, 0);
        assert_eq!(annotation.kind, AnnotationKind::Priority);
        assert_eq!(annotation.value, vec![Priority::High.to_wire_byte()]);
    }
934
935 #[test]
936 fn empty_encoder_returns_empty_payload_error() {
937 let result = BcpEncoder::new().encode();
938 assert!(matches!(result, Err(EncodeError::EmptyPayload)));
939 }
940
941 #[test]
942 fn payload_ends_with_end_sentinel() {
943 let payload = BcpEncoder::new()
944 .add_conversation(Role::User, b"hello")
945 .encode()
946 .unwrap();
947
948 assert_ends_with_end_sentinel(&payload);
949 }
950
    #[test]
    fn all_eleven_block_types_encode_without_error() {
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .add_conversation(Role::User, b"hello")
            .add_file_tree(
                "/project",
                vec![FileEntry {
                    name: "lib.rs".to_string(),
                    kind: FileEntryKind::File,
                    size: 100,
                    children: vec![],
                }],
            )
            .add_tool_result("rg", Status::Ok, b"found 3 matches")
            .add_document("README", b"# Title", FormatHint::Markdown)
            .add_structured_data(DataFormat::Json, b"{\"key\": \"value\"}")
            .add_diff(
                "src/lib.rs",
                vec![DiffHunk {
                    old_start: 1,
                    new_start: 1,
                    lines: b"+new line\n".to_vec(),
                }],
            )
            .add_annotation(0, AnnotationKind::Tag, b"important")
            .add_embedding_ref(b"vec-001", &[0xAB; 32], "text-embedding-3-small")
            .add_image(MediaType::Png, "screenshot", b"\x89PNG\r\n")
            .add_extension("myco", "custom_block", b"custom data")
            .encode()
            .unwrap();

        assert_starts_with_magic(&payload);
        assert_ends_with_end_sentinel(&payload);

        // Walk all 11 content blocks — one per add_* call above (10
        // non-annotation blocks + 1 explicit annotation). The END
        // sentinel yields `None` and terminates the walk.
        let mut cursor = HEADER_SIZE;
        let mut block_count = 0;
        loop {
            match BlockFrame::read_from(&payload[cursor..]).unwrap() {
                Some((_, n)) => {
                    cursor += n;
                    block_count += 1;
                }
                None => break, // END sentinel
            }
        }
        assert_eq!(block_count, 11, "expected 11 content blocks");
    }
1000
1001 #[test]
1002 fn payload_byte_length_matches_calculation() {
1003 let mut enc = BcpEncoder::new();
1004 enc.add_code(Lang::Rust, "x.rs", b"let x = 1;");
1005 enc.add_conversation(Role::User, b"hi");
1006
1007 let payload = enc.encode().unwrap();
1008
1009 // Calculate expected size manually:
1010 // Header: 8 bytes
1011 let mut expected = HEADER_SIZE;
1012
1013 // Walk actual frames to verify
1014 let mut cursor = HEADER_SIZE;
1015 loop {
1016 let remaining = &payload[cursor..];
1017 // Try to read a frame (including END which returns None)
1018 let start = cursor;
1019 match BlockFrame::read_from(remaining).unwrap() {
1020 Some((_, n)) => {
1021 cursor += n;
1022 expected += n;
1023 }
1024 None => {
1025 // END sentinel was consumed — count those bytes too
1026 let end_bytes = payload.len() - start;
1027 expected += end_bytes;
1028 break;
1029 }
1030 }
1031 }
1032
1033 assert_eq!(
1034 payload.len(),
1035 expected,
1036 "payload length should match header + frames + END"
1037 );
1038 }
1039
    #[test]
    fn optional_fields_omitted_when_none() {
        // Optional TLV fields that were never set must decode back as
        // `None`, not as empty values.

        // CODE block without line_range
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "x.rs", b"code")
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        // Decode the body and verify line_range is None
        let code = CodeBlock::decode_body(&frame.body).unwrap();
        assert!(code.line_range.is_none());

        // CONVERSATION block without tool_call_id
        let payload = BcpEncoder::new()
            .add_conversation(Role::User, b"msg")
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        let conv = ConversationBlock::decode_body(&frame.body).unwrap();
        assert!(conv.tool_call_id.is_none());
    }
1069
1070 #[test]
1071 fn code_range_includes_line_numbers() {
1072 let payload = BcpEncoder::new()
1073 .add_code_range(Lang::Rust, "src/lib.rs", b"fn foo() {}", 10, 20)
1074 .encode()
1075 .unwrap();
1076
1077 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1078 .unwrap()
1079 .unwrap();
1080
1081 let code = CodeBlock::decode_body(&frame.body).unwrap();
1082 assert_eq!(code.line_range, Some((10, 20)));
1083 }
1084
1085 #[test]
1086 fn conversation_tool_includes_tool_call_id() {
1087 let payload = BcpEncoder::new()
1088 .add_conversation_tool(Role::Tool, b"result", "call_123")
1089 .encode()
1090 .unwrap();
1091
1092 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1093 .unwrap()
1094 .unwrap();
1095
1096 let conv = ConversationBlock::decode_body(&frame.body).unwrap();
1097 assert_eq!(conv.tool_call_id.as_deref(), Some("call_123"));
1098 }
1099
    #[test]
    fn summary_is_decodable_from_block_body() {
        // A summary is prepended to the block body (not stored in its own
        // frame): Summary first, then the normal TLV body.
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .with_summary("Entry point for the application.").unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        assert!(frame.flags.has_summary());

        // Decode summary from the front of the body
        let (summary, consumed) = Summary::decode(&frame.body).unwrap();
        assert_eq!(summary.text, "Entry point for the application.");

        // Remaining bytes should decode as a valid CodeBlock
        let code = CodeBlock::decode_body(&frame.body[consumed..]).unwrap();
        assert_eq!(code.path, "main.rs");
        assert_eq!(code.content, b"fn main() {}");
    }
1123
    #[test]
    fn rfc_example_encodes_successfully() {
        // Reproduces the example from RFC §12.1 / SPEC_03 §1
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "src/main.rs", b"fn main() { todo!() }")
            .with_summary("Entry point: CLI setup and server startup.").unwrap()
            .with_priority(Priority::High).unwrap()
            .add_conversation(Role::User, b"Fix the timeout bug.")
            .add_conversation(Role::Assistant, b"I'll examine the pool config...")
            .add_tool_result("ripgrep", Status::Ok, b"3 matches found.")
            .encode()
            .unwrap();

        assert_starts_with_magic(&payload);
        assert_ends_with_end_sentinel(&payload);

        // Walk all frames to verify structure
        let mut cursor = HEADER_SIZE;
        let mut types = Vec::new();
        loop {
            match BlockFrame::read_from(&payload[cursor..]).unwrap() {
                Some((frame, n)) => {
                    types.push(frame.block_type);
                    cursor += n;
                }
                None => break,
            }
        }

        // The summary rides inside the CODE frame (no frame of its own),
        // while with_priority contributes a standalone ANNOTATION frame.
        assert_eq!(
            types,
            vec![
                block_type::CODE,
                block_type::ANNOTATION, // from with_priority
                block_type::CONVERSATION,
                block_type::CONVERSATION,
                block_type::TOOL_RESULT,
            ]
        );
    }
1164
1165 #[test]
1166 fn default_impl_matches_new() {
1167 let from_new = BcpEncoder::new();
1168 let from_default = BcpEncoder::default();
1169 assert!(from_new.blocks.is_empty());
1170 assert!(from_default.blocks.is_empty());
1171 }
1172
1173 // ── Per-block compression tests ─────────────────────────────────────
1174
    #[test]
    fn per_block_compression_sets_compressed_flag() {
        // Create a large, compressible block (exceeds COMPRESSION_THRESHOLD)
        let big_content = "fn main() { println!(\"hello world\"); }\n".repeat(50);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(
            frame.flags.is_compressed(),
            "COMPRESSED flag should be set on a large compressible block"
        );
        // The compressed body (TLV overhead included) must still come in
        // under the raw source length for such repetitive input.
        assert!(
            frame.body.len() < big_content.len(),
            "compressed body should be smaller than original"
        );
    }
1197
1198 #[test]
1199 fn small_block_not_compressed_even_when_requested() {
1200 let payload = BcpEncoder::new()
1201 .add_code(Lang::Rust, "x.rs", b"let x = 1;")
1202 .with_compression().unwrap()
1203 .encode()
1204 .unwrap();
1205
1206 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1207 .unwrap()
1208 .unwrap();
1209 assert!(
1210 !frame.flags.is_compressed(),
1211 "small blocks should not be compressed (below threshold)"
1212 );
1213 }
1214
    #[test]
    fn compress_blocks_applies_to_all() {
        // compress_blocks() requests per-block compression for all blocks,
        // not just the most recently added one.
        let big_content = "use std::io;\n".repeat(100);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "a.rs", big_content.as_bytes())
            .add_code(Lang::Rust, "b.rs", big_content.as_bytes())
            .compress_blocks()
            .encode()
            .unwrap();

        // Both frames must carry the COMPRESSED flag.
        let mut cursor = HEADER_SIZE;
        for _ in 0..2 {
            let (frame, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
            assert!(
                frame.flags.is_compressed(),
                "all blocks should be compressed with compress_blocks()"
            );
            cursor += n;
        }
    }
1235
1236 // ── Whole-payload compression tests ─────────────────────────────────
1237
1238 #[test]
1239 fn whole_payload_compression_sets_header_flag() {
1240 let big_content = "pub fn hello() -> &'static str { \"world\" }\n".repeat(100);
1241 let payload = BcpEncoder::new()
1242 .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
1243 .compress_payload()
1244 .encode()
1245 .unwrap();
1246
1247 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1248 assert!(
1249 header.flags.is_compressed(),
1250 "header COMPRESSED flag should be set for whole-payload compression"
1251 );
1252 }
1253
1254 #[test]
1255 fn whole_payload_skips_per_block_compression() {
1256 // When whole-payload compression is active, individual block
1257 // COMPRESSED flags should NOT be set.
1258 let big_content = "pub fn hello() -> &'static str { \"world\" }\n".repeat(100);
1259 let payload = BcpEncoder::new()
1260 .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
1261 .with_compression().unwrap()
1262 .compress_payload()
1263 .encode()
1264 .unwrap();
1265
1266 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1267 assert!(header.flags.is_compressed());
1268
1269 // Decompress the payload to check individual blocks
1270 let decompressed =
1271 crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();
1272
1273 let (frame, _) = BlockFrame::read_from(&decompressed).unwrap().unwrap();
1274 assert!(
1275 !frame.flags.is_compressed(),
1276 "per-block COMPRESSED flag should not be set when whole-payload is active"
1277 );
1278 }
1279
1280 #[test]
1281 fn whole_payload_no_savings_stays_uncompressed() {
1282 // Tiny payload — zstd overhead exceeds savings
1283 let payload = BcpEncoder::new()
1284 .add_code(Lang::Rust, "x.rs", b"x")
1285 .compress_payload()
1286 .encode()
1287 .unwrap();
1288
1289 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1290 assert!(
1291 !header.flags.is_compressed(),
1292 "header COMPRESSED flag should NOT be set when compression yields no savings"
1293 );
1294 }
1295
1296 // ── Content addressing tests ────────────────────────────────────────
1297
    #[test]
    fn content_addressing_sets_reference_flag() {
        // A content-addressed block carries only its 32-byte hash in the
        // frame body; the full content lives in the store.
        let store = Arc::new(crate::MemoryContentStore::new());
        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .with_content_addressing().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(
            frame.flags.is_reference(),
            "IS_REFERENCE flag should be set on content-addressed block"
        );
        assert_eq!(
            frame.body.len(),
            32,
            "reference block body should be exactly 32 bytes (BLAKE3 hash)"
        );

        // The hash should resolve in the store
        let hash: [u8; 32] = frame.body.try_into().unwrap();
        assert!(store.contains(&hash));
    }
1325
1326 #[test]
1327 fn content_addressing_without_store_errors() {
1328 let result = BcpEncoder::new()
1329 .add_code(Lang::Rust, "main.rs", b"fn main() {}")
1330 .with_content_addressing().unwrap()
1331 .encode();
1332
1333 assert!(
1334 matches!(result, Err(EncodeError::MissingContentStore)),
1335 "should error when content addressing is requested without a store"
1336 );
1337 }
1338
    #[test]
    fn auto_dedup_detects_duplicate_blocks() {
        let store = Arc::new(crate::MemoryContentStore::new());
        // Both blocks must have identical serialized TLV bodies (same
        // path + content + lang) for auto-dedup to detect a duplicate.
        let content = b"fn main() {}";

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .auto_dedup()
            .add_code(Lang::Rust, "main.rs", content)
            .add_code(Lang::Rust, "main.rs", content) // identical TLV body
            .encode()
            .unwrap();

        let mut cursor = HEADER_SIZE;

        // First block: inline (first occurrence)
        let (frame0, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert!(
            !frame0.flags.is_reference(),
            "first occurrence should be stored inline"
        );
        cursor += n;

        // Second block: reference (duplicate) — body is the 32-byte hash
        let (frame1, _) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert!(
            frame1.flags.is_reference(),
            "duplicate should become a hash reference"
        );
        assert_eq!(frame1.body.len(), 32);
    }
1372
1373 #[test]
1374 fn auto_dedup_without_store_errors() {
1375 let result = BcpEncoder::new()
1376 .auto_dedup()
1377 .add_code(Lang::Rust, "x.rs", b"code")
1378 .encode();
1379
1380 assert!(matches!(result, Err(EncodeError::MissingContentStore)));
1381 }
1382
    #[test]
    fn reference_block_not_per_block_compressed() {
        // Content-addressed blocks produce 32-byte bodies which are
        // below the compression threshold — verify no COMPRESSED flag.
        // Both chained modifiers target the same (most recent) block.
        let store = Arc::new(crate::MemoryContentStore::new());
        let big_content = "fn main() { println!(\"hello\"); }\n".repeat(50);
        let payload = BcpEncoder::new()
            .set_content_store(store)
            .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
            .with_content_addressing().unwrap()
            .with_compression().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(frame.flags.is_reference());
        assert!(
            !frame.flags.is_compressed(),
            "reference blocks should not be per-block compressed"
        );
    }
1406
    #[test]
    fn content_addressing_with_whole_payload_compression() {
        // Reference blocks CAN be wrapped in whole-payload compression.
        let store = Arc::new(crate::MemoryContentStore::new());
        // Same path + content = identical TLV body = single store entry
        let content = "fn main() { println!(\"hello\"); }\n".repeat(50);

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .compress_payload()
            .add_code(Lang::Rust, "main.rs", content.as_bytes())
            .with_content_addressing().unwrap()
            .add_code(Lang::Rust, "main.rs", content.as_bytes())
            .with_content_addressing().unwrap()
            .encode()
            .unwrap();

        let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
        // The payload might or might not be compressed (two 32-byte hashes
        // plus framing may not compress well), but if it is, verify it's valid.
        if header.flags.is_compressed() {
            let decompressed =
                crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();

            // The first inner frame should still be a 32-byte hash reference.
            let (frame, _) = BlockFrame::read_from(&decompressed).unwrap().unwrap();
            assert!(frame.flags.is_reference());
            assert_eq!(frame.body.len(), 32);
        }

        // Both blocks have identical TLV bodies → single store entry
        assert_eq!(
            store.len(),
            1,
            "identical blocks should produce one store entry"
        );
    }
1443
1444 // ── Phase 4: Cross-cutting tests ────────────────────────────────────
1445
    #[test]
    fn compression_ratio_benchmark() {
        // A realistic 50-line Rust file should compress by >= 20%.
        let rust_code = r#"use std::collections::HashMap;
use std::sync::Arc;

pub struct Config {
    pub name: String,
    pub values: HashMap<String, String>,
    pub timeout: u64,
}

impl Config {
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            values: HashMap::new(),
            timeout: 30,
        }
    }

    pub fn set(&mut self, key: &str, value: &str) {
        self.values.insert(key.to_string(), value.to_string());
    }

    pub fn get(&self, key: &str) -> Option<&String> {
        self.values.get(key)
    }

    pub fn timeout(&self) -> u64 {
        self.timeout
    }
}

impl Default for Config {
    fn default() -> Self {
        Self::new("default")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_config() {
        let config = Config::new("test");
        assert_eq!(config.name, "test");
        assert!(config.values.is_empty());
        assert_eq!(config.timeout(), 30);
    }

    #[test]
    fn test_set_and_get() {
        let mut config = Config::new("test");
        config.set("key", "value");
        assert_eq!(config.get("key"), Some(&"value".to_string()));
    }
}
"#;

        let uncompressed_payload = BcpEncoder::new()
            .add_code(Lang::Rust, "config.rs", rust_code.as_bytes())
            .encode()
            .unwrap();

        let compressed_payload = BcpEncoder::new()
            .add_code(Lang::Rust, "config.rs", rust_code.as_bytes())
            .with_compression().unwrap()
            .encode()
            .unwrap();

        // Savings are measured over the full payload (header and frame
        // overhead included), not just the block body.
        let savings_pct =
            100.0 * (1.0 - compressed_payload.len() as f64 / uncompressed_payload.len() as f64);

        assert!(
            savings_pct >= 20.0,
            "expected >= 20% compression savings on a 50-line Rust file, got {savings_pct:.1}%"
        );
    }
1526
    #[test]
    fn whole_payload_wins_over_per_block() {
        // When both per-block and whole-payload compression are requested,
        // only the header COMPRESSED flag should be set; individual blocks
        // should NOT have their COMPRESSED flags set.
        let big_content = "pub fn process() -> Result<(), Error> { Ok(()) }\n".repeat(50);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "a.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .add_code(Lang::Rust, "b.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .compress_payload()
            .encode()
            .unwrap();

        let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
        assert!(
            header.flags.is_compressed(),
            "header should have COMPRESSED flag"
        );

        // Decompress payload to inspect individual blocks
        let decompressed =
            crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();

        // Walk every inner frame; the loop ends at the END sentinel's `None`.
        let mut cursor = 0;
        while let Some((frame, n)) = BlockFrame::read_from(&decompressed[cursor..]).unwrap() {
            assert!(
                !frame.flags.is_compressed(),
                "individual blocks should NOT be compressed when whole-payload is active"
            );
            cursor += n;
        }
    }
1561
    #[test]
    fn full_pipeline_encode_decode_roundtrip() {
        // Exercises all features together: multiple block types,
        // summaries, priorities, per-block compression, content
        // addressing, and auto-dedup.
        let store = Arc::new(crate::MemoryContentStore::new());
        let big_code = "fn compute() -> i64 { 42 }\n".repeat(50);

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .auto_dedup()
            .add_code(Lang::Rust, "lib.rs", big_code.as_bytes())
            .with_summary("Core computation module.").unwrap()
            .with_compression().unwrap()
            .add_code(Lang::Rust, "lib.rs", big_code.as_bytes()) // auto-dedup
            .add_conversation(Role::User, b"Review this code")
            .add_tool_result("clippy", Status::Ok, b"No warnings")
            .encode()
            .unwrap();

        // Decode with the same store
        let decoded = bcp_decoder::BcpDecoder::decode_with_store(&payload, store.as_ref()).unwrap();

        // The deduped duplicate still decodes as its own block: 4 total.
        assert_eq!(decoded.blocks.len(), 4);
        assert_eq!(decoded.blocks[0].block_type, bcp_types::BlockType::Code);
        assert_eq!(
            decoded.blocks[0].summary.as_ref().unwrap().text,
            "Core computation module."
        );
        assert_eq!(decoded.blocks[1].block_type, bcp_types::BlockType::Code);
        assert_eq!(
            decoded.blocks[2].block_type,
            bcp_types::BlockType::Conversation
        );
        assert_eq!(
            decoded.blocks[3].block_type,
            bcp_types::BlockType::ToolResult
        );

        // Both code blocks should have the same content
        for block in &decoded.blocks[..2] {
            match &block.content {
                BlockContent::Code(code) => {
                    assert_eq!(code.content, big_code.as_bytes());
                }
                other => panic!("expected Code, got {other:?}"),
            }
        }
    }
1611}