bcp_encoder/encoder.rs
1use std::sync::Arc;
2
3use bcp_types::BlockContent;
4use bcp_types::annotation::AnnotationBlock;
5use bcp_types::code::CodeBlock;
6use bcp_types::content_store::ContentStore;
7use bcp_types::conversation::ConversationBlock;
8use bcp_types::diff::{DiffBlock, DiffHunk};
9use bcp_types::embedding_ref::EmbeddingRefBlock;
10use bcp_types::document::DocumentBlock;
11use bcp_types::enums::{
12 AnnotationKind, DataFormat, FormatHint, Lang, MediaType, Priority, Role, Status,
13};
14use bcp_types::extension::ExtensionBlock;
15use bcp_types::file_tree::{FileEntry, FileTreeBlock};
16use bcp_types::image::ImageBlock;
17use bcp_types::structured_data::StructuredDataBlock;
18use bcp_types::summary::Summary;
19use bcp_types::tool_result::ToolResultBlock;
20use bcp_wire::block_frame::{BlockFlags, BlockFrame, block_type};
21use bcp_wire::header::{HEADER_SIZE, HeaderFlags, BcpHeader};
22
23use crate::compression::{self, COMPRESSION_THRESHOLD};
24use crate::error::EncodeError;
25
/// Maximum block body size (16 MiB). Blocks exceeding this limit produce
/// an [`EncodeError::BlockTooLarge`] during `.encode()`.
///
/// The limit is checked after the optional summary is prepended, so the
/// summary bytes count toward the cap alongside the TLV body.
const MAX_BLOCK_BODY_SIZE: usize = 16 * 1024 * 1024;
29
/// BCP encoder — constructs a binary payload from structured blocks.
///
/// The encoder is the tool-facing API that allows agents, MCP servers,
/// and other producers to build BCP payloads. It follows the builder
/// pattern defined in RFC §5.6: methods like [`add_code`](Self::add_code),
/// [`add_conversation`](Self::add_conversation), etc. append typed blocks
/// to an internal list, and chainable modifiers like
/// [`with_summary`](Self::with_summary) and
/// [`with_priority`](Self::with_priority) annotate the most recently
/// added block.
///
/// # Compression (RFC §4.6)
///
/// Two compression modes are supported, both opt-in:
///
/// - **Per-block**: call [`with_compression`](Self::with_compression) after
///   adding a block, or [`compress_blocks`](Self::compress_blocks) to
///   enable compression for all blocks (those already added and all
///   subsequent ones). Each block body is independently zstd-compressed
///   if it is at least
///   [`COMPRESSION_THRESHOLD`](crate::compression::COMPRESSION_THRESHOLD)
///   bytes long and compression yields a size reduction. The block's
///   `COMPRESSED` flag (bit 1) is set when compression is applied.
///
/// - **Whole-payload**: call [`compress_payload`](Self::compress_payload)
///   to zstd-compress all bytes after the 8-byte header. When enabled,
///   per-block compression is skipped (whole-payload subsumes it). The
///   header's `COMPRESSED` flag (bit 0) is set.
///
/// # Content Addressing (RFC §4.7)
///
/// When a [`ContentStore`] is configured via
/// [`set_content_store`](Self::set_content_store), blocks can be stored
/// by their BLAKE3 hash rather than inline:
///
/// - **Per-block**: call [`with_content_addressing`](Self::with_content_addressing)
///   after adding a block. The body is hashed, stored in the content store,
///   and replaced with the 32-byte hash on the wire. The block's
///   `IS_REFERENCE` flag (bit 2) is set.
///
/// - **Auto-dedup**: call [`auto_dedup`](Self::auto_dedup) to automatically
///   content-address any block whose body has been seen before. First
///   occurrence is stored inline and registered in the store; subsequent
///   identical blocks become references.
///
/// Content addressing runs before compression — a 32-byte hash reference
/// is always below the compression threshold, so reference blocks are
/// never compressed.
///
/// # Usage
///
/// ```rust
/// use bcp_encoder::BcpEncoder;
/// use bcp_types::enums::{Lang, Role, Status, Priority};
///
/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
/// let payload = BcpEncoder::new()
///     .add_code(Lang::Rust, "src/main.rs", b"fn main() {}")
///     .with_summary("Entry point: CLI setup and server startup.")?
///     .with_priority(Priority::High)?
///     .add_conversation(Role::User, b"Fix the timeout bug.")
///     .add_conversation(Role::Assistant, b"I'll examine the pool config...")
///     .add_tool_result("ripgrep", Status::Ok, b"3 matches found.")
///     .encode()?;
/// # Ok(())
/// # }
/// ```
///
/// # Output layout
///
/// The `.encode()` method serializes all accumulated blocks into a
/// self-contained byte sequence:
///
/// ```text
/// ┌──────────────┬──────────────────────────────────────────┐
/// │ [8 bytes]    │ File header (magic, version, flags, rsv) │
/// │ [N bytes]    │ Block 0 frame (type + flags + len + body)│
/// │ [N bytes]    │ Block 1 frame ...                        │
/// │ ...          │                                          │
/// │ [4 bytes]    │ END sentinel (type=0xFF, flags=0, len=0) │
/// └──────────────┴──────────────────────────────────────────┘
/// ```
///
/// (The END sentinel's block type `0xFF` encodes as the two-byte varint
/// `0xFF 0x01`, followed by one flags byte and one zero-length varint.)
///
/// When whole-payload compression is enabled, the layout becomes:
///
/// ```text
/// ┌──────────────┬──────────────────────────────────────────┐
/// │ [8 bytes]    │ Header (flags bit 0 = COMPRESSED)        │
/// │ [N bytes]    │ zstd(Block 0 + Block 1 + ... + END)      │
/// └──────────────┴──────────────────────────────────────────┘
/// ```
///
/// The payload is ready for storage or transmission — no further
/// framing is required.
pub struct BcpEncoder {
    /// Blocks accumulated via the `add_*` methods, in stream order.
    blocks: Vec<PendingBlock>,
    /// Base header flags; `COMPRESSED` (bit 0) may be OR'd in during
    /// `.encode()` when whole-payload compression succeeds.
    flags: HeaderFlags,
    /// When `true`, the entire payload after the header is zstd-compressed.
    compress_payload: bool,
    /// When `true`, all blocks are individually compressed (unless
    /// `compress_payload` is also set, which takes precedence).
    compress_all_blocks: bool,
    /// Content store for BLAKE3 content-addressed deduplication.
    /// Required when any block has `content_address = true` or
    /// when `auto_dedup` is enabled.
    content_store: Option<Arc<dyn ContentStore>>,
    /// When `true`, automatically content-address any block whose body
    /// has been seen before (hash already exists in the store).
    auto_dedup: bool,
}
139
/// Internal representation of a block awaiting serialization.
///
/// Captures the block type tag, the typed content (which knows how to
/// serialize its own TLV body via [`BlockContent::encode_body`]), an
/// optional summary to prepend, and per-block compression / content
/// addressing flags.
///
/// `PendingBlock` is never exposed publicly. The encoder builds these
/// internally as the caller chains `.add_*()` and `.with_*()` methods,
/// then consumes them during `.encode()`.
struct PendingBlock {
    /// Wire type tag from [`block_type`] (e.g. `block_type::CODE`).
    block_type: u8,
    /// Typed payload; serialized via [`BlockContent::encode_body`].
    content: BlockContent,
    /// Optional summary prepended to the body (sets `HAS_SUMMARY`).
    summary: Option<String>,
    /// When `true`, this block's body should be zstd-compressed if it
    /// is at least [`COMPRESSION_THRESHOLD`] bytes long and compression
    /// yields savings.
    compress: bool,
    /// When `true`, this block's body should be replaced with its
    /// 32-byte BLAKE3 hash and stored in the content store.
    content_address: bool,
}
161
162impl BcpEncoder {
163 /// Create a new encoder with default settings (version 1.0, no flags).
164 ///
165 /// The encoder starts with an empty block list, no compression, and
166 /// no content store. At least one block must be added before calling
167 /// `.encode()`, otherwise it returns [`EncodeError::EmptyPayload`].
168 #[must_use]
169 pub fn new() -> Self {
170 Self {
171 blocks: Vec::new(),
172 flags: HeaderFlags::NONE,
173 compress_payload: false,
174 compress_all_blocks: false,
175 content_store: None,
176 auto_dedup: false,
177 }
178 }
179
180 // ── Block addition methods ──────────────────────────────────────────
181 //
182 // Each method constructs the appropriate `BlockContent` variant from
183 // `bcp-types`, wraps it in a `PendingBlock`, pushes it onto the
184 // internal list, and returns `&mut Self` for chaining.
185
186 /// Add a CODE block.
187 ///
188 /// Encodes a source code file or fragment. The `lang` enum identifies
189 /// the programming language (used by the decoder for syntax-aware
190 /// rendering), `path` is the file path (UTF-8), and `content` is the
191 /// raw source bytes.
192 ///
193 /// For partial files, use [`add_code_range`](Self::add_code_range)
194 /// to include line range metadata.
195 pub fn add_code(&mut self, lang: Lang, path: &str, content: &[u8]) -> &mut Self {
196 self.push_block(
197 block_type::CODE,
198 BlockContent::Code(CodeBlock {
199 lang,
200 path: path.to_string(),
201 content: content.to_vec(),
202 line_range: None,
203 }),
204 )
205 }
206
207 /// Add a CODE block with a line range.
208 ///
209 /// Same as [`add_code`](Self::add_code) but includes `line_start` and
210 /// `line_end` metadata (1-based, inclusive). The decoder can use this
211 /// to display line numbers or to correlate with diagnostics.
212 pub fn add_code_range(
213 &mut self,
214 lang: Lang,
215 path: &str,
216 content: &[u8],
217 line_start: u32,
218 line_end: u32,
219 ) -> &mut Self {
220 self.push_block(
221 block_type::CODE,
222 BlockContent::Code(CodeBlock {
223 lang,
224 path: path.to_string(),
225 content: content.to_vec(),
226 line_range: Some((line_start, line_end)),
227 }),
228 )
229 }
230
231 /// Add a CONVERSATION block.
232 ///
233 /// Represents a single chat turn. The `role` identifies the speaker
234 /// (system, user, assistant, or tool) and `content` is the message
235 /// body as raw bytes.
236 pub fn add_conversation(&mut self, role: Role, content: &[u8]) -> &mut Self {
237 self.push_block(
238 block_type::CONVERSATION,
239 BlockContent::Conversation(ConversationBlock {
240 role,
241 content: content.to_vec(),
242 tool_call_id: None,
243 }),
244 )
245 }
246
247 /// Add a CONVERSATION block with a tool call ID.
248 ///
249 /// Used for tool-role messages that reference a specific tool
250 /// invocation. The `tool_call_id` links this response back to the
251 /// tool call that produced it.
252 pub fn add_conversation_tool(
253 &mut self,
254 role: Role,
255 content: &[u8],
256 tool_call_id: &str,
257 ) -> &mut Self {
258 self.push_block(
259 block_type::CONVERSATION,
260 BlockContent::Conversation(ConversationBlock {
261 role,
262 content: content.to_vec(),
263 tool_call_id: Some(tool_call_id.to_string()),
264 }),
265 )
266 }
267
268 /// Add a `FILE_TREE` block.
269 ///
270 /// Represents a directory structure rooted at `root`. Each entry
271 /// contains a name, kind (file or directory), size, and optional
272 /// nested children for recursive directory trees.
273 pub fn add_file_tree(&mut self, root: &str, entries: Vec<FileEntry>) -> &mut Self {
274 self.push_block(
275 block_type::FILE_TREE,
276 BlockContent::FileTree(FileTreeBlock {
277 root_path: root.to_string(),
278 entries,
279 }),
280 )
281 }
282
283 /// Add a `TOOL_RESULT` block.
284 ///
285 /// Captures the output of an external tool invocation (e.g. ripgrep,
286 /// LSP diagnostics, test runner). The `status` indicates whether the
287 /// tool succeeded, failed, or timed out.
288 pub fn add_tool_result(&mut self, name: &str, status: Status, content: &[u8]) -> &mut Self {
289 self.push_block(
290 block_type::TOOL_RESULT,
291 BlockContent::ToolResult(ToolResultBlock {
292 tool_name: name.to_string(),
293 status,
294 content: content.to_vec(),
295 schema_hint: None,
296 }),
297 )
298 }
299
300 /// Add a DOCUMENT block.
301 ///
302 /// Represents prose content — README files, documentation, wiki pages.
303 /// The `format_hint` tells the decoder how to render the body
304 /// (markdown, plain text, or HTML).
305 pub fn add_document(
306 &mut self,
307 title: &str,
308 content: &[u8],
309 format_hint: FormatHint,
310 ) -> &mut Self {
311 self.push_block(
312 block_type::DOCUMENT,
313 BlockContent::Document(DocumentBlock {
314 title: title.to_string(),
315 content: content.to_vec(),
316 format_hint,
317 }),
318 )
319 }
320
321 /// Add a `STRUCTURED_DATA` block.
322 ///
323 /// Encodes tabular or structured content — JSON configs, YAML
324 /// manifests, TOML files, CSV data. The `format` identifies the
325 /// serialization format so the decoder can syntax-highlight or
326 /// parse appropriately.
327 pub fn add_structured_data(&mut self, format: DataFormat, content: &[u8]) -> &mut Self {
328 self.push_block(
329 block_type::STRUCTURED_DATA,
330 BlockContent::StructuredData(StructuredDataBlock {
331 format,
332 content: content.to_vec(),
333 schema: None,
334 }),
335 )
336 }
337
338 /// Add a DIFF block.
339 ///
340 /// Represents code changes for a single file — from git diffs, editor
341 /// changes, or patch files. Each hunk captures a contiguous range of
342 /// modifications in unified diff format.
343 pub fn add_diff(&mut self, path: &str, hunks: Vec<DiffHunk>) -> &mut Self {
344 self.push_block(
345 block_type::DIFF,
346 BlockContent::Diff(DiffBlock {
347 path: path.to_string(),
348 hunks,
349 }),
350 )
351 }
352
353 /// Add an ANNOTATION block.
354 ///
355 /// Annotations are metadata overlays that target another block by its
356 /// zero-based index in the stream. The `kind` determines how the
357 /// `value` payload is interpreted (priority level, summary text, or
358 /// tag label).
359 ///
360 /// For the common case of attaching a priority to the most recent
361 /// block, prefer [`with_priority`](Self::with_priority).
362 pub fn add_annotation(
363 &mut self,
364 target_block_id: u32,
365 kind: AnnotationKind,
366 value: &[u8],
367 ) -> &mut Self {
368 self.push_block(
369 block_type::ANNOTATION,
370 BlockContent::Annotation(AnnotationBlock {
371 target_block_id,
372 kind,
373 value: value.to_vec(),
374 }),
375 )
376 }
377
378 /// Add an EMBEDDING_REF block.
379 ///
380 /// Points to a pre-computed vector embedding stored externally (e.g.
381 /// in a vector database). The `vector_id` is an opaque byte identifier
382 /// for the vector in the external store, `source_hash` is the BLAKE3
383 /// hash of the content that was embedded (32 bytes), and `model` is
384 /// the name of the embedding model (e.g. `"text-embedding-3-small"`).
385 ///
386 /// # Wire type
387 ///
388 /// Block type `0x09` (`EMBEDDING_REF`). See RFC §4.4.
389 pub fn add_embedding_ref(
390 &mut self,
391 vector_id: &[u8],
392 source_hash: &[u8],
393 model: &str,
394 ) -> &mut Self {
395 self.push_block(
396 block_type::EMBEDDING_REF,
397 BlockContent::EmbeddingRef(EmbeddingRefBlock {
398 vector_id: vector_id.to_vec(),
399 source_hash: source_hash.to_vec(),
400 model: model.to_string(),
401 }),
402 )
403 }
404
405 /// Add an IMAGE block.
406 ///
407 /// Encodes an image as inline binary data. The `media_type` identifies
408 /// the image format (PNG, JPEG, etc.), `alt_text` provides a textual
409 /// description for accessibility, and `data` is the raw image bytes.
410 pub fn add_image(&mut self, media_type: MediaType, alt_text: &str, data: &[u8]) -> &mut Self {
411 self.push_block(
412 block_type::IMAGE,
413 BlockContent::Image(ImageBlock {
414 media_type,
415 alt_text: alt_text.to_string(),
416 data: data.to_vec(),
417 }),
418 )
419 }
420
421 /// Add an EXTENSION block.
422 ///
423 /// User-defined block type for custom payloads. The `namespace` and
424 /// `type_name` together form a unique identifier for the extension
425 /// type, preventing collisions across different tools and vendors.
426 pub fn add_extension(&mut self, namespace: &str, type_name: &str, content: &[u8]) -> &mut Self {
427 self.push_block(
428 block_type::EXTENSION,
429 BlockContent::Extension(ExtensionBlock {
430 namespace: namespace.to_string(),
431 type_name: type_name.to_string(),
432 content: content.to_vec(),
433 }),
434 )
435 }
436
437 // ── Modifier methods ────────────────────────────────────────────────
438 //
439 // Modifiers act on the most recently added block. They set metadata
440 // that affects how the block is serialized (summary prefix, flags)
441 // or append related blocks (priority annotation).
442
443 /// Attach a summary to the most recently added block.
444 ///
445 /// Sets the `HAS_SUMMARY` flag on the block and prepends the summary
446 /// sub-block to the body during serialization. The summary is a
447 /// compact UTF-8 description that the token budget engine can use as
448 /// a stand-in when the full block content would exceed the budget.
449 ///
450 /// # Errors
451 ///
452 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
453 /// added yet. Use this immediately after an `.add_*()` call.
454 pub fn with_summary(&mut self, summary: &str) -> Result<&mut Self, EncodeError> {
455 let block = self
456 .blocks
457 .last_mut()
458 .ok_or(EncodeError::NoBlockTarget { method: "with_summary" })?;
459 block.summary = Some(summary.to_string());
460 Ok(self)
461 }
462
463 /// Attach a priority annotation to the most recently added block.
464 ///
465 /// This is a convenience method that appends an ANNOTATION block
466 /// with `kind=Priority` targeting the last added block's index.
467 /// The annotation's value is the priority byte (e.g. `0x02` for
468 /// `Priority::High`).
469 ///
470 /// # Errors
471 ///
472 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
473 /// added yet.
474 pub fn with_priority(&mut self, priority: Priority) -> Result<&mut Self, EncodeError> {
475 let target_index = self
476 .blocks
477 .len()
478 .checked_sub(1)
479 .ok_or(EncodeError::NoBlockTarget { method: "with_priority" })?;
480
481 #[allow(clippy::cast_possible_truncation)]
482 let target_id = target_index as u32;
483
484 self.push_block(
485 block_type::ANNOTATION,
486 BlockContent::Annotation(AnnotationBlock {
487 target_block_id: target_id,
488 kind: AnnotationKind::Priority,
489 value: vec![priority.to_wire_byte()],
490 }),
491 );
492 Ok(self)
493 }
494
495 // ── Compression modifiers ────────────────────────────────────────────
496 //
497 // These methods control per-block and whole-payload zstd compression.
498 // Per-block compression is skipped when whole-payload compression is
499 // enabled — the outer zstd frame subsumes individual block compression.
500
501 /// Enable zstd compression for the most recently added block.
502 ///
503 /// During `.encode()`, the block body is compressed with zstd if it
504 /// exceeds [`COMPRESSION_THRESHOLD`] bytes and compression yields a
505 /// size reduction. If compression doesn't help (output >= input), the
506 /// body is stored uncompressed and the `COMPRESSED` flag is not set.
507 ///
508 /// Has no effect if [`compress_payload`](Self::compress_payload) is
509 /// also enabled — whole-payload compression takes precedence.
510 ///
511 /// # Errors
512 ///
513 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
514 /// added yet.
515 pub fn with_compression(&mut self) -> Result<&mut Self, EncodeError> {
516 let block = self
517 .blocks
518 .last_mut()
519 .ok_or(EncodeError::NoBlockTarget { method: "with_compression" })?;
520 block.compress = true;
521 Ok(self)
522 }
523
524 /// Enable zstd compression for all blocks added so far and all
525 /// future blocks.
526 ///
527 /// Equivalent to calling [`with_compression`](Self::with_compression)
528 /// on every block. Individual blocks still respect the size threshold
529 /// and no-savings guard.
530 pub fn compress_blocks(&mut self) -> &mut Self {
531 self.compress_all_blocks = true;
532 for block in &mut self.blocks {
533 block.compress = true;
534 }
535 self
536 }
537
538 /// Enable whole-payload zstd compression.
539 ///
540 /// When set, the entire block stream (all frames + END sentinel) is
541 /// compressed as a single zstd frame. The 8-byte header is written
542 /// uncompressed with `HeaderFlags::COMPRESSED` set so the decoder
543 /// can detect compression before reading further.
544 ///
545 /// When whole-payload compression is enabled, per-block compression
546 /// is skipped — compressing within a compressed stream adds overhead
547 /// without benefit.
548 ///
549 /// If compression doesn't reduce the total size, the payload is
550 /// stored uncompressed and the header flag is not set.
551 ///
552 /// **Tradeoff**: Whole-payload compression disables incremental
553 /// streaming in `StreamingDecoder` — the decoder must buffer and
554 /// decompress the entire payload before yielding any blocks. If
555 /// streaming is important, use [`compress_blocks`](Self::compress_blocks)
556 /// instead.
557 pub fn compress_payload(&mut self) -> &mut Self {
558 self.compress_payload = true;
559 self
560 }
561
562 // ── Content addressing modifiers ────────────────────────────────────
563 //
564 // These methods control BLAKE3 content-addressed deduplication.
565 // A content store must be configured before blocks can be
566 // content-addressed.
567
568 /// Set the content store used for BLAKE3 content addressing.
569 ///
570 /// The store is shared via `Arc` so the same store can be passed to
571 /// both the encoder and decoder for roundtrip workflows. The encoder
572 /// calls `store.put()` for each content-addressed block; the decoder
573 /// calls `store.get()` to resolve references.
574 ///
575 /// Must be called before `.encode()` if any block has content
576 /// addressing enabled or if [`auto_dedup`](Self::auto_dedup) is set.
577 pub fn set_content_store(&mut self, store: Arc<dyn ContentStore>) -> &mut Self {
578 self.content_store = Some(store);
579 self
580 }
581
582 /// Enable content addressing for the most recently added block.
583 ///
584 /// During `.encode()`, the block body is hashed with BLAKE3,
585 /// stored in the content store, and replaced with the 32-byte hash
586 /// on the wire. The block's `IS_REFERENCE` flag (bit 2) is set.
587 ///
588 /// Requires a content store — call
589 /// [`set_content_store`](Self::set_content_store) before `.encode()`.
590 ///
591 /// Content addressing runs before compression. Since a 32-byte
592 /// hash reference is always below [`COMPRESSION_THRESHOLD`],
593 /// reference blocks are never per-block compressed.
594 ///
595 /// # Errors
596 ///
597 /// Returns [`EncodeError::NoBlockTarget`] if no blocks have been
598 /// added yet.
599 pub fn with_content_addressing(&mut self) -> Result<&mut Self, EncodeError> {
600 let block = self
601 .blocks
602 .last_mut()
603 .ok_or(EncodeError::NoBlockTarget { method: "with_content_addressing" })?;
604 block.content_address = true;
605 Ok(self)
606 }
607
608 /// Enable automatic deduplication across all blocks.
609 ///
610 /// When set, the encoder hashes every block body with BLAKE3 during
611 /// `.encode()`. If the hash already exists in the content store
612 /// (i.e. a previous block in this or a prior encoding had the same
613 /// content), the block is automatically replaced with a hash
614 /// reference. First-occurrence blocks are stored inline and
615 /// registered in the store for future dedup.
616 ///
617 /// Requires a content store — call
618 /// [`set_content_store`](Self::set_content_store) before `.encode()`.
619 pub fn auto_dedup(&mut self) -> &mut Self {
620 self.auto_dedup = true;
621 self
622 }
623
624 // ── Encode ──────────────────────────────────────────────────────────
625
    /// Serialize all accumulated blocks into a complete BCP payload.
    ///
    /// The encode pipeline processes each `PendingBlock` through up to
    /// three stages:
    ///
    /// 1. **Serialize** — calls [`BlockContent::encode_body`] to get
    ///    the TLV-encoded body bytes. If a summary is present, it is
    ///    prepended and the `HAS_SUMMARY` flag is set.
    ///
    /// 2. **Content address** (optional) — if the block has
    ///    `content_address = true` or auto-dedup detects a duplicate,
    ///    the body is hashed with BLAKE3, stored in the content store,
    ///    and replaced with the 32-byte hash. The `IS_REFERENCE` flag
    ///    (bit 2) is set.
    ///
    /// 3. **Per-block compress** (optional) — if compression is enabled
    ///    for this block, whole-payload compression is NOT active, and
    ///    the body is not a reference, the body is zstd-compressed if
    ///    it is at least [`COMPRESSION_THRESHOLD`] bytes long and
    ///    compression yields savings. The `COMPRESSED` flag (bit 1) is
    ///    set.
    ///
    /// After all blocks, the END sentinel is appended. If whole-payload
    /// compression is enabled, everything after the 8-byte header is
    /// compressed as a single zstd frame and the header's `COMPRESSED`
    /// flag is set.
    ///
    /// # Errors
    ///
    /// - [`EncodeError::EmptyPayload`] if no blocks have been added.
    /// - [`EncodeError::BlockTooLarge`] if any block body exceeds 16 MiB.
    /// - [`EncodeError::MissingContentStore`] if content addressing is
    ///   requested but no store has been configured.
    /// - [`EncodeError::Wire`] if the underlying wire serialization fails.
    /// - [`EncodeError::Io`] if writing to the output buffer fails.
    pub fn encode(&self) -> Result<Vec<u8>, EncodeError> {
        if self.blocks.is_empty() {
            return Err(EncodeError::EmptyPayload);
        }

        // Validate up front: if any block needs content addressing or
        // auto_dedup is enabled, a store must be present. Failing here
        // avoids a partially-built payload.
        let needs_store = self.auto_dedup || self.blocks.iter().any(|b| b.content_address);
        if needs_store && self.content_store.is_none() {
            return Err(EncodeError::MissingContentStore);
        }

        // Pre-allocate: 8 bytes header + estimated block data + END sentinel.
        // (256 bytes per block is only a capacity heuristic; the Vec grows
        // as needed and the estimate never affects correctness.)
        let estimated_size = HEADER_SIZE + self.blocks.len() * 256 + 3;
        let mut output = Vec::with_capacity(estimated_size);

        // 1. Write a placeholder header (flags may be updated for whole-payload).
        //    The real header is written over these zero bytes in step 5,
        //    once the final flag state is known.
        output.resize(HEADER_SIZE, 0);

        // 2. Serialize each pending block through the encode pipeline.
        for pending in &self.blocks {
            let mut body = Self::serialize_block_body(pending)?;
            let mut flags_raw = 0u8;

            if pending.summary.is_some() {
                flags_raw |= BlockFlags::HAS_SUMMARY.raw();
            }

            // Stage 2: Content addressing (runs before compression).
            let is_reference = self.apply_content_addressing(pending, &mut body)?;
            if is_reference {
                flags_raw |= BlockFlags::IS_REFERENCE.raw();
            }

            // Stage 3: Per-block compression (skipped for references and
            // when whole-payload compression is active). `compress`
            // returns None when compression yields no size reduction,
            // in which case the body stays uncompressed and the flag
            // stays clear.
            if !is_reference && !self.compress_payload {
                let should_compress = pending.compress || self.compress_all_blocks;
                if should_compress && body.len() >= COMPRESSION_THRESHOLD {
                    if let Some(compressed) = compression::compress(&body) {
                        body = compressed;
                        flags_raw |= BlockFlags::COMPRESSED.raw();
                    }
                }
            }

            let frame = BlockFrame {
                block_type: pending.block_type,
                flags: BlockFlags::from_raw(flags_raw),
                body,
            };
            frame.write_to(&mut output)?;
        }

        // 3. Write the END sentinel (zero-length frame of type 0xFF).
        let end_frame = BlockFrame {
            block_type: block_type::END,
            flags: BlockFlags::NONE,
            body: Vec::new(),
        };
        end_frame.write_to(&mut output)?;

        // 4. Whole-payload compression: compress everything after the header.
        //    On a `None` result (no savings) the payload is left as-is and
        //    the header flag is not set, so decoders see a plain stream.
        let header_flags = if self.compress_payload {
            let block_data = &output[HEADER_SIZE..];
            match compression::compress(block_data) {
                Some(compressed) => {
                    output.truncate(HEADER_SIZE);
                    output.extend_from_slice(&compressed);
                    HeaderFlags::from_raw(self.flags.raw() | HeaderFlags::COMPRESSED.raw())
                }
                None => self.flags,
            }
        } else {
            self.flags
        };

        // 5. Overwrite the placeholder with the final header and flags.
        let header = BcpHeader::new(header_flags);
        header.write_to(&mut output[..HEADER_SIZE])?;

        Ok(output)
    }
743
744 // ── Internal helpers ────────────────────────────────────────────────
745
746 /// Push a new `PendingBlock` onto the internal list.
747 ///
748 /// If `compress_all_blocks` is set, the new block inherits
749 /// `compress = true` automatically.
750 ///
751 /// Returns `&mut Self` so callers can chain additional methods.
752 fn push_block(&mut self, block_type: u8, content: BlockContent) -> &mut Self {
753 self.blocks.push(PendingBlock {
754 block_type,
755 content,
756 summary: None,
757 compress: self.compress_all_blocks,
758 content_address: false,
759 });
760 self
761 }
762
763 /// Apply content addressing to a block body if requested.
764 ///
765 /// Returns `true` if the body was replaced with a 32-byte hash
766 /// reference, `false` if the body is unchanged (inline).
767 ///
768 /// Two paths trigger content addressing:
769 /// 1. `pending.content_address == true` — always replace with hash.
770 /// 2. `self.auto_dedup == true` — replace only if the hash already
771 /// exists in the store (i.e. a duplicate). First occurrence is
772 /// stored inline and registered for future dedup.
773 fn apply_content_addressing(
774 &self,
775 pending: &PendingBlock,
776 body: &mut Vec<u8>,
777 ) -> Result<bool, EncodeError> {
778 let store = match &self.content_store {
779 Some(s) => s,
780 None => return Ok(false),
781 };
782
783 if pending.content_address {
784 // Explicit content addressing: always replace with hash.
785 let hash = store.put(body);
786 *body = hash.to_vec();
787 return Ok(true);
788 }
789
790 if self.auto_dedup {
791 // Auto-dedup: check if this body was seen before.
792 let hash: [u8; 32] = blake3::hash(body).into();
793 if store.contains(&hash) {
794 // Duplicate — replace with reference.
795 *body = hash.to_vec();
796 return Ok(true);
797 }
798 // First occurrence — store for future dedup, keep inline.
799 store.put(body);
800 }
801
802 Ok(false)
803 }
804
805 /// Serialize a `PendingBlock` into its final body bytes.
806 ///
807 /// If the block has a summary, the summary is encoded first (as a
808 /// length-prefixed UTF-8 string) followed by the TLV body fields.
809 /// This matches the wire convention: when `HAS_SUMMARY` is set, the
810 /// summary occupies the front of the body, before any TLV fields.
811 fn serialize_block_body(pending: &PendingBlock) -> Result<Vec<u8>, EncodeError> {
812 let tlv_body = pending.content.encode_body();
813 let mut body = Vec::new();
814
815 if let Some(ref summary_text) = pending.summary {
816 let summary = Summary {
817 text: summary_text.clone(),
818 };
819 summary.encode(&mut body);
820 }
821
822 body.extend_from_slice(&tlv_body);
823
824 if body.len() > MAX_BLOCK_BODY_SIZE {
825 return Err(EncodeError::BlockTooLarge {
826 size: body.len(),
827 limit: MAX_BLOCK_BODY_SIZE,
828 });
829 }
830
831 Ok(body)
832 }
833}
834
835impl Default for BcpEncoder {
836 fn default() -> Self {
837 Self::new()
838 }
839}
840
841#[cfg(test)]
842mod tests {
843 use super::*;
844 use bcp_types::file_tree::FileEntryKind;
845 use bcp_wire::header::BCP_MAGIC;
846
847 // ── Helper ──────────────────────────────────────────────────────────
848
849 /// Verify that a payload starts with the BCP magic number.
850 fn assert_starts_with_magic(payload: &[u8]) {
851 assert!(payload.len() >= HEADER_SIZE, "payload too short for header");
852 assert_eq!(&payload[..4], &BCP_MAGIC, "missing BCP magic");
853 }
854
855 /// Verify that a payload ends with a valid END sentinel.
856 ///
857 /// The END sentinel is: block_type=0xFF as varint (2 bytes: 0xFF 0x01),
858 /// flags=0x00, content_len=0 as varint (1 byte: 0x00).
859 fn assert_ends_with_end_sentinel(payload: &[u8]) {
860 // The END block type 0xFF encodes as varint [0xFF, 0x01],
861 // followed by flags byte 0x00, followed by content_len varint 0x00.
862 let tail = &payload[payload.len() - 4..];
863 assert_eq!(tail, &[0xFF, 0x01, 0x00, 0x00], "missing END sentinel");
864 }
865
866 // ── Acceptance criteria tests ───────────────────────────────────────
867
868 #[test]
869 fn encode_single_code_block_produces_valid_magic() {
870 let payload = BcpEncoder::new()
871 .add_code(Lang::Rust, "src/main.rs", b"fn main() {}")
872 .encode()
873 .unwrap();
874
875 assert_starts_with_magic(&payload);
876 }
877
878 #[test]
879 fn builder_methods_are_chainable() {
880 let payload = BcpEncoder::new()
881 .add_code(Lang::Rust, "src/lib.rs", b"pub fn hello() {}")
882 .with_summary("Hello function.").unwrap()
883 .add_conversation(Role::User, b"What does this do?")
884 .encode()
885 .unwrap();
886
887 assert_starts_with_magic(&payload);
888 assert_ends_with_end_sentinel(&payload);
889 }
890
    #[test]
    fn with_summary_sets_has_summary_flag() {
        // with_summary must set the frame-level HAS_SUMMARY flag on the
        // most recently added block.
        let payload = BcpEncoder::new()
            .add_code(Lang::Python, "main.py", b"print('hi')")
            .with_summary("Prints a greeting.").unwrap()
            .encode()
            .unwrap();

        // Parse: skip the 8-byte header, read the first block frame.
        let frame_buf = &payload[HEADER_SIZE..];
        let (frame, _) = BlockFrame::read_from(frame_buf).unwrap().unwrap();
        assert!(
            frame.flags.has_summary(),
            "HAS_SUMMARY flag should be set on the code block"
        );
    }
907
    #[test]
    fn with_priority_appends_annotation_block() {
        // with_priority emits a separate ANNOTATION block that targets the
        // previously added block by id.
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "lib.rs", b"// code")
            .with_priority(Priority::High).unwrap()
            .encode()
            .unwrap();

        // Parse: header + first block (CODE) + second block (ANNOTATION) + END
        let mut cursor = HEADER_SIZE;

        // Block 0: CODE
        let (frame0, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert_eq!(frame0.block_type, block_type::CODE);
        cursor += n;

        // Block 1: ANNOTATION (priority)
        let (frame1, _) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert_eq!(frame1.block_type, block_type::ANNOTATION);

        // Decode the annotation body and verify it targets block 0
        let annotation = AnnotationBlock::decode_body(&frame1.body).unwrap();
        assert_eq!(annotation.target_block_id, 0);
        assert_eq!(annotation.kind, AnnotationKind::Priority);
        assert_eq!(annotation.value, vec![Priority::High.to_wire_byte()]);
    }
934
935 #[test]
936 fn empty_encoder_returns_empty_payload_error() {
937 let result = BcpEncoder::new().encode();
938 assert!(matches!(result, Err(EncodeError::EmptyPayload)));
939 }
940
941 #[test]
942 fn payload_ends_with_end_sentinel() {
943 let payload = BcpEncoder::new()
944 .add_conversation(Role::User, b"hello")
945 .encode()
946 .unwrap();
947
948 assert_ends_with_end_sentinel(&payload);
949 }
950
    #[test]
    fn all_eleven_block_types_encode_without_error() {
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .add_conversation(Role::User, b"hello")
            .add_file_tree(
                "/project",
                vec![FileEntry {
                    name: "lib.rs".to_string(),
                    kind: FileEntryKind::File,
                    size: 100,
                    children: vec![],
                }],
            )
            .add_tool_result("rg", Status::Ok, b"found 3 matches")
            .add_document("README", b"# Title", FormatHint::Markdown)
            .add_structured_data(DataFormat::Json, b"{\"key\": \"value\"}")
            .add_diff(
                "src/lib.rs",
                vec![DiffHunk {
                    old_start: 1,
                    new_start: 1,
                    lines: b"+new line\n".to_vec(),
                }],
            )
            .add_annotation(0, AnnotationKind::Tag, b"important")
            .add_embedding_ref(b"vec-001", &[0xAB; 32], "text-embedding-3-small")
            .add_image(MediaType::Png, "screenshot", b"\x89PNG\r\n")
            .add_extension("myco", "custom_block", b"custom data")
            .encode()
            .unwrap();

        assert_starts_with_magic(&payload);
        assert_ends_with_end_sentinel(&payload);

        // Walk all 11 content blocks — one per add_* call above (10
        // non-annotation blocks + 1 explicit annotation). The END
        // sentinel yields `None` and terminates the walk.
        let mut cursor = HEADER_SIZE;
        let mut block_count = 0;
        loop {
            match BlockFrame::read_from(&payload[cursor..]).unwrap() {
                Some((_, n)) => {
                    cursor += n;
                    block_count += 1;
                }
                None => break, // END sentinel
            }
        }
        assert_eq!(block_count, 11, "expected 11 content blocks");
    }
1000
1001 #[test]
1002 fn payload_byte_length_matches_calculation() {
1003 let mut enc = BcpEncoder::new();
1004 enc.add_code(Lang::Rust, "x.rs", b"let x = 1;");
1005 enc.add_conversation(Role::User, b"hi");
1006
1007 let payload = enc.encode().unwrap();
1008
1009 // Calculate expected size manually:
1010 // Header: 8 bytes
1011 let mut expected = HEADER_SIZE;
1012
1013 // Walk actual frames to verify
1014 let mut cursor = HEADER_SIZE;
1015 loop {
1016 let remaining = &payload[cursor..];
1017 // Try to read a frame (including END which returns None)
1018 let start = cursor;
1019 match BlockFrame::read_from(remaining).unwrap() {
1020 Some((_, n)) => {
1021 cursor += n;
1022 expected += n;
1023 }
1024 None => {
1025 // END sentinel was consumed — count those bytes too
1026 let end_bytes = payload.len() - start;
1027 expected += end_bytes;
1028 break;
1029 }
1030 }
1031 }
1032
1033 assert_eq!(
1034 payload.len(),
1035 expected,
1036 "payload length should match header + frames + END"
1037 );
1038 }
1039
    #[test]
    fn optional_fields_omitted_when_none() {
        // Optional TLV fields that were never set must decode back as
        // `None`, not as empty values.

        // CODE block without line_range
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "x.rs", b"code")
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        // Decode the body and verify line_range is None
        let code = CodeBlock::decode_body(&frame.body).unwrap();
        assert!(code.line_range.is_none());

        // CONVERSATION block without tool_call_id
        let payload = BcpEncoder::new()
            .add_conversation(Role::User, b"msg")
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        let conv = ConversationBlock::decode_body(&frame.body).unwrap();
        assert!(conv.tool_call_id.is_none());
    }
1069
1070 #[test]
1071 fn code_range_includes_line_numbers() {
1072 let payload = BcpEncoder::new()
1073 .add_code_range(Lang::Rust, "src/lib.rs", b"fn foo() {}", 10, 20)
1074 .encode()
1075 .unwrap();
1076
1077 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1078 .unwrap()
1079 .unwrap();
1080
1081 let code = CodeBlock::decode_body(&frame.body).unwrap();
1082 assert_eq!(code.line_range, Some((10, 20)));
1083 }
1084
1085 #[test]
1086 fn conversation_tool_includes_tool_call_id() {
1087 let payload = BcpEncoder::new()
1088 .add_conversation_tool(Role::Tool, b"result", "call_123")
1089 .encode()
1090 .unwrap();
1091
1092 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1093 .unwrap()
1094 .unwrap();
1095
1096 let conv = ConversationBlock::decode_body(&frame.body).unwrap();
1097 assert_eq!(conv.tool_call_id.as_deref(), Some("call_123"));
1098 }
1099
    #[test]
    fn summary_is_decodable_from_block_body() {
        // A summary is prepended to the block body (not stored in its own
        // frame): Summary first, then the normal TLV body.
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .with_summary("Entry point for the application.").unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();

        assert!(frame.flags.has_summary());

        // Decode summary from the front of the body
        let (summary, consumed) = Summary::decode(&frame.body).unwrap();
        assert_eq!(summary.text, "Entry point for the application.");

        // Remaining bytes should decode as a valid CodeBlock
        let code = CodeBlock::decode_body(&frame.body[consumed..]).unwrap();
        assert_eq!(code.path, "main.rs");
        assert_eq!(code.content, b"fn main() {}");
    }
1123
    #[test]
    fn rfc_example_encodes_successfully() {
        // Reproduces the example from RFC §12.1 / SPEC_03 §1
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "src/main.rs", b"fn main() { todo!() }")
            .with_summary("Entry point: CLI setup and server startup.").unwrap()
            .with_priority(Priority::High).unwrap()
            .add_conversation(Role::User, b"Fix the timeout bug.")
            .add_conversation(Role::Assistant, b"I'll examine the pool config...")
            .add_tool_result("ripgrep", Status::Ok, b"3 matches found.")
            .encode()
            .unwrap();

        assert_starts_with_magic(&payload);
        assert_ends_with_end_sentinel(&payload);

        // Walk all frames to verify structure
        let mut cursor = HEADER_SIZE;
        let mut types = Vec::new();
        loop {
            match BlockFrame::read_from(&payload[cursor..]).unwrap() {
                Some((frame, n)) => {
                    types.push(frame.block_type);
                    cursor += n;
                }
                None => break,
            }
        }

        // The summary rides inside the CODE frame (no frame of its own),
        // while with_priority contributes a standalone ANNOTATION frame.
        assert_eq!(
            types,
            vec![
                block_type::CODE,
                block_type::ANNOTATION, // from with_priority
                block_type::CONVERSATION,
                block_type::CONVERSATION,
                block_type::TOOL_RESULT,
            ]
        );
    }
1164
1165 #[test]
1166 fn default_impl_matches_new() {
1167 let from_new = BcpEncoder::new();
1168 let from_default = BcpEncoder::default();
1169 assert!(from_new.blocks.is_empty());
1170 assert!(from_default.blocks.is_empty());
1171 }
1172
1173 // ── Per-block compression tests ─────────────────────────────────────
1174
    #[test]
    fn per_block_compression_sets_compressed_flag() {
        // Create a large, compressible block (exceeds COMPRESSION_THRESHOLD)
        let big_content = "fn main() { println!(\"hello world\"); }\n".repeat(50);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(
            frame.flags.is_compressed(),
            "COMPRESSED flag should be set on a large compressible block"
        );
        // The compressed body (TLV overhead included) must still come in
        // under the raw source length for such repetitive input.
        assert!(
            frame.body.len() < big_content.len(),
            "compressed body should be smaller than original"
        );
    }
1197
1198 #[test]
1199 fn small_block_not_compressed_even_when_requested() {
1200 let payload = BcpEncoder::new()
1201 .add_code(Lang::Rust, "x.rs", b"let x = 1;")
1202 .with_compression().unwrap()
1203 .encode()
1204 .unwrap();
1205
1206 let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
1207 .unwrap()
1208 .unwrap();
1209 assert!(
1210 !frame.flags.is_compressed(),
1211 "small blocks should not be compressed (below threshold)"
1212 );
1213 }
1214
    #[test]
    fn compress_blocks_applies_to_all() {
        // compress_blocks() requests per-block compression for all blocks,
        // not just the most recently added one.
        let big_content = "use std::io;\n".repeat(100);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "a.rs", big_content.as_bytes())
            .add_code(Lang::Rust, "b.rs", big_content.as_bytes())
            .compress_blocks()
            .encode()
            .unwrap();

        // Both frames must carry the COMPRESSED flag.
        let mut cursor = HEADER_SIZE;
        for _ in 0..2 {
            let (frame, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
            assert!(
                frame.flags.is_compressed(),
                "all blocks should be compressed with compress_blocks()"
            );
            cursor += n;
        }
    }
1235
1236 // ── Whole-payload compression tests ─────────────────────────────────
1237
1238 #[test]
1239 fn whole_payload_compression_sets_header_flag() {
1240 let big_content = "pub fn hello() -> &'static str { \"world\" }\n".repeat(100);
1241 let payload = BcpEncoder::new()
1242 .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
1243 .compress_payload()
1244 .encode()
1245 .unwrap();
1246
1247 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1248 assert!(
1249 header.flags.is_compressed(),
1250 "header COMPRESSED flag should be set for whole-payload compression"
1251 );
1252 }
1253
1254 #[test]
1255 fn whole_payload_skips_per_block_compression() {
1256 // When whole-payload compression is active, individual block
1257 // COMPRESSED flags should NOT be set.
1258 let big_content = "pub fn hello() -> &'static str { \"world\" }\n".repeat(100);
1259 let payload = BcpEncoder::new()
1260 .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
1261 .with_compression().unwrap()
1262 .compress_payload()
1263 .encode()
1264 .unwrap();
1265
1266 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1267 assert!(header.flags.is_compressed());
1268
1269 // Decompress the payload to check individual blocks
1270 let decompressed =
1271 crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();
1272
1273 let (frame, _) = BlockFrame::read_from(&decompressed).unwrap().unwrap();
1274 assert!(
1275 !frame.flags.is_compressed(),
1276 "per-block COMPRESSED flag should not be set when whole-payload is active"
1277 );
1278 }
1279
1280 #[test]
1281 fn whole_payload_no_savings_stays_uncompressed() {
1282 // Tiny payload — zstd overhead exceeds savings
1283 let payload = BcpEncoder::new()
1284 .add_code(Lang::Rust, "x.rs", b"x")
1285 .compress_payload()
1286 .encode()
1287 .unwrap();
1288
1289 let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
1290 assert!(
1291 !header.flags.is_compressed(),
1292 "header COMPRESSED flag should NOT be set when compression yields no savings"
1293 );
1294 }
1295
1296 // ── Content addressing tests ────────────────────────────────────────
1297
    #[test]
    fn content_addressing_sets_reference_flag() {
        // A content-addressed block carries only its 32-byte hash in the
        // frame body; the full content lives in the store.
        let store = Arc::new(crate::MemoryContentStore::new());
        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .add_code(Lang::Rust, "main.rs", b"fn main() {}")
            .with_content_addressing().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(
            frame.flags.is_reference(),
            "IS_REFERENCE flag should be set on content-addressed block"
        );
        assert_eq!(
            frame.body.len(),
            32,
            "reference block body should be exactly 32 bytes (BLAKE3 hash)"
        );

        // The hash should resolve in the store
        let hash: [u8; 32] = frame.body.try_into().unwrap();
        assert!(store.contains(&hash));
    }
1325
1326 #[test]
1327 fn content_addressing_without_store_errors() {
1328 let result = BcpEncoder::new()
1329 .add_code(Lang::Rust, "main.rs", b"fn main() {}")
1330 .with_content_addressing().unwrap()
1331 .encode();
1332
1333 assert!(
1334 matches!(result, Err(EncodeError::MissingContentStore)),
1335 "should error when content addressing is requested without a store"
1336 );
1337 }
1338
    #[test]
    fn auto_dedup_detects_duplicate_blocks() {
        let store = Arc::new(crate::MemoryContentStore::new());
        // Both blocks must have identical serialized TLV bodies (same
        // path + content + lang) for auto-dedup to detect a duplicate.
        let content = b"fn main() {}";

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .auto_dedup()
            .add_code(Lang::Rust, "main.rs", content)
            .add_code(Lang::Rust, "main.rs", content) // identical TLV body
            .encode()
            .unwrap();

        let mut cursor = HEADER_SIZE;

        // First block: inline (first occurrence)
        let (frame0, n) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert!(
            !frame0.flags.is_reference(),
            "first occurrence should be stored inline"
        );
        cursor += n;

        // Second block: reference (duplicate) — body is the 32-byte hash
        let (frame1, _) = BlockFrame::read_from(&payload[cursor..]).unwrap().unwrap();
        assert!(
            frame1.flags.is_reference(),
            "duplicate should become a hash reference"
        );
        assert_eq!(frame1.body.len(), 32);
    }
1372
1373 #[test]
1374 fn auto_dedup_without_store_errors() {
1375 let result = BcpEncoder::new()
1376 .auto_dedup()
1377 .add_code(Lang::Rust, "x.rs", b"code")
1378 .encode();
1379
1380 assert!(matches!(result, Err(EncodeError::MissingContentStore)));
1381 }
1382
    #[test]
    fn reference_block_not_per_block_compressed() {
        // Content-addressed blocks produce 32-byte bodies which are
        // below the compression threshold — verify no COMPRESSED flag.
        // Both chained modifiers target the same (most recent) block.
        let store = Arc::new(crate::MemoryContentStore::new());
        let big_content = "fn main() { println!(\"hello\"); }\n".repeat(50);
        let payload = BcpEncoder::new()
            .set_content_store(store)
            .add_code(Lang::Rust, "main.rs", big_content.as_bytes())
            .with_content_addressing().unwrap()
            .with_compression().unwrap()
            .encode()
            .unwrap();

        let (frame, _) = BlockFrame::read_from(&payload[HEADER_SIZE..])
            .unwrap()
            .unwrap();
        assert!(frame.flags.is_reference());
        assert!(
            !frame.flags.is_compressed(),
            "reference blocks should not be per-block compressed"
        );
    }
1406
    #[test]
    fn content_addressing_with_whole_payload_compression() {
        // Reference blocks CAN be wrapped in whole-payload compression.
        let store = Arc::new(crate::MemoryContentStore::new());
        // Same path + content = identical TLV body = single store entry
        let content = "fn main() { println!(\"hello\"); }\n".repeat(50);

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .compress_payload()
            .add_code(Lang::Rust, "main.rs", content.as_bytes())
            .with_content_addressing().unwrap()
            .add_code(Lang::Rust, "main.rs", content.as_bytes())
            .with_content_addressing().unwrap()
            .encode()
            .unwrap();

        let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
        // The payload might or might not be compressed (two 32-byte hashes
        // plus framing may not compress well), but if it is, verify it's valid.
        if header.flags.is_compressed() {
            let decompressed =
                crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();

            // The first inner frame should still be a 32-byte hash reference.
            let (frame, _) = BlockFrame::read_from(&decompressed).unwrap().unwrap();
            assert!(frame.flags.is_reference());
            assert_eq!(frame.body.len(), 32);
        }

        // Both blocks have identical TLV bodies → single store entry
        assert_eq!(
            store.len(),
            1,
            "identical blocks should produce one store entry"
        );
    }
1443
1444 // ── Phase 4: Cross-cutting tests ────────────────────────────────────
1445
    #[test]
    fn compression_ratio_benchmark() {
        // A realistic 50-line Rust file should compress by >= 20%.
        let rust_code = r#"use std::collections::HashMap;
use std::sync::Arc;

pub struct Config {
    pub name: String,
    pub values: HashMap<String, String>,
    pub timeout: u64,
}

impl Config {
    pub fn new(name: &str) -> Self {
        Self {
            name: name.to_string(),
            values: HashMap::new(),
            timeout: 30,
        }
    }

    pub fn set(&mut self, key: &str, value: &str) {
        self.values.insert(key.to_string(), value.to_string());
    }

    pub fn get(&self, key: &str) -> Option<&String> {
        self.values.get(key)
    }

    pub fn timeout(&self) -> u64 {
        self.timeout
    }
}

impl Default for Config {
    fn default() -> Self {
        Self::new("default")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_new_config() {
        let config = Config::new("test");
        assert_eq!(config.name, "test");
        assert!(config.values.is_empty());
        assert_eq!(config.timeout(), 30);
    }

    #[test]
    fn test_set_and_get() {
        let mut config = Config::new("test");
        config.set("key", "value");
        assert_eq!(config.get("key"), Some(&"value".to_string()));
    }
}
"#;

        let uncompressed_payload = BcpEncoder::new()
            .add_code(Lang::Rust, "config.rs", rust_code.as_bytes())
            .encode()
            .unwrap();

        let compressed_payload = BcpEncoder::new()
            .add_code(Lang::Rust, "config.rs", rust_code.as_bytes())
            .with_compression().unwrap()
            .encode()
            .unwrap();

        // Savings are measured over the full payload (header and frame
        // overhead included), not just the block body.
        let savings_pct =
            100.0 * (1.0 - compressed_payload.len() as f64 / uncompressed_payload.len() as f64);

        assert!(
            savings_pct >= 20.0,
            "expected >= 20% compression savings on a 50-line Rust file, got {savings_pct:.1}%"
        );
    }
1526
    #[test]
    fn whole_payload_wins_over_per_block() {
        // When both per-block and whole-payload compression are requested,
        // only the header COMPRESSED flag should be set; individual blocks
        // should NOT have their COMPRESSED flags set.
        let big_content = "pub fn process() -> Result<(), Error> { Ok(()) }\n".repeat(50);
        let payload = BcpEncoder::new()
            .add_code(Lang::Rust, "a.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .add_code(Lang::Rust, "b.rs", big_content.as_bytes())
            .with_compression().unwrap()
            .compress_payload()
            .encode()
            .unwrap();

        let header = BcpHeader::read_from(&payload[..HEADER_SIZE]).unwrap();
        assert!(
            header.flags.is_compressed(),
            "header should have COMPRESSED flag"
        );

        // Decompress payload to inspect individual blocks
        let decompressed =
            crate::compression::decompress(&payload[HEADER_SIZE..], 16 * 1024 * 1024).unwrap();

        // Walk every inner frame; the loop ends at the END sentinel's `None`.
        let mut cursor = 0;
        while let Some((frame, n)) = BlockFrame::read_from(&decompressed[cursor..]).unwrap() {
            assert!(
                !frame.flags.is_compressed(),
                "individual blocks should NOT be compressed when whole-payload is active"
            );
            cursor += n;
        }
    }
1561
    #[test]
    fn full_pipeline_encode_decode_roundtrip() {
        // Exercises all features together: multiple block types,
        // summaries, priorities, per-block compression, content
        // addressing, and auto-dedup.
        let store = Arc::new(crate::MemoryContentStore::new());
        let big_code = "fn compute() -> i64 { 42 }\n".repeat(50);

        let payload = BcpEncoder::new()
            .set_content_store(store.clone())
            .auto_dedup()
            .add_code(Lang::Rust, "lib.rs", big_code.as_bytes())
            .with_summary("Core computation module.").unwrap()
            .with_compression().unwrap()
            .add_code(Lang::Rust, "lib.rs", big_code.as_bytes()) // auto-dedup
            .add_conversation(Role::User, b"Review this code")
            .add_tool_result("clippy", Status::Ok, b"No warnings")
            .encode()
            .unwrap();

        // Decode with the same store
        let decoded = bcp_decoder::BcpDecoder::decode_with_store(&payload, store.as_ref()).unwrap();

        // The deduped duplicate still decodes as its own block: 4 total.
        assert_eq!(decoded.blocks.len(), 4);
        assert_eq!(decoded.blocks[0].block_type, bcp_types::BlockType::Code);
        assert_eq!(
            decoded.blocks[0].summary.as_ref().unwrap().text,
            "Core computation module."
        );
        assert_eq!(decoded.blocks[1].block_type, bcp_types::BlockType::Code);
        assert_eq!(
            decoded.blocks[2].block_type,
            bcp_types::BlockType::Conversation
        );
        assert_eq!(
            decoded.blocks[3].block_type,
            bcp_types::BlockType::ToolResult
        );

        // Both code blocks should have the same content
        for block in &decoded.blocks[..2] {
            match &block.content {
                BlockContent::Code(code) => {
                    assert_eq!(code.content, big_code.as_bytes());
                }
                other => panic!("expected Code, got {other:?}"),
            }
        }
    }
1611}