harn_vm/orchestration/
assemble.rs

1//! Adaptive context assembly with cross-artifact microcompaction.
2//!
3//! `assemble_context` packs a set of artifacts into a token-budgeted slice
4//! of chunks, deduplicating overlap across artifacts, snipping oversized
5//! entries into chunked form, and returning an observability record that
6//! names why each chunk was included or dropped.
7//!
8//! The core is intentionally deterministic: given the same input artifacts
9//! and options, it produces the same chunk ids and ordering. A host-supplied
10//! ranker callback is the only non-deterministic hook; the VM-side binding
11//! invokes it via the same pattern as `compress_callback`.
12
13use std::collections::BTreeSet;
14
15use sha2::{Digest, Sha256};
16
17use super::ArtifactRecord;
18use crate::stdlib::xml::escape_xml_text;
19
/// Ordering policy applied to candidate chunks before they are packed
/// into the token budget.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleStrategy {
    /// Newest artifact `created_at` first; ties fall back to chunk index.
    Recency,
    /// Highest ranker score first. Without a host-supplied ranker the
    /// score is the keyword overlap between the chunk and `query`.
    Relevance,
    /// Take one chunk per artifact, in artifact input order, and keep
    /// cycling so that every artifact gets a turn before the budget fills.
    RoundRobin,
}

impl AssembleStrategy {
    /// Parses the wire name of a strategy (`"recency"`, `"relevance"` or
    /// `"round_robin"`).
    ///
    /// # Errors
    ///
    /// Returns a human-readable message naming the accepted values when
    /// `value` matches none of them. Matching is exact and case-sensitive.
    pub fn parse(value: &str) -> Result<Self, String> {
        // Reuse `as_str` as the single source of truth for the wire names.
        [Self::Recency, Self::Relevance, Self::RoundRobin]
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == value)
            .ok_or_else(|| {
                format!(
                    "assemble_context: strategy must be one of recency | relevance | round_robin (got {value:?})"
                )
            })
    }

    /// Returns the wire name accepted by [`AssembleStrategy::parse`].
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::RoundRobin => "round_robin",
            Self::Relevance => "relevance",
            Self::Recency => "recency",
        }
    }
}
53
/// Duplicate-detection mode applied to candidate chunks before scoring.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleDedup {
    /// Keep every chunk, duplicates included.
    None,
    /// Compare whitespace-normalized chunk text; later exact duplicates
    /// are dropped.
    Chunked,
    /// Shingle-based overlap detection: a chunk is a duplicate when its
    /// trigram Jaccard similarity with an already kept chunk reaches
    /// `AssembleOptions::semantic_overlap` (0.85 by default). Fully
    /// deterministic — no embeddings or callback required.
    Semantic,
}

impl AssembleDedup {
    /// Parses the wire name of a dedup mode (`"none"`, `"chunked"` or
    /// `"semantic"`).
    ///
    /// # Errors
    ///
    /// Returns a human-readable message naming the accepted values when
    /// `value` matches none of them. Matching is exact and case-sensitive.
    pub fn parse(value: &str) -> Result<Self, String> {
        // Reuse `as_str` as the single source of truth for the wire names.
        [Self::None, Self::Chunked, Self::Semantic]
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == value)
            .ok_or_else(|| {
                format!(
                    "assemble_context: dedup must be one of none | chunked | semantic (got {value:?})"
                )
            })
    }

    /// Returns the wire name accepted by [`AssembleDedup::parse`].
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Semantic => "semantic",
            Self::Chunked => "chunked",
            Self::None => "none",
        }
    }
}
85
/// Tunable knobs for a single `assemble_context` call.
#[derive(Clone, Debug)]
pub struct AssembleOptions {
    /// Upper bound on the summed `estimated_tokens` of the selected chunks.
    pub budget_tokens: usize,
    /// Duplicate-detection mode applied before scoring and packing.
    pub dedup: AssembleDedup,
    /// Ordering policy used when filling the budget.
    pub strategy: AssembleStrategy,
    /// Query text for the built-in keyword-overlap ranker. Only consulted
    /// under `Relevance` when no custom scores are supplied; `None` is
    /// treated as an empty query.
    pub query: Option<String>,
    /// Artifacts larger than this many tokens are split into chunks.
    /// The same value is passed to `chunk_text` as the per-chunk target.
    pub microcompact_threshold: usize,
    /// Minimum overlap ratio (0.0-1.0) that counts as a semantic duplicate.
    pub semantic_overlap: f64,
}

/// Defaults: an 8 000-token budget, exact-text (`Chunked`) dedup,
/// `Relevance` ordering with no query, a 2 000-token split threshold and a
/// 0.85 semantic-overlap cutoff.
impl Default for AssembleOptions {
    fn default() -> Self {
        Self {
            budget_tokens: 8_000,
            dedup: AssembleDedup::Chunked,
            strategy: AssembleStrategy::Relevance,
            query: None,
            microcompact_threshold: 2_000,
            semantic_overlap: 0.85,
        }
    }
}
110
/// One unit of packed context with a stable content-addressed id.
#[derive(Clone, Debug)]
pub struct AssembledChunk {
    /// Content-addressed id produced by `stable_chunk_id`
    /// (`"{artifact_id}#{hash prefix}"`).
    pub id: String,
    /// Id of the artifact this chunk was cut from.
    pub artifact_id: String,
    /// Kind of the source artifact, copied verbatim.
    pub artifact_kind: String,
    /// Title of the source artifact, if it has one.
    pub title: Option<String>,
    /// Source of the source artifact, if it has one.
    pub source: Option<String>,
    /// The chunk's text body.
    pub text: String,
    /// Token estimate for `text`, computed with `estimate_chunk_tokens`.
    pub estimated_tokens: usize,
    /// Zero-based position of this chunk within its artifact's pieces.
    pub chunk_index: usize,
    /// Number of pieces the artifact was split into (before dedup).
    pub chunk_count: usize,
    /// Ranking score; starts at 0.0 and is filled in by the scoring pass.
    pub score: f64,
}
125
/// Per-artifact summary of what made it into the pack.
#[derive(Clone, Debug)]
pub struct AssembledArtifactSummary {
    /// Id of the summarized artifact.
    pub artifact_id: String,
    /// Kind of the summarized artifact.
    pub artifact_kind: String,
    /// How many of the artifact's chunks were selected.
    pub chunks_included: usize,
    /// How many of the artifact's chunks reached the packing step
    /// (selected plus budget-rejected; chunks removed by dedup are not
    /// counted).
    pub chunks_total: usize,
    /// Sum of `estimated_tokens` over the selected chunks.
    pub tokens_included: usize,
}
135
/// Reason an artifact or chunk was excluded from the final pack.
#[derive(Clone, Debug)]
pub struct AssembledExclusion {
    /// Id of the artifact the exclusion refers to.
    pub artifact_id: String,
    /// Id of the excluded chunk; `None` when the whole artifact was
    /// skipped before any chunk was built.
    pub chunk_id: Option<String>,
    /// Machine-readable reason. This module emits `"no_text"`,
    /// `"empty_text"`, `"duplicate"` and `"budget_exceeded"`.
    pub reason: &'static str,
    /// Optional extra context, e.g. the dedup mode that matched
    /// (`"chunked"`) or the id of the chunk that was kept instead
    /// (`"semantic≈<id>"`).
    pub detail: Option<String>,
}
144
/// Per-chunk rationale: records why a chunk that reached the packing step
/// was or was not included.
#[derive(Clone, Debug)]
pub struct AssembledReason {
    /// Id of the chunk the rationale describes.
    pub chunk_id: String,
    /// Id of the artifact the chunk belongs to.
    pub artifact_id: String,
    /// Wire name of the strategy that was in effect.
    pub strategy: &'static str,
    /// Score the chunk carried into the packing step.
    pub score: f64,
    /// Whether the chunk ended up in the pack.
    pub included: bool,
    /// `"selected"` for included chunks, `"budget_exceeded"` otherwise.
    pub reason: &'static str,
}
155
/// Result of `assemble_context`: the packed chunks plus the observability
/// record describing what was kept, what was dropped, and why.
#[derive(Clone, Debug)]
pub struct AssembledContext {
    /// Selected chunks, in packing order.
    pub chunks: Vec<AssembledChunk>,
    /// One summary per artifact that contributed at least one chunk,
    /// ordered by artifact id.
    pub included: Vec<AssembledArtifactSummary>,
    /// Everything that was excluded: text-less artifacts, duplicates and
    /// chunks that did not fit the budget.
    pub dropped: Vec<AssembledExclusion>,
    /// Rationale for every chunk that reached the packing step.
    pub reasons: Vec<AssembledReason>,
    /// Sum of `estimated_tokens` over `chunks`.
    pub total_tokens: usize,
    /// Budget the pack was built against.
    pub budget_tokens: usize,
    /// Strategy that was used.
    pub strategy: AssembleStrategy,
    /// Dedup mode that was used.
    pub dedup: AssembleDedup,
}
167
168/// Content-addressed chunk id — stable across runs for the same text.
169/// The leading `artifact_id` prefix keeps chunks from one artifact visually
170/// grouped in transcripts and replay diffs.
171pub fn stable_chunk_id(artifact_id: &str, text: &str) -> String {
172    let mut hasher = Sha256::new();
173    hasher.update(text.as_bytes());
174    let digest = hasher.finalize();
175    let hex = digest
176        .iter()
177        .take(8)
178        .map(|byte| format!("{byte:02x}"))
179        .collect::<String>();
180    format!("{artifact_id}#{hex}")
181}
182
/// Rough token estimate for `text`: one token per four bytes of UTF-8,
/// rounded up. Empty text costs zero tokens.
///
/// Note that the length is measured in bytes, so non-ASCII text is
/// estimated higher than its character count alone would suggest.
pub fn estimate_chunk_tokens(text: &str) -> usize {
    let byte_len = text.len();
    let full_tokens = byte_len / 4;
    if byte_len % 4 == 0 {
        full_tokens
    } else {
        full_tokens + 1
    }
}
188
/// Split a text into chunks of roughly `target_tokens` each, snapping to
/// paragraph (`\n\n`) boundaries when possible, then falling back to line
/// breaks, then raw character boundaries. Chunks never exceed the target
/// more than transiently — the final chunk may be short.
///
/// Sizes are measured in bytes at four bytes per token; a `target_tokens`
/// of zero is treated as one. Empty input yields no chunks, and input that
/// already fits the target is returned unchanged as a single chunk.
///
/// Paragraphs that are packed together are re-joined with exactly `"\n\n"`,
/// so longer runs of blank lines in the input are not preserved. A piece
/// produced by the raw fallback can overshoot the target by up to three
/// bytes, because the cut only ever moves forward to the next UTF-8
/// character boundary.
pub fn chunk_text(text: &str, target_tokens: usize) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    // Byte budget per chunk (four bytes per token, at least one token).
    let target_chars = (target_tokens.max(1)).saturating_mul(4);
    if text.len() <= target_chars {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    // Accumulates whole paragraphs until the next one would overflow.
    let mut current = String::new();
    // Flushes the accumulator into `chunks`; a no-op when it is empty.
    let push_current = |current: &mut String, chunks: &mut Vec<String>| {
        if !current.is_empty() {
            chunks.push(std::mem::take(current));
        }
    };

    for paragraph in split_paragraphs(text) {
        // `+ 2` accounts for the "\n\n" separator that would be inserted.
        if current.len() + paragraph.len() + 2 > target_chars && !current.is_empty() {
            push_current(&mut current, &mut chunks);
        }
        if paragraph.len() > target_chars {
            // Oversized paragraph: split by lines.
            push_current(&mut current, &mut chunks);
            // Accumulates whole lines (newline included) of this paragraph.
            let mut inner = String::new();
            for line in paragraph.split_inclusive('\n') {
                if inner.len() + line.len() > target_chars && !inner.is_empty() {
                    chunks.push(std::mem::take(&mut inner));
                }
                if line.len() > target_chars {
                    // Still too big: fall back to char-boundary splits.
                    let mut i = 0;
                    let bytes = line.as_bytes();
                    while i < line.len() {
                        let mut end = (i + target_chars).min(line.len());
                        // Never cut inside a multi-byte character: skip
                        // forward past UTF-8 continuation bytes (10xxxxxx).
                        while end < line.len() && (bytes[end] & 0b1100_0000) == 0b1000_0000 {
                            end += 1;
                        }
                        if !inner.is_empty() {
                            chunks.push(std::mem::take(&mut inner));
                        }
                        chunks.push(line[i..end].to_string());
                        i = end;
                    }
                } else {
                    inner.push_str(line);
                }
            }
            // Emit whatever lines are left over from this paragraph.
            if !inner.is_empty() {
                chunks.push(inner);
            }
        } else {
            if !current.is_empty() {
                current.push_str("\n\n");
            }
            current.push_str(paragraph);
        }
    }
    push_current(&mut current, &mut chunks);
    chunks
}
254
/// Splits `text` into paragraphs separated by blank lines (two or more
/// consecutive `\n`). Each paragraph is returned without leading or
/// trailing newlines, and empty paragraphs are skipped.
///
/// If nothing survives that filtering but `text` itself is non-empty
/// (i.e. it consists solely of newlines), the whole text is returned as
/// the single paragraph.
fn split_paragraphs(text: &str) -> Vec<&str> {
    // Splitting on "\n\n" leaves at most one stray '\n' at the edge of a
    // piece when a run of newlines has odd length; trimming removes it.
    let paragraphs: Vec<&str> = text
        .split("\n\n")
        .map(|piece| piece.trim_matches('\n'))
        .filter(|piece| !piece.is_empty())
        .collect();

    if paragraphs.is_empty() && !text.is_empty() {
        return vec![text];
    }
    paragraphs
}
286
/// Trigram set for shingle-based semantic dedup.
///
/// The text is normalized first: alphanumeric characters are lowercased,
/// every whitespace character becomes a single ASCII space, and everything
/// else (punctuation, symbols) is dropped. Trigrams are then taken over
/// sliding three-byte windows of the UTF-8 encoding of that normalized
/// text, so behavior is stable across platforms.
///
/// Non-ASCII characters contribute their full UTF-8 byte sequence. They
/// are never truncated to a single byte, which would make unrelated
/// characters (for example `š` and `a`) indistinguishable.
///
/// Returns an empty set when the normalized text is shorter than three
/// bytes.
fn trigrams(text: &str) -> BTreeSet<[u8; 3]> {
    let mut normalized: Vec<u8> = Vec::with_capacity(text.len());
    let mut utf8 = [0u8; 4];
    for c in text.chars() {
        if c.is_alphanumeric() {
            // `to_lowercase` may expand to several chars; keep all of them.
            for lower in c.to_lowercase() {
                normalized.extend_from_slice(lower.encode_utf8(&mut utf8).as_bytes());
            }
        } else if c.is_whitespace() {
            normalized.push(b' ');
        }
    }
    // `windows(3)` yields nothing for inputs shorter than three bytes.
    normalized
        .windows(3)
        .map(|window| [window[0], window[1], window[2]])
        .collect()
}
312
/// Jaccard similarity `|a ∩ b| / |a ∪ b|` of two trigram sets, in
/// `[0.0, 1.0]`. Two empty sets are defined to be identical (1.0).
fn jaccard(a: &BTreeSet<[u8; 3]>, b: &BTreeSet<[u8; 3]>) -> f64 {
    let shared = a.intersection(b).count();
    // Inclusion–exclusion: |a ∪ b| = |a| + |b| - |a ∩ b|.
    let combined = a.len() + b.len() - shared;
    match combined {
        // The union is empty only when both inputs are empty.
        0 => 1.0,
        total => shared as f64 / total as f64,
    }
}
325
/// Default relevance ranker: scores `text` against `query` in `[0.0, 1.0]`.
///
/// The query is split on whitespace; terms of two bytes or fewer are
/// ignored and the rest are ASCII-lowercased and deduplicated. A term
/// counts as matched when it occurs anywhere in the ASCII-lowercased text
/// (substring match). The score blends two parts:
///
/// * coverage (70%) — the fraction of query terms that matched;
/// * density (30%) — matched terms relative to text length, capped at 1,
///   so a 100-char chunk that mentions "parser" outranks a 10k-char chunk
///   that mentions it once.
///
/// Returns 0.0 when the query contains no usable terms.
fn keyword_overlap_score(text: &str, query: &str) -> f64 {
    // A blank query produces no terms, so it is covered by this check too.
    let terms: BTreeSet<String> = query
        .split_whitespace()
        .filter(|term| term.len() > 2)
        .map(str::to_ascii_lowercase)
        .collect();
    if terms.is_empty() {
        return 0.0;
    }

    let haystack = text.to_ascii_lowercase();
    let hits = terms
        .iter()
        .filter(|term| haystack.contains(term.as_str()))
        .count() as f64;

    let coverage = hits / terms.len() as f64;
    // One "unit" of length per 400 bytes, plus one so short texts don't
    // divide by (nearly) zero.
    let density = hits / (text.len() as f64 / 400.0 + 1.0);
    coverage * 0.7 + density.min(1.0) * 0.3
}
352
353/// Build every candidate chunk from the input artifacts. Artifacts that
354/// exceed `microcompact_threshold` tokens get split; smaller ones become
355/// a single chunk each. Skips artifacts with no text body.
356pub fn build_candidate_chunks(
357    artifacts: &[ArtifactRecord],
358    options: &AssembleOptions,
359    dropped: &mut Vec<AssembledExclusion>,
360) -> Vec<AssembledChunk> {
361    let mut candidates = Vec::new();
362    for artifact in artifacts {
363        let Some(text) = artifact.text.as_ref() else {
364            dropped.push(AssembledExclusion {
365                artifact_id: artifact.id.clone(),
366                chunk_id: None,
367                reason: "no_text",
368                detail: None,
369            });
370            continue;
371        };
372        let trimmed = text.trim();
373        if trimmed.is_empty() {
374            dropped.push(AssembledExclusion {
375                artifact_id: artifact.id.clone(),
376                chunk_id: None,
377                reason: "empty_text",
378                detail: None,
379            });
380            continue;
381        }
382        let estimated = artifact
383            .estimated_tokens
384            .unwrap_or_else(|| estimate_chunk_tokens(text));
385        let pieces: Vec<String> = if estimated > options.microcompact_threshold {
386            chunk_text(text, options.microcompact_threshold)
387        } else {
388            vec![text.clone()]
389        };
390        let count = pieces.len();
391        for (idx, piece) in pieces.into_iter().enumerate() {
392            let id = stable_chunk_id(&artifact.id, &piece);
393            let tokens = estimate_chunk_tokens(&piece);
394            candidates.push(AssembledChunk {
395                id,
396                artifact_id: artifact.id.clone(),
397                artifact_kind: artifact.kind.clone(),
398                title: artifact.title.clone(),
399                source: artifact.source.clone(),
400                text: piece,
401                estimated_tokens: tokens,
402                chunk_index: idx,
403                chunk_count: count,
404                score: 0.0,
405            });
406        }
407    }
408    candidates
409}
410
411/// Apply dedup. Returns (kept, dropped-by-dedup). `dropped` is the slice
412/// needed for the caller's observability record; the reason is always
413/// `"duplicate"`.
414pub fn dedup_chunks(
415    mut chunks: Vec<AssembledChunk>,
416    mode: AssembleDedup,
417    semantic_overlap: f64,
418) -> (Vec<AssembledChunk>, Vec<AssembledExclusion>) {
419    let mut dropped = Vec::new();
420    match mode {
421        AssembleDedup::None => (chunks, dropped),
422        AssembleDedup::Chunked => {
423            let mut seen: BTreeSet<String> = BTreeSet::new();
424            chunks.retain(|chunk| {
425                let key = normalized_text_key(&chunk.text);
426                if seen.insert(key) {
427                    true
428                } else {
429                    dropped.push(AssembledExclusion {
430                        artifact_id: chunk.artifact_id.clone(),
431                        chunk_id: Some(chunk.id.clone()),
432                        reason: "duplicate",
433                        detail: Some("chunked".to_string()),
434                    });
435                    false
436                }
437            });
438            (chunks, dropped)
439        }
440        AssembleDedup::Semantic => {
441            let mut kept: Vec<(AssembledChunk, BTreeSet<[u8; 3]>)> = Vec::new();
442            for chunk in chunks.drain(..) {
443                let trigrams_new = trigrams(&chunk.text);
444                let mut duplicate = false;
445                for (existing, existing_trigrams) in &kept {
446                    if jaccard(&trigrams_new, existing_trigrams) >= semantic_overlap {
447                        dropped.push(AssembledExclusion {
448                            artifact_id: chunk.artifact_id.clone(),
449                            chunk_id: Some(chunk.id.clone()),
450                            reason: "duplicate",
451                            detail: Some(format!("semantic≈{}", existing.id)),
452                        });
453                        duplicate = true;
454                        break;
455                    }
456                }
457                if !duplicate {
458                    kept.push((chunk, trigrams_new));
459                }
460            }
461            (kept.into_iter().map(|(chunk, _)| chunk).collect(), dropped)
462        }
463    }
464}
465
/// Dedup key for exact-text matching: the whitespace-separated words of
/// `text` joined by single spaces, so differences in spacing, line breaks
/// or leading/trailing whitespace do not matter.
fn normalized_text_key(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty key means "first word".
        if !key.is_empty() {
            key.push(' ');
        }
        key.push_str(word);
    }
    key
}
469
470/// Score chunks under `strategy`. With `Relevance` and no callback scores,
471/// falls back to keyword overlap against `options.query`. The `custom_scores`
472/// Option is the hook where a host-supplied ranker slots in — the caller
473/// (stdlib binding) invokes the closure and passes the resulting Vec here.
474pub fn score_chunks(
475    chunks: &mut [AssembledChunk],
476    artifacts: &[ArtifactRecord],
477    options: &AssembleOptions,
478    custom_scores: Option<&[f64]>,
479) {
480    match options.strategy {
481        AssembleStrategy::Recency => {
482            // Newer artifacts first; within an artifact, earlier chunks first.
483            let order: std::collections::BTreeMap<&str, (String, usize)> = artifacts
484                .iter()
485                .enumerate()
486                .map(|(idx, artifact)| (artifact.id.as_str(), (artifact.created_at.clone(), idx)))
487                .collect();
488            for chunk in chunks.iter_mut() {
489                let (created_at, input_idx) = order
490                    .get(chunk.artifact_id.as_str())
491                    .cloned()
492                    .unwrap_or_else(|| (String::new(), 0));
493                // Score is a synthetic "recency score" in [0, 1] based on
494                // lexicographic order of created_at with the input index as
495                // a stable tiebreaker. Newer created_at → higher score.
496                let recency_rank = created_at
497                    .chars()
498                    .fold(0u64, |acc, c| acc.wrapping_mul(131).wrapping_add(c as u64));
499                chunk.score = recency_rank as f64 / u64::MAX as f64
500                    - (input_idx as f64) * 1e-9
501                    - (chunk.chunk_index as f64) * 1e-12;
502            }
503        }
504        AssembleStrategy::Relevance => {
505            if let Some(scores) = custom_scores {
506                for (chunk, score) in chunks.iter_mut().zip(scores.iter()) {
507                    chunk.score = *score;
508                }
509                // Any trailing chunks without a custom score keep 0.0.
510            } else {
511                let query = options.query.as_deref().unwrap_or("");
512                for chunk in chunks.iter_mut() {
513                    chunk.score = keyword_overlap_score(&chunk.text, query);
514                }
515            }
516        }
517        AssembleStrategy::RoundRobin => {
518            // Round-robin is handled at pack time; score just reflects
519            // input order so ties break deterministically.
520            for (idx, chunk) in chunks.iter_mut().enumerate() {
521                chunk.score = 1.0 - (idx as f64) * 1e-6;
522            }
523        }
524    }
525}
526
527/// Pack chunks under `budget_tokens`. Returns (selected, rejected-for-budget).
528pub fn pack_budget(
529    chunks: Vec<AssembledChunk>,
530    options: &AssembleOptions,
531) -> (Vec<AssembledChunk>, Vec<AssembledChunk>) {
532    let mut sorted = chunks;
533    match options.strategy {
534        AssembleStrategy::RoundRobin => {
535            // Group by artifact_id preserving first-appearance order, then interleave.
536            let mut groups: Vec<Vec<AssembledChunk>> = Vec::new();
537            let mut group_index: std::collections::BTreeMap<String, usize> =
538                std::collections::BTreeMap::new();
539            // Preserve input order of artifacts by walking sorted-as-given.
540            for chunk in sorted.drain(..) {
541                let key = chunk.artifact_id.clone();
542                let idx = match group_index.get(&key) {
543                    Some(idx) => *idx,
544                    None => {
545                        let idx = groups.len();
546                        group_index.insert(key.clone(), idx);
547                        groups.push(Vec::new());
548                        idx
549                    }
550                };
551                groups[idx].push(chunk);
552            }
553            // Within each group, keep by chunk_index ascending.
554            for group in &mut groups {
555                group.sort_by_key(|chunk| chunk.chunk_index);
556            }
557            let mut interleaved = Vec::new();
558            let max_len = groups.iter().map(Vec::len).max().unwrap_or(0);
559            for i in 0..max_len {
560                for group in &mut groups {
561                    if i < group.len() {
562                        interleaved.push(group[i].clone());
563                    }
564                }
565            }
566            sorted = interleaved;
567        }
568        _ => {
569            sorted.sort_by(|a, b| {
570                b.score
571                    .partial_cmp(&a.score)
572                    .unwrap_or(std::cmp::Ordering::Equal)
573                    .then_with(|| a.artifact_id.cmp(&b.artifact_id))
574                    .then_with(|| a.chunk_index.cmp(&b.chunk_index))
575            });
576        }
577    }
578
579    let mut selected = Vec::new();
580    let mut rejected = Vec::new();
581    let mut used = 0usize;
582    for chunk in sorted {
583        if used + chunk.estimated_tokens > options.budget_tokens {
584            rejected.push(chunk);
585            continue;
586        }
587        used += chunk.estimated_tokens;
588        selected.push(chunk);
589    }
590    (selected, rejected)
591}
592
593/// Core assembly pass. The caller is responsible for supplying
594/// `custom_scores` (Some when a host ranker produced them, None otherwise).
595pub fn assemble_context(
596    artifacts: &[ArtifactRecord],
597    options: &AssembleOptions,
598    custom_scores: Option<&[f64]>,
599) -> AssembledContext {
600    let mut dropped = Vec::new();
601    let candidates = build_candidate_chunks(artifacts, options, &mut dropped);
602    // When a custom_scores slice was supplied, it's indexed over the
603    // *pre-dedup* candidate list — the caller saw those chunk ids. Build
604    // a score map keyed by chunk id so dedup doesn't misalign the slice.
605    let custom_map: Option<std::collections::BTreeMap<String, f64>> = custom_scores.map(|scores| {
606        candidates
607            .iter()
608            .zip(scores.iter().copied())
609            .map(|(chunk, score)| (chunk.id.clone(), score))
610            .collect()
611    });
612    let (mut deduped, dedup_dropped) =
613        dedup_chunks(candidates, options.dedup, options.semantic_overlap);
614    dropped.extend(dedup_dropped);
615
616    if let Some(map) = custom_map.as_ref() {
617        for chunk in deduped.iter_mut() {
618            chunk.score = map.get(&chunk.id).copied().unwrap_or(0.0);
619        }
620    } else {
621        score_chunks(&mut deduped, artifacts, options, None);
622    }
623
624    let (selected, rejected) = pack_budget(deduped, options);
625
626    let mut reasons = Vec::new();
627    let mut included_tokens: std::collections::BTreeMap<String, (String, usize, usize, usize)> =
628        std::collections::BTreeMap::new();
629    // chunk_count per artifact from selected + rejected, for "of X chunks" observability.
630    let mut total_counts: std::collections::BTreeMap<String, usize> =
631        std::collections::BTreeMap::new();
632    for chunk in selected.iter().chain(rejected.iter()) {
633        *total_counts.entry(chunk.artifact_id.clone()).or_insert(0) += 1;
634    }
635
636    for chunk in &selected {
637        reasons.push(AssembledReason {
638            chunk_id: chunk.id.clone(),
639            artifact_id: chunk.artifact_id.clone(),
640            strategy: options.strategy.as_str(),
641            score: chunk.score,
642            included: true,
643            reason: "selected",
644        });
645        let entry = included_tokens
646            .entry(chunk.artifact_id.clone())
647            .or_insert_with(|| {
648                (
649                    chunk.artifact_kind.clone(),
650                    0,
651                    *total_counts.get(&chunk.artifact_id).unwrap_or(&0),
652                    0,
653                )
654            });
655        entry.1 += 1;
656        entry.3 += chunk.estimated_tokens;
657    }
658    for chunk in &rejected {
659        reasons.push(AssembledReason {
660            chunk_id: chunk.id.clone(),
661            artifact_id: chunk.artifact_id.clone(),
662            strategy: options.strategy.as_str(),
663            score: chunk.score,
664            included: false,
665            reason: "budget_exceeded",
666        });
667        dropped.push(AssembledExclusion {
668            artifact_id: chunk.artifact_id.clone(),
669            chunk_id: Some(chunk.id.clone()),
670            reason: "budget_exceeded",
671            detail: None,
672        });
673    }
674
675    let total_tokens = selected.iter().map(|chunk| chunk.estimated_tokens).sum();
676    let included: Vec<AssembledArtifactSummary> = included_tokens
677        .into_iter()
678        .map(
679            |(artifact_id, (kind, included, total, tokens))| AssembledArtifactSummary {
680                artifact_id,
681                artifact_kind: kind,
682                chunks_included: included,
683                chunks_total: total,
684                tokens_included: tokens,
685            },
686        )
687        .collect();
688
689    AssembledContext {
690        chunks: selected,
691        included,
692        dropped,
693        reasons,
694        total_tokens,
695        budget_tokens: options.budget_tokens,
696        strategy: options.strategy,
697        dedup: options.dedup,
698    }
699}
700
701/// Render assembled chunks into the XML-ish `<artifact>` format used by
702/// `render_artifacts_context`, so swapping in `assemble_context` at the
703/// workflow stage layer produces the same prompt shape agents already
704/// expect. Appends a trailing `<context_budget>` summary so the agent
705/// (and replay diff) can see how much of the budget got used.
706pub fn render_assembled_chunks(assembled: &AssembledContext) -> String {
707    let mut parts = Vec::with_capacity(assembled.chunks.len() + 1);
708    for chunk in &assembled.chunks {
709        let title = chunk
710            .title
711            .clone()
712            .unwrap_or_else(|| format!("{} {}", chunk.artifact_kind, chunk.artifact_id));
713        parts.push(format!(
714            "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
715<chunk_id>{}</chunk_id>\n<chunk_index>{} of {}</chunk_index>\n<body>\n{}\n</body>\n</artifact>",
716            escape_xml_text(&title),
717            escape_xml_text(&chunk.artifact_kind),
718            escape_xml_text(chunk.source.as_deref().unwrap_or("unknown")),
719            escape_xml_text(&chunk.id),
720            chunk.chunk_index + 1,
721            chunk.chunk_count,
722            chunk.text,
723        ));
724    }
725    parts.push(format!(
726        "<context_budget>\n<used_tokens>{}</used_tokens>\n<budget_tokens>{}</budget_tokens>\n<strategy>{}</strategy>\n<dedup>{}</dedup>\n</context_budget>",
727        assembled.total_tokens,
728        assembled.budget_tokens,
729        assembled.strategy.as_str(),
730        assembled.dedup.as_str(),
731    ));
732    parts.join("\n\n")
733}
734
735#[cfg(test)]
736mod tests {
737    use super::*;
738
739    fn artifact(id: &str, text: &str) -> ArtifactRecord {
740        ArtifactRecord {
741            type_name: "artifact".to_string(),
742            id: id.to_string(),
743            kind: "resource".to_string(),
744            title: Some(id.to_string()),
745            text: Some(text.to_string()),
746            data: None,
747            source: None,
748            created_at: format!("2026-04-{id:0>2}T00:00:00Z"),
749            freshness: None,
750            priority: Some(50),
751            lineage: Vec::new(),
752            relevance: None,
753            estimated_tokens: None,
754            stage: None,
755            metadata: Default::default(),
756        }
757        .normalize()
758    }
759
760    #[test]
761    fn chunk_ids_are_stable_and_content_addressed() {
762        let a = artifact("01", "alpha bravo charlie");
763        let options = AssembleOptions::default();
764        let mut dropped = Vec::new();
765        let first = build_candidate_chunks(&[a.clone()], &options, &mut dropped);
766        let second = build_candidate_chunks(&[a], &options, &mut dropped);
767        assert_eq!(first[0].id, second[0].id);
768        assert!(first[0].id.starts_with("01#"));
769        // Different text → different id.
770        let different = artifact("01", "delta echo foxtrot");
771        let different_chunks = build_candidate_chunks(&[different], &options, &mut dropped);
772        assert_ne!(first[0].id, different_chunks[0].id);
773    }
774
775    #[test]
776    fn chunked_dedup_drops_exact_duplicates() {
777        let a = artifact("01", "shared body");
778        let b = artifact("02", "shared body");
779        let options = AssembleOptions {
780            budget_tokens: 10_000,
781            dedup: AssembleDedup::Chunked,
782            strategy: AssembleStrategy::Recency,
783            ..AssembleOptions::default()
784        };
785        let result = assemble_context(&[a, b], &options, None);
786        assert_eq!(result.chunks.len(), 1);
787        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"));
788    }
789
790    #[test]
791    fn semantic_dedup_catches_near_duplicates() {
792        let a = artifact(
793            "01",
794            "The parser drift issue was diagnosed by tracing token spans.",
795        );
796        let b = artifact(
797            "02",
798            "The parser drift issue, diagnosed by tracing token spans, appeared in the tokenizer.",
799        );
800        let options = AssembleOptions {
801            dedup: AssembleDedup::Semantic,
802            strategy: AssembleStrategy::Recency,
803            semantic_overlap: 0.5,
804            ..AssembleOptions::default()
805        };
806        let result = assemble_context(&[a, b], &options, None);
807        // One of the two should dedup out.
808        assert_eq!(result.chunks.len(), 1);
809        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"
810            && d.detail
811                .as_deref()
812                .is_some_and(|s| s.starts_with("semantic"))));
813    }
814
815    #[test]
816    fn budget_enforcement_trims_excess_chunks() {
817        let text = "word ".repeat(5_000); // ~25_000 chars → ~6_250 tokens
818        let a = artifact("01", &text);
819        let options = AssembleOptions {
820            budget_tokens: 500,
821            dedup: AssembleDedup::None,
822            strategy: AssembleStrategy::Recency,
823            microcompact_threshold: 200,
824            ..AssembleOptions::default()
825        };
826        let result = assemble_context(&[a], &options, None);
827        assert!(result.total_tokens <= options.budget_tokens);
828        assert!(result
829            .reasons
830            .iter()
831            .any(|r| !r.included && r.reason == "budget_exceeded"));
832    }
833
834    #[test]
835    fn relevance_strategy_prefers_query_matches() {
836        let a = artifact("01", "completely unrelated content about weather");
837        let b = artifact("02", "parser drift diagnostics token spans hotspot");
838        let options = AssembleOptions {
839            // Tight budget: only one ~11-token chunk fits.
840            budget_tokens: 12,
841            dedup: AssembleDedup::None,
842            strategy: AssembleStrategy::Relevance,
843            query: Some("parser drift diagnostics".to_string()),
844            microcompact_threshold: 10_000,
845            ..AssembleOptions::default()
846        };
847        let result = assemble_context(&[a, b], &options, None);
848        assert_eq!(result.chunks.len(), 1);
849        assert_eq!(result.chunks[0].artifact_id, "02");
850    }
851
852    #[test]
853    fn round_robin_interleaves_artifacts() {
854        // Each paragraph is ~10 chars (~3 tokens); microcompact threshold of
855        // 3 tokens (=12 chars) forces one chunk per paragraph without
856        // fragmenting the final one mid-word.
857        let a = artifact("01", "alpha aaaa\n\nbeta bbbb\n\ngamma ccc");
858        let b = artifact("02", "delta dddd\n\nepsilon ee\n\nzeta ff");
859        let options = AssembleOptions {
860            budget_tokens: 10_000,
861            dedup: AssembleDedup::None,
862            strategy: AssembleStrategy::RoundRobin,
863            microcompact_threshold: 3,
864            ..AssembleOptions::default()
865        };
866        let result = assemble_context(&[a, b], &options, None);
867        let order: Vec<&str> = result
868            .chunks
869            .iter()
870            .map(|c| c.artifact_id.as_str())
871            .collect();
872        // First four positions must alternate even if counts don't match
873        // exactly — interleaving is the invariant, not total chunk count.
874        assert!(order.len() >= 4);
875        assert_eq!(order[0], "01");
876        assert_eq!(order[1], "02");
877        assert_eq!(order[2], "01");
878        assert_eq!(order[3], "02");
879    }
880
881    #[test]
882    fn custom_scores_override_default_ranker() {
883        let a = artifact("01", "first body content");
884        let b = artifact("02", "second body content");
885        let options = AssembleOptions {
886            // Tight budget: only one ~5-token chunk fits.
887            budget_tokens: 6,
888            dedup: AssembleDedup::None,
889            strategy: AssembleStrategy::Relevance,
890            query: Some("first".to_string()),
891            microcompact_threshold: 10_000,
892            ..AssembleOptions::default()
893        };
894        let mut dropped = Vec::new();
895        let candidates = build_candidate_chunks(&[a.clone(), b.clone()], &options, &mut dropped);
896        assert_eq!(candidates.len(), 2);
897        // Host-supplied ranker deliberately inverts the default order: it
898        // scores the second artifact higher even though query says "first".
899        let scores = vec![0.1, 0.9];
900        let result = assemble_context(&[a, b], &options, Some(&scores));
901        assert_eq!(result.chunks.len(), 1);
902        assert_eq!(result.chunks[0].artifact_id, "02");
903    }
904
905    #[test]
906    fn reasons_name_strategy_and_inclusion() {
907        let a = artifact("01", "included body");
908        let b = artifact("02", "dropped body because budget");
909        let options = AssembleOptions {
910            budget_tokens: 5,
911            dedup: AssembleDedup::None,
912            strategy: AssembleStrategy::Recency,
913            microcompact_threshold: 10_000,
914            ..AssembleOptions::default()
915        };
916        let result = assemble_context(&[a, b], &options, None);
917        assert!(result.reasons.iter().any(|r| r.included));
918        assert!(result.reasons.iter().any(|r| !r.included));
919        for reason in &result.reasons {
920            assert_eq!(reason.strategy, "recency");
921        }
922    }
923
924    #[test]
925    fn empty_artifact_reports_dropped() {
926        let mut empty = artifact("01", "");
927        empty.text = Some(String::new());
928        let options = AssembleOptions::default();
929        let result = assemble_context(&[empty], &options, None);
930        assert!(result.chunks.is_empty());
931        assert!(result
932            .dropped
933            .iter()
934            .any(|d| d.reason == "empty_text" || d.reason == "no_text"));
935    }
936}
harn_vm/orchestration/assemble.rs

harn_vm/orchestration/
assemble.rs