// harn_vm/orchestration/assemble.rs
1//! Adaptive context assembly with cross-artifact microcompaction.
2//!
3//! `assemble_context` packs a set of artifacts into a token-budgeted slice
4//! of chunks, deduplicating overlap across artifacts, snipping oversized
5//! entries into chunked form, and returning an observability record that
6//! names why each chunk was included or dropped.
7//!
8//! The core is intentionally deterministic: given the same input artifacts
9//! and options, it produces the same chunk ids and ordering. A host-supplied
10//! ranker callback is the only non-deterministic hook; the VM-side binding
11//! invokes it via the same pattern as `compress_callback`.
12
13use std::collections::BTreeSet;
14
15use sha2::{Digest, Sha256};
16
17use super::ArtifactRecord;
18
/// Strategy used to order chunks when packing into the budget.
/// Parsed from the wire strings `recency` / `relevance` / `round_robin`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleStrategy {
    /// Sort by artifact `created_at` (newest first), then by chunk index.
    Recency,
    /// Sort by ranker score (highest first). Default ranker is token-overlap
    /// against `query`; a host callback can supply a custom one.
    Relevance,
    /// Interleave chunks one-per-artifact in artifact input order, cycling
    /// until the budget fills. Gives every artifact a chance to contribute.
    RoundRobin,
}
31
32impl AssembleStrategy {
33    pub fn parse(value: &str) -> Result<Self, String> {
34        match value {
35            "recency" => Ok(Self::Recency),
36            "relevance" => Ok(Self::Relevance),
37            "round_robin" => Ok(Self::RoundRobin),
38            other => Err(format!(
39                "assemble_context: strategy must be one of recency | relevance | round_robin (got {other:?})"
40            )),
41        }
42    }
43
44    pub fn as_str(&self) -> &'static str {
45        match self {
46            Self::Recency => "recency",
47            Self::Relevance => "relevance",
48            Self::RoundRobin => "round_robin",
49        }
50    }
51}
52
/// Duplicate-suppression mode applied to candidate chunks before scoring.
/// Parsed from the wire strings `none` / `chunked` / `semantic`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleDedup {
    /// Keep every chunk; no duplicate detection at all.
    None,
    /// Hash each chunk's normalized text; drop later duplicates.
    Chunked,
    /// Shingle-based overlap detection. Treat chunks whose trigram
    /// Jaccard similarity meets the configured `semantic_overlap`
    /// threshold (default 0.85) as duplicates. Still fully
    /// deterministic — no embeddings or callback required.
    Semantic,
}
63
64impl AssembleDedup {
65    pub fn parse(value: &str) -> Result<Self, String> {
66        match value {
67            "none" => Ok(Self::None),
68            "chunked" => Ok(Self::Chunked),
69            "semantic" => Ok(Self::Semantic),
70            other => Err(format!(
71                "assemble_context: dedup must be one of none | chunked | semantic (got {other:?})"
72            )),
73        }
74    }
75
76    pub fn as_str(&self) -> &'static str {
77        match self {
78            Self::None => "none",
79            Self::Chunked => "chunked",
80            Self::Semantic => "semantic",
81        }
82    }
83}
84
/// Tuning knobs for `assemble_context`.
#[derive(Clone, Debug)]
pub struct AssembleOptions {
    /// Token budget the packed chunks must fit under.
    pub budget_tokens: usize,
    /// Duplicate-suppression mode applied before scoring.
    pub dedup: AssembleDedup,
    /// Ordering strategy used when packing into the budget.
    pub strategy: AssembleStrategy,
    /// Query text for the default relevance ranker (keyword overlap).
    pub query: Option<String>,
    /// Artifacts larger than this many tokens are split into chunks.
    pub microcompact_threshold: usize,
    /// Minimum overlap ratio (0.0-1.0) that counts as a semantic duplicate.
    pub semantic_overlap: f64,
}
96
97impl Default for AssembleOptions {
98    fn default() -> Self {
99        Self {
100            budget_tokens: 8_000,
101            dedup: AssembleDedup::Chunked,
102            strategy: AssembleStrategy::Relevance,
103            query: None,
104            microcompact_threshold: 2_000,
105            semantic_overlap: 0.85,
106        }
107    }
108}
109
/// One unit of packed context with a stable content-addressed id.
#[derive(Clone, Debug)]
pub struct AssembledChunk {
    /// `"{artifact_id}#{hex8}"` from `stable_chunk_id` — stable across runs.
    pub id: String,
    /// Id of the artifact this chunk was cut from.
    pub artifact_id: String,
    /// Artifact kind, copied verbatim from the source record.
    pub artifact_kind: String,
    /// Optional human-readable title from the artifact.
    pub title: Option<String>,
    /// Optional provenance string from the artifact.
    pub source: Option<String>,
    /// The chunk's text body.
    pub text: String,
    /// Heuristic token count (~4 chars/token, see `estimate_chunk_tokens`).
    pub estimated_tokens: usize,
    /// Zero-based position of this chunk within its artifact.
    pub chunk_index: usize,
    /// Total number of chunks the artifact was split into.
    pub chunk_count: usize,
    /// Ranking score; meaning depends on the strategy that assigned it.
    pub score: f64,
}
124
/// Per-artifact summary of what made it into the pack.
#[derive(Clone, Debug)]
pub struct AssembledArtifactSummary {
    pub artifact_id: String,
    pub artifact_kind: String,
    /// Chunks of this artifact that fit the budget.
    pub chunks_included: usize,
    /// Chunks that survived dedup (included + budget-rejected).
    pub chunks_total: usize,
    /// Estimated tokens contributed by the included chunks.
    pub tokens_included: usize,
}
134
/// Reason an artifact or chunk was excluded from the final pack.
#[derive(Clone, Debug)]
pub struct AssembledExclusion {
    pub artifact_id: String,
    /// `None` when a whole artifact was skipped before chunking
    /// (`no_text` / `empty_text`); `Some` for per-chunk drops.
    pub chunk_id: Option<String>,
    /// Machine-readable cause: "no_text", "empty_text", "duplicate",
    /// or "budget_exceeded".
    pub reason: &'static str,
    /// Optional extra context, e.g. which chunk a duplicate matched.
    pub detail: Option<String>,
}
143
/// Per-chunk rationale — the "why" field the issue calls out.
#[derive(Clone, Debug)]
pub struct AssembledReason {
    pub chunk_id: String,
    pub artifact_id: String,
    /// Strategy name in effect when the decision was made.
    pub strategy: &'static str,
    /// Score the chunk carried at pack time.
    pub score: f64,
    /// Whether the chunk made it into the final pack.
    pub included: bool,
    /// "selected" when included, "budget_exceeded" when not.
    pub reason: &'static str,
}
154
/// Full result of `assemble_context`: the packed chunks plus the
/// observability record explaining every inclusion and drop.
#[derive(Clone, Debug)]
pub struct AssembledContext {
    /// Chunks that fit the budget, in pack order.
    pub chunks: Vec<AssembledChunk>,
    /// Per-artifact inclusion summaries.
    pub included: Vec<AssembledArtifactSummary>,
    /// Everything excluded, with reasons.
    pub dropped: Vec<AssembledExclusion>,
    /// Per-chunk include/drop rationale.
    pub reasons: Vec<AssembledReason>,
    /// Sum of estimated tokens over `chunks`.
    pub total_tokens: usize,
    /// Budget the pack was constrained to.
    pub budget_tokens: usize,
    /// Strategy echoed back for replay/observability.
    pub strategy: AssembleStrategy,
    /// Dedup mode echoed back for replay/observability.
    pub dedup: AssembleDedup,
}
166
167/// Content-addressed chunk id — stable across runs for the same text.
168/// The leading `artifact_id` prefix keeps chunks from one artifact visually
169/// grouped in transcripts and replay diffs.
170pub fn stable_chunk_id(artifact_id: &str, text: &str) -> String {
171    let mut hasher = Sha256::new();
172    hasher.update(text.as_bytes());
173    let digest = hasher.finalize();
174    let hex = digest
175        .iter()
176        .take(8)
177        .map(|byte| format!("{byte:02x}"))
178        .collect::<String>();
179    format!("{artifact_id}#{hex}")
180}
181
/// Approximate token count using the same chars-per-token heuristic as
/// `estimate_message_tokens`. One token ~= 4 characters, rounded up so
/// any non-empty text costs at least one token.
pub fn estimate_chunk_tokens(text: &str) -> usize {
    (text.len() + 3) / 4
}
187
/// Split a text into chunks of roughly `target_tokens` each, snapping to
/// paragraph (`\n\n`) boundaries when possible, then falling back to line
/// breaks, then raw character boundaries. Chunks never exceed the target
/// more than transiently — the final chunk may be short.
pub fn chunk_text(text: &str, target_tokens: usize) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    // Budget in characters (~4 chars/token); never zero.
    let limit = target_tokens.max(1).saturating_mul(4);
    if text.len() <= limit {
        return vec![text.to_string()];
    }

    let mut out: Vec<String> = Vec::new();
    let mut pending = String::new();
    // Flush the paragraph accumulator into `out` if it holds anything.
    let flush = |pending: &mut String, out: &mut Vec<String>| {
        if !pending.is_empty() {
            out.push(std::mem::take(pending));
        }
    };

    for paragraph in split_paragraphs(text) {
        // Would appending this paragraph (plus "\n\n" separator) overflow?
        if !pending.is_empty() && pending.len() + paragraph.len() + 2 > limit {
            flush(&mut pending, &mut out);
        }
        if paragraph.len() <= limit {
            if !pending.is_empty() {
                pending.push_str("\n\n");
            }
            pending.push_str(paragraph);
            continue;
        }
        // Oversized paragraph: split on line boundaries.
        flush(&mut pending, &mut out);
        let mut line_acc = String::new();
        for line in paragraph.split_inclusive('\n') {
            if !line_acc.is_empty() && line_acc.len() + line.len() > limit {
                out.push(std::mem::take(&mut line_acc));
            }
            if line.len() <= limit {
                line_acc.push_str(line);
                continue;
            }
            // Single line still too big: hard-split at char boundaries.
            let bytes = line.as_bytes();
            let mut start = 0;
            while start < line.len() {
                let mut end = (start + limit).min(line.len());
                // Never cut inside a multi-byte UTF-8 sequence: skip past
                // continuation bytes (0b10xxxxxx).
                while end < line.len() && (bytes[end] & 0b1100_0000) == 0b1000_0000 {
                    end += 1;
                }
                if !line_acc.is_empty() {
                    out.push(std::mem::take(&mut line_acc));
                }
                out.push(line[start..end].to_string());
                start = end;
            }
        }
        if !line_acc.is_empty() {
            out.push(line_acc);
        }
    }
    flush(&mut pending, &mut out);
    out
}

/// Split on blank-line boundaries, trimming stray newlines off each
/// paragraph and dropping empty segments. If nothing survives (e.g. the
/// text is all newlines) the whole text is returned as one paragraph so
/// callers never lose content.
fn split_paragraphs(text: &str) -> Vec<&str> {
    let mut paragraphs: Vec<&str> = text
        .split("\n\n")
        .map(|segment| segment.trim_matches('\n'))
        .filter(|segment| !segment.is_empty())
        .collect();
    if paragraphs.is_empty() && !text.is_empty() {
        paragraphs.push(text);
    }
    paragraphs
}
285
/// Trigram set for shingle-based semantic dedup.
///
/// Normalization: alphanumerics are lowercased (ASCII-only lowercasing)
/// and emitted as their full UTF-8 byte sequence, runs of whitespace become
/// single space bytes, everything else is dropped. Trigrams are then taken
/// over 3-byte windows of the normalized stream, so behavior is stable
/// across platforms. Texts with fewer than 3 normalized bytes produce an
/// empty set.
fn trigrams(text: &str) -> BTreeSet<[u8; 3]> {
    let mut normalized: Vec<u8> = Vec::with_capacity(text.len());
    for c in text.chars() {
        if c.is_alphanumeric() {
            // Encode the whole UTF-8 sequence. The previous `as u8` cast
            // truncated non-ASCII scalars to their low byte, making distinct
            // characters (e.g. U+0101 vs U+0201) collide in the shingles.
            let mut buf = [0u8; 4];
            normalized.extend_from_slice(c.to_ascii_lowercase().encode_utf8(&mut buf).as_bytes());
        } else if c.is_whitespace() {
            normalized.push(b' ');
        }
    }
    let mut out = BTreeSet::new();
    // `windows(3)` yields nothing when normalized.len() < 3 — same result
    // as the old explicit early return.
    for window in normalized.windows(3) {
        out.insert([window[0], window[1], window[2]]);
    }
    out
}
311
/// Jaccard similarity |A∩B| / |A∪B| of two trigram sets.
/// Two empty sets are defined as identical (1.0).
fn jaccard(a: &BTreeSet<[u8; 3]>, b: &BTreeSet<[u8; 3]>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    let shared = a.intersection(b).count();
    match a.union(b).count() {
        0 => 0.0,
        combined => shared as f64 / combined as f64,
    }
}
324
/// Default relevance ranker: fraction of query terms present in `text`,
/// blended with a density bonus. Returns 0.0 for an empty/too-short query.
fn keyword_overlap_score(text: &str, query: &str) -> f64 {
    if query.trim().is_empty() {
        return 0.0;
    }
    // Terms of 2 bytes or fewer are ignored as noise.
    let terms: BTreeSet<String> = query
        .split_whitespace()
        .filter(|term| term.len() > 2)
        .map(str::to_ascii_lowercase)
        .collect();
    if terms.is_empty() {
        return 0.0;
    }
    let haystack = text.to_ascii_lowercase();
    let hits = terms
        .iter()
        .filter(|term| haystack.contains(term.as_str()))
        .count();
    let coverage = hits as f64 / terms.len() as f64;
    // Length penalty: prefer chunks where matched query terms are dense,
    // so a 100-char chunk that mentions "parser" scores higher than a
    // 10k-char chunk that mentions "parser" once.
    let density = (hits as f64) / (text.len() as f64 / 400.0 + 1.0);
    coverage * 0.7 + density.min(1.0) * 0.3
}
351
352/// Build every candidate chunk from the input artifacts. Artifacts that
353/// exceed `microcompact_threshold` tokens get split; smaller ones become
354/// a single chunk each. Skips artifacts with no text body.
355pub fn build_candidate_chunks(
356    artifacts: &[ArtifactRecord],
357    options: &AssembleOptions,
358    dropped: &mut Vec<AssembledExclusion>,
359) -> Vec<AssembledChunk> {
360    let mut candidates = Vec::new();
361    for artifact in artifacts {
362        let Some(text) = artifact.text.as_ref() else {
363            dropped.push(AssembledExclusion {
364                artifact_id: artifact.id.clone(),
365                chunk_id: None,
366                reason: "no_text",
367                detail: None,
368            });
369            continue;
370        };
371        let trimmed = text.trim();
372        if trimmed.is_empty() {
373            dropped.push(AssembledExclusion {
374                artifact_id: artifact.id.clone(),
375                chunk_id: None,
376                reason: "empty_text",
377                detail: None,
378            });
379            continue;
380        }
381        let estimated = artifact
382            .estimated_tokens
383            .unwrap_or_else(|| estimate_chunk_tokens(text));
384        let pieces: Vec<String> = if estimated > options.microcompact_threshold {
385            chunk_text(text, options.microcompact_threshold)
386        } else {
387            vec![text.to_string()]
388        };
389        let count = pieces.len();
390        for (idx, piece) in pieces.into_iter().enumerate() {
391            let id = stable_chunk_id(&artifact.id, &piece);
392            let tokens = estimate_chunk_tokens(&piece);
393            candidates.push(AssembledChunk {
394                id,
395                artifact_id: artifact.id.clone(),
396                artifact_kind: artifact.kind.clone(),
397                title: artifact.title.clone(),
398                source: artifact.source.clone(),
399                text: piece,
400                estimated_tokens: tokens,
401                chunk_index: idx,
402                chunk_count: count,
403                score: 0.0,
404            });
405        }
406    }
407    candidates
408}
409
410/// Apply dedup. Returns (kept, dropped-by-dedup). `dropped` is the slice
411/// needed for the caller's observability record; the reason is always
412/// `"duplicate"`.
413pub fn dedup_chunks(
414    mut chunks: Vec<AssembledChunk>,
415    mode: AssembleDedup,
416    semantic_overlap: f64,
417) -> (Vec<AssembledChunk>, Vec<AssembledExclusion>) {
418    let mut dropped = Vec::new();
419    match mode {
420        AssembleDedup::None => (chunks, dropped),
421        AssembleDedup::Chunked => {
422            let mut seen: BTreeSet<String> = BTreeSet::new();
423            chunks.retain(|chunk| {
424                let key = normalized_text_key(&chunk.text);
425                if seen.insert(key) {
426                    true
427                } else {
428                    dropped.push(AssembledExclusion {
429                        artifact_id: chunk.artifact_id.clone(),
430                        chunk_id: Some(chunk.id.clone()),
431                        reason: "duplicate",
432                        detail: Some("chunked".to_string()),
433                    });
434                    false
435                }
436            });
437            (chunks, dropped)
438        }
439        AssembleDedup::Semantic => {
440            let mut kept: Vec<(AssembledChunk, BTreeSet<[u8; 3]>)> = Vec::new();
441            for chunk in chunks.drain(..) {
442                let trigrams_new = trigrams(&chunk.text);
443                let mut duplicate = false;
444                for (existing, existing_trigrams) in &kept {
445                    if jaccard(&trigrams_new, existing_trigrams) >= semantic_overlap {
446                        dropped.push(AssembledExclusion {
447                            artifact_id: chunk.artifact_id.clone(),
448                            chunk_id: Some(chunk.id.clone()),
449                            reason: "duplicate",
450                            detail: Some(format!("semantic≈{}", existing.id)),
451                        });
452                        duplicate = true;
453                        break;
454                    }
455                }
456                if !duplicate {
457                    kept.push((chunk, trigrams_new));
458                }
459            }
460            (kept.into_iter().map(|(chunk, _)| chunk).collect(), dropped)
461        }
462    }
463}
464
/// Whitespace-insensitive dedup key: collapse every run of whitespace to a
/// single space so formatting differences don't defeat exact-match dedup.
fn normalized_text_key(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for (i, word) in text.split_whitespace().enumerate() {
        if i > 0 {
            key.push(' ');
        }
        key.push_str(word);
    }
    key
}
468
469/// Score chunks under `strategy`. With `Relevance` and no callback scores,
470/// falls back to keyword overlap against `options.query`. The `custom_scores`
471/// Option is the hook where a host-supplied ranker slots in — the caller
472/// (stdlib binding) invokes the closure and passes the resulting Vec here.
473pub fn score_chunks(
474    chunks: &mut [AssembledChunk],
475    artifacts: &[ArtifactRecord],
476    options: &AssembleOptions,
477    custom_scores: Option<&[f64]>,
478) {
479    match options.strategy {
480        AssembleStrategy::Recency => {
481            // Newer artifacts first; within an artifact, earlier chunks first.
482            let order: std::collections::BTreeMap<&str, (String, usize)> = artifacts
483                .iter()
484                .enumerate()
485                .map(|(idx, artifact)| (artifact.id.as_str(), (artifact.created_at.clone(), idx)))
486                .collect();
487            for chunk in chunks.iter_mut() {
488                let (created_at, input_idx) = order
489                    .get(chunk.artifact_id.as_str())
490                    .cloned()
491                    .unwrap_or_else(|| (String::new(), 0));
492                // Score is a synthetic "recency score" in [0, 1] based on
493                // lexicographic order of created_at with the input index as
494                // a stable tiebreaker. Newer created_at → higher score.
495                let recency_rank = created_at
496                    .chars()
497                    .fold(0u64, |acc, c| acc.wrapping_mul(131).wrapping_add(c as u64));
498                chunk.score = recency_rank as f64 / u64::MAX as f64
499                    - (input_idx as f64) * 1e-9
500                    - (chunk.chunk_index as f64) * 1e-12;
501            }
502        }
503        AssembleStrategy::Relevance => {
504            if let Some(scores) = custom_scores {
505                for (chunk, score) in chunks.iter_mut().zip(scores.iter()) {
506                    chunk.score = *score;
507                }
508                // Any trailing chunks without a custom score keep 0.0.
509            } else {
510                let query = options.query.as_deref().unwrap_or("");
511                for chunk in chunks.iter_mut() {
512                    chunk.score = keyword_overlap_score(&chunk.text, query);
513                }
514            }
515        }
516        AssembleStrategy::RoundRobin => {
517            // Round-robin is handled at pack time; score just reflects
518            // input order so ties break deterministically.
519            for (idx, chunk) in chunks.iter_mut().enumerate() {
520                chunk.score = 1.0 - (idx as f64) * 1e-6;
521            }
522        }
523    }
524}
525
526/// Pack chunks under `budget_tokens`. Returns (selected, rejected-for-budget).
527pub fn pack_budget(
528    chunks: Vec<AssembledChunk>,
529    options: &AssembleOptions,
530) -> (Vec<AssembledChunk>, Vec<AssembledChunk>) {
531    let mut sorted = chunks;
532    match options.strategy {
533        AssembleStrategy::RoundRobin => {
534            // Group by artifact_id preserving first-appearance order, then interleave.
535            let mut groups: Vec<Vec<AssembledChunk>> = Vec::new();
536            let mut group_index: std::collections::BTreeMap<String, usize> =
537                std::collections::BTreeMap::new();
538            // Preserve input order of artifacts by walking sorted-as-given.
539            for chunk in sorted.drain(..) {
540                let key = chunk.artifact_id.clone();
541                let idx = match group_index.get(&key) {
542                    Some(idx) => *idx,
543                    None => {
544                        let idx = groups.len();
545                        group_index.insert(key.clone(), idx);
546                        groups.push(Vec::new());
547                        idx
548                    }
549                };
550                groups[idx].push(chunk);
551            }
552            // Within each group, keep by chunk_index ascending.
553            for group in &mut groups {
554                group.sort_by_key(|chunk| chunk.chunk_index);
555            }
556            let mut interleaved = Vec::new();
557            let max_len = groups.iter().map(Vec::len).max().unwrap_or(0);
558            for i in 0..max_len {
559                for group in &mut groups {
560                    if i < group.len() {
561                        interleaved.push(group[i].clone());
562                    }
563                }
564            }
565            sorted = interleaved;
566        }
567        _ => {
568            sorted.sort_by(|a, b| {
569                b.score
570                    .partial_cmp(&a.score)
571                    .unwrap_or(std::cmp::Ordering::Equal)
572                    .then_with(|| a.artifact_id.cmp(&b.artifact_id))
573                    .then_with(|| a.chunk_index.cmp(&b.chunk_index))
574            });
575        }
576    }
577
578    let mut selected = Vec::new();
579    let mut rejected = Vec::new();
580    let mut used = 0usize;
581    for chunk in sorted {
582        if used + chunk.estimated_tokens > options.budget_tokens {
583            rejected.push(chunk);
584            continue;
585        }
586        used += chunk.estimated_tokens;
587        selected.push(chunk);
588    }
589    (selected, rejected)
590}
591
/// Core assembly pass. The caller is responsible for supplying
/// `custom_scores` (Some when a host ranker produced them, None otherwise).
///
/// Pipeline: build candidate chunks → dedup → score → pack under budget →
/// emit per-chunk reasons and per-artifact summaries.
pub fn assemble_context(
    artifacts: &[ArtifactRecord],
    options: &AssembleOptions,
    custom_scores: Option<&[f64]>,
) -> AssembledContext {
    let mut dropped = Vec::new();
    let candidates = build_candidate_chunks(artifacts, options, &mut dropped);
    // When a custom_scores slice was supplied, it's indexed over the
    // *pre-dedup* candidate list — the caller saw those chunk ids. Build
    // a score map keyed by chunk id so dedup doesn't misalign the slice.
    let custom_map: Option<std::collections::BTreeMap<String, f64>> = custom_scores.map(|scores| {
        candidates
            .iter()
            .zip(scores.iter().copied())
            .map(|(chunk, score)| (chunk.id.clone(), score))
            .collect()
    });
    let (mut deduped, dedup_dropped) =
        dedup_chunks(candidates, options.dedup, options.semantic_overlap);
    dropped.extend(dedup_dropped);

    // Custom scores win outright when present; chunks the ranker never
    // scored default to 0.0. Otherwise fall back to strategy scoring.
    if let Some(map) = custom_map.as_ref() {
        for chunk in deduped.iter_mut() {
            chunk.score = map.get(&chunk.id).copied().unwrap_or(0.0);
        }
    } else {
        score_chunks(&mut deduped, artifacts, options, None);
    }

    let (selected, rejected) = pack_budget(deduped, options);

    let mut reasons = Vec::new();
    // Per-artifact accumulator:
    // (kind, chunks_included, chunks_total, tokens_included).
    let mut included_tokens: std::collections::BTreeMap<String, (String, usize, usize, usize)> =
        std::collections::BTreeMap::new();
    // chunk_count per artifact from selected + rejected, for "of X chunks" observability.
    // NOTE: this counts post-dedup chunks only; dedup-dropped chunks are
    // not part of the total.
    let mut total_counts: std::collections::BTreeMap<String, usize> =
        std::collections::BTreeMap::new();
    for chunk in selected.iter().chain(rejected.iter()) {
        *total_counts.entry(chunk.artifact_id.clone()).or_insert(0) += 1;
    }

    for chunk in &selected {
        reasons.push(AssembledReason {
            chunk_id: chunk.id.clone(),
            artifact_id: chunk.artifact_id.clone(),
            strategy: options.strategy.as_str(),
            score: chunk.score,
            included: true,
            reason: "selected",
        });
        let entry = included_tokens
            .entry(chunk.artifact_id.clone())
            .or_insert_with(|| {
                (
                    chunk.artifact_kind.clone(),
                    0,
                    *total_counts.get(&chunk.artifact_id).unwrap_or(&0),
                    0,
                )
            });
        entry.1 += 1; // chunks_included
        entry.3 += chunk.estimated_tokens; // tokens_included
    }
    // Budget rejections show up both as a per-chunk reason and as an
    // exclusion record, so either view tells the whole story.
    for chunk in &rejected {
        reasons.push(AssembledReason {
            chunk_id: chunk.id.clone(),
            artifact_id: chunk.artifact_id.clone(),
            strategy: options.strategy.as_str(),
            score: chunk.score,
            included: false,
            reason: "budget_exceeded",
        });
        dropped.push(AssembledExclusion {
            artifact_id: chunk.artifact_id.clone(),
            chunk_id: Some(chunk.id.clone()),
            reason: "budget_exceeded",
            detail: None,
        });
    }

    let total_tokens = selected.iter().map(|chunk| chunk.estimated_tokens).sum();
    let included: Vec<AssembledArtifactSummary> = included_tokens
        .into_iter()
        .map(
            |(artifact_id, (kind, included, total, tokens))| AssembledArtifactSummary {
                artifact_id,
                artifact_kind: kind,
                chunks_included: included,
                chunks_total: total,
                tokens_included: tokens,
            },
        )
        .collect();

    AssembledContext {
        chunks: selected,
        included,
        dropped,
        reasons,
        total_tokens,
        budget_tokens: options.budget_tokens,
        strategy: options.strategy,
        dedup: options.dedup,
    }
}
699
700/// Render assembled chunks into the XML-ish `<artifact>` format used by
701/// `render_artifacts_context`, so swapping in `assemble_context` at the
702/// workflow stage layer produces the same prompt shape agents already
703/// expect. Appends a trailing `<context_budget>` summary so the agent
704/// (and replay diff) can see how much of the budget got used.
705pub fn render_assembled_chunks(assembled: &AssembledContext) -> String {
706    let mut parts = Vec::with_capacity(assembled.chunks.len() + 1);
707    for chunk in &assembled.chunks {
708        let title = chunk
709            .title
710            .clone()
711            .unwrap_or_else(|| format!("{} {}", chunk.artifact_kind, chunk.artifact_id));
712        parts.push(format!(
713            "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
714<chunk_id>{}</chunk_id>\n<chunk_index>{} of {}</chunk_index>\n<body>\n{}\n</body>\n</artifact>",
715            escape_xml(&title),
716            escape_xml(&chunk.artifact_kind),
717            escape_xml(chunk.source.as_deref().unwrap_or("unknown")),
718            escape_xml(&chunk.id),
719            chunk.chunk_index + 1,
720            chunk.chunk_count,
721            chunk.text,
722        ));
723    }
724    parts.push(format!(
725        "<context_budget>\n<used_tokens>{}</used_tokens>\n<budget_tokens>{}</budget_tokens>\n<strategy>{}</strategy>\n<dedup>{}</dedup>\n</context_budget>",
726        assembled.total_tokens,
727        assembled.budget_tokens,
728        assembled.strategy.as_str(),
729        assembled.dedup.as_str(),
730    ));
731    parts.join("\n\n")
732}
733
/// Minimal XML escaping for element content: `&`, `<`, and `>` only.
/// Single-pass over the characters, so already-escaped entities in the
/// input are escaped again (same as the old chained `replace` calls,
/// which substituted `&` before introducing new ones).
fn escape_xml(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
739
740#[cfg(test)]
741mod tests {
742    use super::*;
743
744    fn artifact(id: &str, text: &str) -> ArtifactRecord {
745        ArtifactRecord {
746            type_name: "artifact".to_string(),
747            id: id.to_string(),
748            kind: "resource".to_string(),
749            title: Some(id.to_string()),
750            text: Some(text.to_string()),
751            data: None,
752            source: None,
753            created_at: format!("2026-04-{id:0>2}T00:00:00Z"),
754            freshness: None,
755            priority: Some(50),
756            lineage: Vec::new(),
757            relevance: None,
758            estimated_tokens: None,
759            stage: None,
760            metadata: Default::default(),
761        }
762        .normalize()
763    }
764
765    #[test]
766    fn chunk_ids_are_stable_and_content_addressed() {
767        let a = artifact("01", "alpha bravo charlie");
768        let options = AssembleOptions::default();
769        let mut dropped = Vec::new();
770        let first = build_candidate_chunks(&[a.clone()], &options, &mut dropped);
771        let second = build_candidate_chunks(&[a], &options, &mut dropped);
772        assert_eq!(first[0].id, second[0].id);
773        assert!(first[0].id.starts_with("01#"));
774        // Different text → different id.
775        let different = artifact("01", "delta echo foxtrot");
776        let different_chunks = build_candidate_chunks(&[different], &options, &mut dropped);
777        assert_ne!(first[0].id, different_chunks[0].id);
778    }
779
780    #[test]
781    fn chunked_dedup_drops_exact_duplicates() {
782        let a = artifact("01", "shared body");
783        let b = artifact("02", "shared body");
784        let options = AssembleOptions {
785            budget_tokens: 10_000,
786            dedup: AssembleDedup::Chunked,
787            strategy: AssembleStrategy::Recency,
788            ..AssembleOptions::default()
789        };
790        let result = assemble_context(&[a, b], &options, None);
791        assert_eq!(result.chunks.len(), 1);
792        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"));
793    }
794
795    #[test]
796    fn semantic_dedup_catches_near_duplicates() {
797        let a = artifact(
798            "01",
799            "The parser drift issue was diagnosed by tracing token spans.",
800        );
801        let b = artifact(
802            "02",
803            "The parser drift issue, diagnosed by tracing token spans, appeared in the tokenizer.",
804        );
805        let options = AssembleOptions {
806            dedup: AssembleDedup::Semantic,
807            strategy: AssembleStrategy::Recency,
808            semantic_overlap: 0.5,
809            ..AssembleOptions::default()
810        };
811        let result = assemble_context(&[a, b], &options, None);
812        // One of the two should dedup out.
813        assert_eq!(result.chunks.len(), 1);
814        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"
815            && d.detail
816                .as_deref()
817                .is_some_and(|s| s.starts_with("semantic"))));
818    }
819
820    #[test]
821    fn budget_enforcement_trims_excess_chunks() {
822        let text = "word ".repeat(5_000); // ~25_000 chars → ~6_250 tokens
823        let a = artifact("01", &text);
824        let options = AssembleOptions {
825            budget_tokens: 500,
826            dedup: AssembleDedup::None,
827            strategy: AssembleStrategy::Recency,
828            microcompact_threshold: 200,
829            ..AssembleOptions::default()
830        };
831        let result = assemble_context(&[a], &options, None);
832        assert!(result.total_tokens <= options.budget_tokens);
833        assert!(result
834            .reasons
835            .iter()
836            .any(|r| !r.included && r.reason == "budget_exceeded"));
837    }
838
839    #[test]
840    fn relevance_strategy_prefers_query_matches() {
841        let a = artifact("01", "completely unrelated content about weather");
842        let b = artifact("02", "parser drift diagnostics token spans hotspot");
843        let options = AssembleOptions {
844            // Tight budget: only one ~11-token chunk fits.
845            budget_tokens: 12,
846            dedup: AssembleDedup::None,
847            strategy: AssembleStrategy::Relevance,
848            query: Some("parser drift diagnostics".to_string()),
849            microcompact_threshold: 10_000,
850            ..AssembleOptions::default()
851        };
852        let result = assemble_context(&[a, b], &options, None);
853        assert_eq!(result.chunks.len(), 1);
854        assert_eq!(result.chunks[0].artifact_id, "02");
855    }
856
857    #[test]
858    fn round_robin_interleaves_artifacts() {
859        // Each paragraph is ~10 chars (~3 tokens); microcompact threshold of
860        // 3 tokens (=12 chars) forces one chunk per paragraph without
861        // fragmenting the final one mid-word.
862        let a = artifact("01", "alpha aaaa\n\nbeta bbbb\n\ngamma ccc");
863        let b = artifact("02", "delta dddd\n\nepsilon ee\n\nzeta ff");
864        let options = AssembleOptions {
865            budget_tokens: 10_000,
866            dedup: AssembleDedup::None,
867            strategy: AssembleStrategy::RoundRobin,
868            microcompact_threshold: 3,
869            ..AssembleOptions::default()
870        };
871        let result = assemble_context(&[a, b], &options, None);
872        let order: Vec<&str> = result
873            .chunks
874            .iter()
875            .map(|c| c.artifact_id.as_str())
876            .collect();
877        // First four positions must alternate even if counts don't match
878        // exactly — interleaving is the invariant, not total chunk count.
879        assert!(order.len() >= 4);
880        assert_eq!(order[0], "01");
881        assert_eq!(order[1], "02");
882        assert_eq!(order[2], "01");
883        assert_eq!(order[3], "02");
884    }
885
886    #[test]
887    fn custom_scores_override_default_ranker() {
888        let a = artifact("01", "first body content");
889        let b = artifact("02", "second body content");
890        let options = AssembleOptions {
891            // Tight budget: only one ~5-token chunk fits.
892            budget_tokens: 6,
893            dedup: AssembleDedup::None,
894            strategy: AssembleStrategy::Relevance,
895            query: Some("first".to_string()),
896            microcompact_threshold: 10_000,
897            ..AssembleOptions::default()
898        };
899        let mut dropped = Vec::new();
900        let candidates = build_candidate_chunks(&[a.clone(), b.clone()], &options, &mut dropped);
901        assert_eq!(candidates.len(), 2);
902        // Host-supplied ranker deliberately inverts the default order: it
903        // scores the second artifact higher even though query says "first".
904        let scores = vec![0.1, 0.9];
905        let result = assemble_context(&[a, b], &options, Some(&scores));
906        assert_eq!(result.chunks.len(), 1);
907        assert_eq!(result.chunks[0].artifact_id, "02");
908    }
909
910    #[test]
911    fn reasons_name_strategy_and_inclusion() {
912        let a = artifact("01", "included body");
913        let b = artifact("02", "dropped body because budget");
914        let options = AssembleOptions {
915            budget_tokens: 5,
916            dedup: AssembleDedup::None,
917            strategy: AssembleStrategy::Recency,
918            microcompact_threshold: 10_000,
919            ..AssembleOptions::default()
920        };
921        let result = assemble_context(&[a, b], &options, None);
922        assert!(result.reasons.iter().any(|r| r.included));
923        assert!(result.reasons.iter().any(|r| !r.included));
924        for reason in &result.reasons {
925            assert_eq!(reason.strategy, "recency");
926        }
927    }
928
929    #[test]
930    fn empty_artifact_reports_dropped() {
931        let mut empty = artifact("01", "");
932        empty.text = Some(String::new());
933        let options = AssembleOptions::default();
934        let result = assemble_context(&[empty], &options, None);
935        assert!(result.chunks.is_empty());
936        assert!(result
937            .dropped
938            .iter()
939            .any(|d| d.reason == "empty_text" || d.reason == "no_text"));
940    }
941}