//! Assemble bounded context from artifact records: chunk oversized text,
//! deduplicate, score by strategy, and pack chunks into a token budget.

use std::collections::BTreeSet;

use sha2::{Digest, Sha256};

use super::ArtifactRecord;

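/// Ordering strategy for candidate chunks: newest artifacts first, query-keyword
/// relevance, or round-robin interleaving across artifacts.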
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleStrategy {
    Recency,
    Relevance,
    RoundRobin,
}

impl AssembleStrategy {
    pub fn parse(value: &str) -> Result<Self, String> {
        match value {
            "recency" => Ok(Self::Recency),
            "relevance" => Ok(Self::Relevance),
            "round_robin" => Ok(Self::RoundRobin),
            other => Err(format!(
                "assemble_context: strategy must be one of recency | relevance | round_robin (got {other:?})"
            )),
        }
    }

    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Recency => "recency",
            Self::Relevance => "relevance",
            Self::RoundRobin => "round_robin",
        }
    }
}

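/// Deduplication mode: keep everything, drop whitespace-normalized exact
/// duplicates, or drop near-duplicates by trigram overlap.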
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleDedup {
    None,
    Chunked,
    Semantic,
}

impl AssembleDedup {
    pub fn parse(value: &str) -> Result<Self, String> {
        match value {
            "none" => Ok(Self::None),
            "chunked" => Ok(Self::Chunked),
            "semantic" => Ok(Self::Semantic),
            other => Err(format!(
                "assemble_context: dedup must be one of none | chunked | semantic (got {other:?})"
            )),
        }
    }

    pub fn as_str(&self) -> &'static str {
        match self {
            Self::None => "none",
            Self::Chunked => "chunked",
            Self::Semantic => "semantic",
        }
    }
}

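/// Tunable options for `assemble_context`.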
#[derive(Clone, Debug)]
pub struct AssembleOptions {
    pub budget_tokens: usize,
    pub dedup: AssembleDedup,
    pub strategy: AssembleStrategy,
    pub query: Option<String>,
    /// Artifacts estimated above this token count are split into chunks of
    /// roughly this many tokens.
    pub microcompact_threshold: usize,
    /// Trigram Jaccard similarity at or above which `Semantic` dedup treats a
    /// chunk as a duplicate.
    pub semantic_overlap: f64,
}

impl Default for AssembleOptions {
    fn default() -> Self {
        Self {
            budget_tokens: 8_000,
            dedup: AssembleDedup::Chunked,
            strategy: AssembleStrategy::Relevance,
            query: None,
            microcompact_threshold: 2_000,
            semantic_overlap: 0.85,
        }
    }
}

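/// One candidate chunk of artifact text, with its provenance and estimated size.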
#[derive(Clone, Debug)]
pub struct AssembledChunk {
    pub id: String,
    pub artifact_id: String,
    pub artifact_kind: String,
    pub title: Option<String>,
    pub source: Option<String>,
    pub text: String,
    pub estimated_tokens: usize,
    pub chunk_index: usize,
    pub chunk_count: usize,
    pub score: f64,
}

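/// Per-artifact rollup of how many chunks and tokens made it into the context.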
#[derive(Clone, Debug)]
pub struct AssembledArtifactSummary {
    pub artifact_id: String,
    pub artifact_kind: String,
    pub chunks_included: usize,
    pub chunks_total: usize,
    pub tokens_included: usize,
}

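/// Record of a chunk (or whole artifact) that was excluded, and why.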
#[derive(Clone, Debug)]
pub struct AssembledExclusion {
    pub artifact_id: String,
    pub chunk_id: Option<String>,
    pub reason: &'static str,
    pub detail: Option<String>,
}

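/// Per-chunk audit entry naming the strategy, score, and inclusion decision.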
#[derive(Clone, Debug)]
pub struct AssembledReason {
    pub chunk_id: String,
    pub artifact_id: String,
    pub strategy: &'static str,
    pub score: f64,
    pub included: bool,
    pub reason: &'static str,
}

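/// Final assembly result: the selected chunks plus bookkeeping about what was
/// included, what was dropped, and why.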
#[derive(Clone, Debug)]
pub struct AssembledContext {
    pub chunks: Vec<AssembledChunk>,
    pub included: Vec<AssembledArtifactSummary>,
    pub dropped: Vec<AssembledExclusion>,
    pub reasons: Vec<AssembledReason>,
    pub total_tokens: usize,
    pub budget_tokens: usize,
    pub strategy: AssembleStrategy,
    pub dedup: AssembleDedup,
}

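/// Content-addressed chunk id: the artifact id plus the first eight bytes of
/// the chunk text's SHA-256 digest, hex-encoded, so identical text always maps
/// to the same id.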
pub fn stable_chunk_id(artifact_id: &str, text: &str) -> String {
    let mut hasher = Sha256::new();
    hasher.update(text.as_bytes());
    let digest = hasher.finalize();
    let hex = digest
        .iter()
        .take(8)
        .map(|byte| format!("{byte:02x}"))
        .collect::<String>();
    format!("{artifact_id}#{hex}")
}

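/// Rough token estimate: one token per four bytes of text, rounded up.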
pub fn estimate_chunk_tokens(text: &str) -> usize {
    text.len().div_ceil(4)
}

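/// Split text into pieces of roughly `target_tokens`, preferring paragraph
/// boundaries, then line boundaries, then hard splits at char boundaries.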
pub fn chunk_text(text: &str, target_tokens: usize) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    let target_chars = (target_tokens.max(1)).saturating_mul(4);
    if text.len() <= target_chars {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut current = String::new();
    let push_current = |current: &mut String, chunks: &mut Vec<String>| {
        if !current.is_empty() {
            chunks.push(std::mem::take(current));
        }
    };

    for paragraph in split_paragraphs(text) {
        if current.len() + paragraph.len() + 2 > target_chars && !current.is_empty() {
            push_current(&mut current, &mut chunks);
        }
        if paragraph.len() > target_chars {
            // Oversized paragraph: flush what we have, then split it by lines.
            push_current(&mut current, &mut chunks);
            let mut inner = String::new();
            for line in paragraph.split_inclusive('\n') {
                if inner.len() + line.len() > target_chars && !inner.is_empty() {
                    chunks.push(std::mem::take(&mut inner));
                }
                if line.len() > target_chars {
                    // Oversized line: hard-split, nudging each cut forward to a
                    // UTF-8 char boundary.
                    let mut i = 0;
                    let bytes = line.as_bytes();
                    while i < line.len() {
                        let mut end = (i + target_chars).min(line.len());
                        while end < line.len() && (bytes[end] & 0b1100_0000) == 0b1000_0000 {
                            end += 1;
                        }
                        if !inner.is_empty() {
                            chunks.push(std::mem::take(&mut inner));
                        }
                        chunks.push(line[i..end].to_string());
                        i = end;
                    }
                } else {
                    inner.push_str(line);
                }
            }
            if !inner.is_empty() {
                chunks.push(inner);
            }
        } else {
            if !current.is_empty() {
                current.push_str("\n\n");
            }
            current.push_str(paragraph);
        }
    }
    push_current(&mut current, &mut chunks);
    chunks
}

fn split_paragraphs(text: &str) -> Vec<&str> {
    let mut out = Vec::new();
    let mut start = 0;
    let bytes = text.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        if bytes[i] == b'\n' && bytes[i + 1] == b'\n' {
            let segment = text[start..i].trim_matches('\n');
            if !segment.is_empty() {
                out.push(segment);
            }
            // Skip the whole run of blank lines before the next paragraph.
            let mut j = i;
            while j < bytes.len() && bytes[j] == b'\n' {
                j += 1;
            }
            start = j;
            i = j;
        } else {
            i += 1;
        }
    }
    let tail = text[start..].trim_matches('\n');
    if !tail.is_empty() {
        out.push(tail);
    }
    if out.is_empty() && !text.is_empty() {
        out.push(text);
    }
    out
}

fn trigrams(text: &str) -> BTreeSet<[u8; 3]> {
    // ASCII-oriented normalization: keep lowercased alphanumerics, map each
    // whitespace character to a space, and drop everything else.
    let normalized: Vec<u8> = text
        .chars()
        .filter_map(|c| {
            if c.is_alphanumeric() {
                Some(c.to_ascii_lowercase() as u8)
            } else if c.is_whitespace() {
                Some(b' ')
            } else {
                None
            }
        })
        .collect();
    let mut out = BTreeSet::new();
    if normalized.len() < 3 {
        return out;
    }
    for window in normalized.windows(3) {
        out.insert([window[0], window[1], window[2]]);
    }
    out
}

fn jaccard(a: &BTreeSet<[u8; 3]>, b: &BTreeSet<[u8; 3]>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    let intersection = a.intersection(b).count() as f64;
    let union = a.union(b).count() as f64;
    if union == 0.0 {
        0.0
    } else {
        intersection / union
    }
}

fn keyword_overlap_score(text: &str, query: &str) -> f64 {
    if query.trim().is_empty() {
        return 0.0;
    }
    let query_terms: BTreeSet<String> = query
        .split_whitespace()
        .filter(|term| term.len() > 2)
        .map(|term| term.to_ascii_lowercase())
        .collect();
    if query_terms.is_empty() {
        return 0.0;
    }
    let mut matches = 0usize;
    let lower = text.to_ascii_lowercase();
    for term in &query_terms {
        if lower.contains(term.as_str()) {
            matches += 1;
        }
    }
    let base = matches as f64 / query_terms.len() as f64;
    let density = (matches as f64) / (text.len() as f64 / 400.0 + 1.0);
    // Blend term coverage with a length-dampened match density so short,
    // on-topic chunks are not dominated by long ones.
    base * 0.7 + density.min(1.0) * 0.3
}

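/// Turn artifact text into candidate chunks, splitting artifacts whose
/// estimated size exceeds `microcompact_threshold` and recording artifacts
/// dropped for having no usable text.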
pub fn build_candidate_chunks(
    artifacts: &[ArtifactRecord],
    options: &AssembleOptions,
    dropped: &mut Vec<AssembledExclusion>,
) -> Vec<AssembledChunk> {
    let mut candidates = Vec::new();
    for artifact in artifacts {
        let Some(text) = artifact.text.as_ref() else {
            dropped.push(AssembledExclusion {
                artifact_id: artifact.id.clone(),
                chunk_id: None,
                reason: "no_text",
                detail: None,
            });
            continue;
        };
        let trimmed = text.trim();
        if trimmed.is_empty() {
            dropped.push(AssembledExclusion {
                artifact_id: artifact.id.clone(),
                chunk_id: None,
                reason: "empty_text",
                detail: None,
            });
            continue;
        }
        let estimated = artifact
            .estimated_tokens
            .unwrap_or_else(|| estimate_chunk_tokens(text));
        let pieces: Vec<String> = if estimated > options.microcompact_threshold {
            chunk_text(text, options.microcompact_threshold)
        } else {
            vec![text.to_string()]
        };
        let count = pieces.len();
        for (idx, piece) in pieces.into_iter().enumerate() {
            let id = stable_chunk_id(&artifact.id, &piece);
            let tokens = estimate_chunk_tokens(&piece);
            candidates.push(AssembledChunk {
                id,
                artifact_id: artifact.id.clone(),
                artifact_kind: artifact.kind.clone(),
                title: artifact.title.clone(),
                source: artifact.source.clone(),
                text: piece,
                estimated_tokens: tokens,
                chunk_index: idx,
                chunk_count: count,
                score: 0.0,
            });
        }
    }
    candidates
}

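/// Drop duplicate chunks according to `mode`, returning the survivors plus an
/// exclusion record for every chunk that was removed.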
pub fn dedup_chunks(
    mut chunks: Vec<AssembledChunk>,
    mode: AssembleDedup,
    semantic_overlap: f64,
) -> (Vec<AssembledChunk>, Vec<AssembledExclusion>) {
    let mut dropped = Vec::new();
    match mode {
        AssembleDedup::None => (chunks, dropped),
        AssembleDedup::Chunked => {
            let mut seen: BTreeSet<String> = BTreeSet::new();
            chunks.retain(|chunk| {
                let key = normalized_text_key(&chunk.text);
                if seen.insert(key) {
                    true
                } else {
                    dropped.push(AssembledExclusion {
                        artifact_id: chunk.artifact_id.clone(),
                        chunk_id: Some(chunk.id.clone()),
                        reason: "duplicate",
                        detail: Some("chunked".to_string()),
                    });
                    false
                }
            });
            (chunks, dropped)
        }
        AssembleDedup::Semantic => {
            let mut kept: Vec<(AssembledChunk, BTreeSet<[u8; 3]>)> = Vec::new();
            for chunk in chunks.drain(..) {
                let trigrams_new = trigrams(&chunk.text);
                let mut duplicate = false;
                for (existing, existing_trigrams) in &kept {
                    if jaccard(&trigrams_new, existing_trigrams) >= semantic_overlap {
                        dropped.push(AssembledExclusion {
                            artifact_id: chunk.artifact_id.clone(),
                            chunk_id: Some(chunk.id.clone()),
                            reason: "duplicate",
                            detail: Some(format!("semantic≈{}", existing.id)),
                        });
                        duplicate = true;
                        break;
                    }
                }
                if !duplicate {
                    kept.push((chunk, trigrams_new));
                }
            }
            (kept.into_iter().map(|(chunk, _)| chunk).collect(), dropped)
        }
    }
}

fn normalized_text_key(text: &str) -> String {
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}

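/// Assign each chunk a score for the configured strategy; `pack_budget` packs
/// higher scores first (round-robin interleaves by artifact instead).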
pub fn score_chunks(
    chunks: &mut [AssembledChunk],
    artifacts: &[ArtifactRecord],
    options: &AssembleOptions,
    custom_scores: Option<&[f64]>,
) {
    match options.strategy {
        AssembleStrategy::Recency => {
            let order: std::collections::BTreeMap<&str, (String, usize)> = artifacts
                .iter()
                .enumerate()
                .map(|(idx, artifact)| (artifact.id.as_str(), (artifact.created_at.clone(), idx)))
                .collect();
            // Rank by created_at (ISO 8601 strings compare chronologically),
            // breaking ties by input order and then by chunk index so earlier
            // chunks of the same artifact stay ahead.
            let mut stamps: Vec<String> = order.values().map(|(ts, _)| ts.clone()).collect();
            stamps.sort();
            stamps.dedup();
            for chunk in chunks.iter_mut() {
                let (created_at, input_idx) = order
                    .get(chunk.artifact_id.as_str())
                    .cloned()
                    .unwrap_or_else(|| (String::new(), 0));
                let recency_rank = stamps
                    .binary_search(&created_at)
                    .map(|pos| pos + 1)
                    .unwrap_or(0);
                chunk.score = recency_rank as f64 / stamps.len().max(1) as f64
                    - (input_idx as f64) * 1e-9
                    - (chunk.chunk_index as f64) * 1e-12;
            }
        }
        AssembleStrategy::Relevance => {
            if let Some(scores) = custom_scores {
                for (chunk, score) in chunks.iter_mut().zip(scores.iter()) {
                    chunk.score = *score;
                }
            } else {
                // No caller-provided scores: fall back to keyword overlap with the query.
                let query = options.query.as_deref().unwrap_or("");
                for chunk in chunks.iter_mut() {
                    chunk.score = keyword_overlap_score(&chunk.text, query);
                }
            }
        }
        AssembleStrategy::RoundRobin => {
            // Scores only need to be a stable tie-break here; the actual
            // interleaving across artifacts happens in `pack_budget`.
            for (idx, chunk) in chunks.iter_mut().enumerate() {
                chunk.score = 1.0 - (idx as f64) * 1e-6;
            }
        }
    }
}

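/// Order chunks (by score, or interleaved per artifact for round-robin) and
/// greedily keep those that fit the token budget; the rest are returned as rejected.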
pub fn pack_budget(
    chunks: Vec<AssembledChunk>,
    options: &AssembleOptions,
) -> (Vec<AssembledChunk>, Vec<AssembledChunk>) {
    let mut sorted = chunks;
    match options.strategy {
        AssembleStrategy::RoundRobin => {
            // Group chunks by artifact in first-seen order, sort each group by
            // chunk index, then interleave one chunk per artifact per pass.
            let mut groups: Vec<Vec<AssembledChunk>> = Vec::new();
            let mut group_index: std::collections::BTreeMap<String, usize> =
                std::collections::BTreeMap::new();
            for chunk in sorted.drain(..) {
                let key = chunk.artifact_id.clone();
                let idx = match group_index.get(&key) {
                    Some(idx) => *idx,
                    None => {
                        let idx = groups.len();
                        group_index.insert(key.clone(), idx);
                        groups.push(Vec::new());
                        idx
                    }
                };
                groups[idx].push(chunk);
            }
            for group in &mut groups {
                group.sort_by_key(|chunk| chunk.chunk_index);
            }
            let mut interleaved = Vec::new();
            let max_len = groups.iter().map(Vec::len).max().unwrap_or(0);
            for i in 0..max_len {
                for group in &mut groups {
                    if i < group.len() {
                        interleaved.push(group[i].clone());
                    }
                }
            }
            sorted = interleaved;
        }
        _ => {
            sorted.sort_by(|a, b| {
                b.score
                    .partial_cmp(&a.score)
                    .unwrap_or(std::cmp::Ordering::Equal)
                    .then_with(|| a.artifact_id.cmp(&b.artifact_id))
                    .then_with(|| a.chunk_index.cmp(&b.chunk_index))
            });
        }
    }

    let mut selected = Vec::new();
    let mut rejected = Vec::new();
    let mut used = 0usize;
    for chunk in sorted {
        if used + chunk.estimated_tokens > options.budget_tokens {
            rejected.push(chunk);
            continue;
        }
        used += chunk.estimated_tokens;
        selected.push(chunk);
    }
    (selected, rejected)
}

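/// End-to-end assembly: chunk, dedup, score (optionally with caller-supplied
/// per-candidate scores), pack into the budget, and report inclusion reasons.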
pub fn assemble_context(
    artifacts: &[ArtifactRecord],
    options: &AssembleOptions,
    custom_scores: Option<&[f64]>,
) -> AssembledContext {
    let mut dropped = Vec::new();
    let candidates = build_candidate_chunks(artifacts, options, &mut dropped);
    // Custom scores are supplied per candidate (pre-dedup), so key them by
    // chunk id to survive deduplication.
    let custom_map: Option<std::collections::BTreeMap<String, f64>> = custom_scores.map(|scores| {
        candidates
            .iter()
            .zip(scores.iter().copied())
            .map(|(chunk, score)| (chunk.id.clone(), score))
            .collect()
    });
    let (mut deduped, dedup_dropped) =
        dedup_chunks(candidates, options.dedup, options.semantic_overlap);
    dropped.extend(dedup_dropped);

    if let Some(map) = custom_map.as_ref() {
        for chunk in deduped.iter_mut() {
            chunk.score = map.get(&chunk.id).copied().unwrap_or(0.0);
        }
    } else {
        score_chunks(&mut deduped, artifacts, options, None);
    }

    let (selected, rejected) = pack_budget(deduped, options);

    let mut reasons = Vec::new();
    let mut included_tokens: std::collections::BTreeMap<String, (String, usize, usize, usize)> =
        std::collections::BTreeMap::new();
    let mut total_counts: std::collections::BTreeMap<String, usize> =
        std::collections::BTreeMap::new();
    for chunk in selected.iter().chain(rejected.iter()) {
        *total_counts.entry(chunk.artifact_id.clone()).or_insert(0) += 1;
    }

    for chunk in &selected {
        reasons.push(AssembledReason {
            chunk_id: chunk.id.clone(),
            artifact_id: chunk.artifact_id.clone(),
            strategy: options.strategy.as_str(),
            score: chunk.score,
            included: true,
            reason: "selected",
        });
        let entry = included_tokens
            .entry(chunk.artifact_id.clone())
            .or_insert_with(|| {
                (
                    chunk.artifact_kind.clone(),
                    0,
                    *total_counts.get(&chunk.artifact_id).unwrap_or(&0),
                    0,
                )
            });
        entry.1 += 1;
        entry.3 += chunk.estimated_tokens;
    }
    for chunk in &rejected {
        reasons.push(AssembledReason {
            chunk_id: chunk.id.clone(),
            artifact_id: chunk.artifact_id.clone(),
            strategy: options.strategy.as_str(),
            score: chunk.score,
            included: false,
            reason: "budget_exceeded",
        });
        dropped.push(AssembledExclusion {
            artifact_id: chunk.artifact_id.clone(),
            chunk_id: Some(chunk.id.clone()),
            reason: "budget_exceeded",
            detail: None,
        });
    }

    let total_tokens = selected.iter().map(|chunk| chunk.estimated_tokens).sum();
    let included: Vec<AssembledArtifactSummary> = included_tokens
        .into_iter()
        .map(
            |(artifact_id, (kind, included, total, tokens))| AssembledArtifactSummary {
                artifact_id,
                artifact_kind: kind,
                chunks_included: included,
                chunks_total: total,
                tokens_included: tokens,
            },
        )
        .collect();

    AssembledContext {
        chunks: selected,
        included,
        dropped,
        reasons,
        total_tokens,
        budget_tokens: options.budget_tokens,
        strategy: options.strategy,
        dedup: options.dedup,
    }
}

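/// Render the selected chunks as XML-style blocks followed by a budget summary,
/// suitable for pasting directly into a prompt.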
pub fn render_assembled_chunks(assembled: &AssembledContext) -> String {
    let mut parts = Vec::with_capacity(assembled.chunks.len() + 1);
    for chunk in &assembled.chunks {
        let title = chunk
            .title
            .clone()
            .unwrap_or_else(|| format!("{} {}", chunk.artifact_kind, chunk.artifact_id));
        parts.push(format!(
            "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
<chunk_id>{}</chunk_id>\n<chunk_index>{} of {}</chunk_index>\n<body>\n{}\n</body>\n</artifact>",
            escape_xml(&title),
            escape_xml(&chunk.artifact_kind),
            escape_xml(chunk.source.as_deref().unwrap_or("unknown")),
            escape_xml(&chunk.id),
            chunk.chunk_index + 1,
            chunk.chunk_count,
            chunk.text,
        ));
    }
    parts.push(format!(
        "<context_budget>\n<used_tokens>{}</used_tokens>\n<budget_tokens>{}</budget_tokens>\n<strategy>{}</strategy>\n<dedup>{}</dedup>\n</context_budget>",
        assembled.total_tokens,
        assembled.budget_tokens,
        assembled.strategy.as_str(),
        assembled.dedup.as_str(),
    ));
    parts.join("\n\n")
}

fn escape_xml(text: &str) -> String {
    text.replace('&', "&amp;")
        .replace('<', "&lt;")
        .replace('>', "&gt;")
}

#[cfg(test)]
mod tests {
    use super::*;

    fn artifact(id: &str, text: &str) -> ArtifactRecord {
        ArtifactRecord {
            type_name: "artifact".to_string(),
            id: id.to_string(),
            kind: "resource".to_string(),
            title: Some(id.to_string()),
            text: Some(text.to_string()),
            data: None,
            source: None,
            created_at: format!("2026-04-{id:0>2}T00:00:00Z"),
            freshness: None,
            priority: Some(50),
            lineage: Vec::new(),
            relevance: None,
            estimated_tokens: None,
            stage: None,
            metadata: Default::default(),
        }
        .normalize()
    }

    #[test]
    fn chunk_ids_are_stable_and_content_addressed() {
        let a = artifact("01", "alpha bravo charlie");
        let options = AssembleOptions::default();
        let mut dropped = Vec::new();
        let first = build_candidate_chunks(&[a.clone()], &options, &mut dropped);
        let second = build_candidate_chunks(&[a], &options, &mut dropped);
        assert_eq!(first[0].id, second[0].id);
        assert!(first[0].id.starts_with("01#"));
        let different = artifact("01", "delta echo foxtrot");
        let different_chunks = build_candidate_chunks(&[different], &options, &mut dropped);
        assert_ne!(first[0].id, different_chunks[0].id);
    }

    #[test]
    fn chunked_dedup_drops_exact_duplicates() {
        let a = artifact("01", "shared body");
        let b = artifact("02", "shared body");
        let options = AssembleOptions {
            budget_tokens: 10_000,
            dedup: AssembleDedup::Chunked,
            strategy: AssembleStrategy::Recency,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"));
    }

    #[test]
    fn semantic_dedup_catches_near_duplicates() {
        let a = artifact(
            "01",
            "The parser drift issue was diagnosed by tracing token spans.",
        );
        let b = artifact(
            "02",
            "The parser drift issue, diagnosed by tracing token spans, appeared in the tokenizer.",
        );
        let options = AssembleOptions {
            dedup: AssembleDedup::Semantic,
            strategy: AssembleStrategy::Recency,
            semantic_overlap: 0.5,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"
            && d.detail
                .as_deref()
                .is_some_and(|s| s.starts_with("semantic"))));
    }

    #[test]
    fn budget_enforcement_trims_excess_chunks() {
        let text = "word ".repeat(5_000);
        let a = artifact("01", &text);
        let options = AssembleOptions {
            budget_tokens: 500,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Recency,
            microcompact_threshold: 200,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a], &options, None);
        assert!(result.total_tokens <= options.budget_tokens);
        assert!(result
            .reasons
            .iter()
            .any(|r| !r.included && r.reason == "budget_exceeded"));
    }

    #[test]
    fn relevance_strategy_prefers_query_matches() {
        let a = artifact("01", "completely unrelated content about weather");
        let b = artifact("02", "parser drift diagnostics token spans hotspot");
        let options = AssembleOptions {
            budget_tokens: 12,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Relevance,
            query: Some("parser drift diagnostics".to_string()),
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        assert_eq!(result.chunks[0].artifact_id, "02");
    }

    #[test]
    fn round_robin_interleaves_artifacts() {
        let a = artifact("01", "alpha aaaa\n\nbeta bbbb\n\ngamma ccc");
        let b = artifact("02", "delta dddd\n\nepsilon ee\n\nzeta ff");
        let options = AssembleOptions {
            budget_tokens: 10_000,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::RoundRobin,
            microcompact_threshold: 3,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        let order: Vec<&str> = result
            .chunks
            .iter()
            .map(|c| c.artifact_id.as_str())
            .collect();
        assert!(order.len() >= 4);
        assert_eq!(order[0], "01");
        assert_eq!(order[1], "02");
        assert_eq!(order[2], "01");
        assert_eq!(order[3], "02");
    }

    #[test]
    fn custom_scores_override_default_ranker() {
        let a = artifact("01", "first body content");
        let b = artifact("02", "second body content");
        let options = AssembleOptions {
            budget_tokens: 6,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Relevance,
            query: Some("first".to_string()),
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let mut dropped = Vec::new();
        let candidates = build_candidate_chunks(&[a.clone(), b.clone()], &options, &mut dropped);
        assert_eq!(candidates.len(), 2);
        let scores = vec![0.1, 0.9];
        let result = assemble_context(&[a, b], &options, Some(&scores));
        assert_eq!(result.chunks.len(), 1);
        assert_eq!(result.chunks[0].artifact_id, "02");
    }

    #[test]
    fn reasons_name_strategy_and_inclusion() {
        let a = artifact("01", "included body");
        let b = artifact("02", "dropped body because budget");
        let options = AssembleOptions {
            budget_tokens: 5,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Recency,
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert!(result.reasons.iter().any(|r| r.included));
        assert!(result.reasons.iter().any(|r| !r.included));
        for reason in &result.reasons {
            assert_eq!(reason.strategy, "recency");
        }
    }

    #[test]
    fn empty_artifact_reports_dropped() {
        let mut empty = artifact("01", "");
        empty.text = Some(String::new());
        let options = AssembleOptions::default();
        let result = assemble_context(&[empty], &options, None);
        assert!(result.chunks.is_empty());
        assert!(result
            .dropped
            .iter()
            .any(|d| d.reason == "empty_text" || d.reason == "no_text"));
    }
}