1use std::collections::BTreeSet;
14
15use sha2::{Digest, Sha256};
16
17use super::ArtifactRecord;
18use crate::stdlib::xml::escape_xml_text;
19
/// How candidate chunks are scored and ordered before budget packing.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleStrategy {
    /// Score chunks from their artifact's `created_at` value.
    Recency,
    /// Score chunks with caller-supplied scores, or by keyword overlap with
    /// the query when none are supplied.
    Relevance,
    /// Interleave chunks across artifacts, one chunk per artifact per round.
    RoundRobin,
}

impl AssembleStrategy {
    /// Every variant in declaration order. `parse` walks this table so the
    /// accepted names always match what `as_str` produces.
    const ALL: [Self; 3] = [Self::Recency, Self::Relevance, Self::RoundRobin];

    /// Parse the wire name of a strategy.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message naming the accepted values when
    /// `value` is not exactly one of them (matching is case-sensitive).
    pub fn parse(value: &str) -> Result<Self, String> {
        let reject = |other: &str| {
            format!(
                "assemble_context: strategy must be one of recency | relevance | round_robin (got {other:?})"
            )
        };
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == value)
            .ok_or_else(|| reject(value))
    }

    /// Wire name of this strategy; the inverse of [`Self::parse`].
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::RoundRobin => "round_robin",
            Self::Relevance => "relevance",
            Self::Recency => "recency",
        }
    }
}
53
/// Duplicate-removal mode applied to candidate chunks before scoring.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AssembleDedup {
    /// Keep every chunk.
    None,
    /// Drop chunks whose whitespace-normalized text equals an earlier chunk's.
    Chunked,
    /// Drop chunks whose trigram Jaccard similarity to an already-kept chunk
    /// reaches the configured overlap threshold.
    Semantic,
}

impl AssembleDedup {
    /// Every variant in declaration order. `parse` walks this table so the
    /// accepted names always match what `as_str` produces.
    const ALL: [Self; 3] = [Self::None, Self::Chunked, Self::Semantic];

    /// Parse the wire name of a dedup mode.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message naming the accepted values when
    /// `value` is not exactly one of them (matching is case-sensitive).
    pub fn parse(value: &str) -> Result<Self, String> {
        let reject = |other: &str| {
            format!(
                "assemble_context: dedup must be one of none | chunked | semantic (got {other:?})"
            )
        };
        Self::ALL
            .iter()
            .copied()
            .find(|candidate| candidate.as_str() == value)
            .ok_or_else(|| reject(value))
    }

    /// Wire name of this mode; the inverse of [`Self::parse`].
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Semantic => "semantic",
            Self::Chunked => "chunked",
            Self::None => "none",
        }
    }
}
85
/// Tunable inputs for context assembly.
#[derive(Clone, Debug)]
pub struct AssembleOptions {
    /// Upper bound on the summed `estimated_tokens` of the selected chunks.
    pub budget_tokens: usize,
    /// Duplicate-removal mode applied before scoring.
    pub dedup: AssembleDedup,
    /// Scoring / ordering strategy used when packing the budget.
    pub strategy: AssembleStrategy,
    /// Query used for keyword scoring under the relevance strategy when no
    /// custom scores are supplied; `None` is treated as an empty query.
    pub query: Option<String>,
    /// Artifacts estimated above this many tokens are split into chunks that
    /// target this many tokens each.
    pub microcompact_threshold: usize,
    /// Jaccard similarity at or above which semantic dedup treats two chunks
    /// as duplicates.
    pub semantic_overlap: f64,
}
97
98impl Default for AssembleOptions {
99 fn default() -> Self {
100 Self {
101 budget_tokens: 8_000,
102 dedup: AssembleDedup::Chunked,
103 strategy: AssembleStrategy::Relevance,
104 query: None,
105 microcompact_threshold: 2_000,
106 semantic_overlap: 0.85,
107 }
108 }
109}
110
/// One candidate piece of an artifact's text, carrying the artifact metadata
/// needed to render and explain it.
#[derive(Clone, Debug)]
pub struct AssembledChunk {
    /// Chunk identifier as produced by `stable_chunk_id`
    /// (`{artifact_id}#{hex}`).
    pub id: String,
    /// Id of the artifact this chunk was cut from.
    pub artifact_id: String,
    /// Kind of the source artifact.
    pub artifact_kind: String,
    /// Title of the source artifact, if it has one.
    pub title: Option<String>,
    /// Source of the artifact, if recorded.
    pub source: Option<String>,
    /// The chunk's text.
    pub text: String,
    /// Token estimate for `text`.
    pub estimated_tokens: usize,
    /// Zero-based position of this chunk within its artifact.
    pub chunk_index: usize,
    /// Number of chunks the artifact was split into.
    pub chunk_count: usize,
    /// Ranking score; candidates start at `0.0` and are scored later.
    pub score: f64,
}
125
/// Per-artifact roll-up of what made it into the assembled context.
#[derive(Clone, Debug)]
pub struct AssembledArtifactSummary {
    /// Id of the summarized artifact.
    pub artifact_id: String,
    /// Kind of the summarized artifact.
    pub artifact_kind: String,
    /// Number of this artifact's chunks that were selected.
    pub chunks_included: usize,
    /// Number of this artifact's chunks that reached budget packing
    /// (selected plus rejected, i.e. counted after dedup).
    pub chunks_total: usize,
    /// Sum of `estimated_tokens` over the selected chunks.
    pub tokens_included: usize,
}
135
/// Record of an artifact or chunk that was left out of the context.
#[derive(Clone, Debug)]
pub struct AssembledExclusion {
    /// Id of the affected artifact.
    pub artifact_id: String,
    /// Id of the excluded chunk; `None` when the whole artifact was skipped
    /// before chunking.
    pub chunk_id: Option<String>,
    /// Machine-readable cause. This file emits `"no_text"`, `"empty_text"`,
    /// `"duplicate"` and `"budget_exceeded"`.
    pub reason: &'static str,
    /// Optional extra detail; for duplicates, the dedup mode or the id of
    /// the chunk that was kept instead.
    pub detail: Option<String>,
}
144
/// Explanation of the packing decision made for one chunk.
#[derive(Clone, Debug)]
pub struct AssembledReason {
    /// Id of the chunk the decision applies to.
    pub chunk_id: String,
    /// Id of the chunk's artifact.
    pub artifact_id: String,
    /// Wire name of the strategy in effect (see `AssembleStrategy::as_str`).
    pub strategy: &'static str,
    /// Score the chunk had when it was packed.
    pub score: f64,
    /// `true` when the chunk was selected.
    pub included: bool,
    /// `"selected"` or `"budget_exceeded"`.
    pub reason: &'static str,
}
155
/// Result of `assemble_context`: the selected chunks plus the bookkeeping
/// that explains the selection.
#[derive(Clone, Debug)]
pub struct AssembledContext {
    /// Selected chunks, in packing order.
    pub chunks: Vec<AssembledChunk>,
    /// One summary per artifact that contributed at least one chunk.
    pub included: Vec<AssembledArtifactSummary>,
    /// Everything that was excluded, with the cause.
    pub dropped: Vec<AssembledExclusion>,
    /// Per-chunk decisions for every chunk that reached budget packing.
    pub reasons: Vec<AssembledReason>,
    /// Sum of `estimated_tokens` over `chunks`.
    pub total_tokens: usize,
    /// Budget the context was packed against.
    pub budget_tokens: usize,
    /// Strategy that was requested.
    pub strategy: AssembleStrategy,
    /// Dedup mode that was requested.
    pub dedup: AssembleDedup,
}
167
168pub fn stable_chunk_id(artifact_id: &str, text: &str) -> String {
172 let mut hasher = Sha256::new();
173 hasher.update(text.as_bytes());
174 let digest = hasher.finalize();
175 let hex = digest
176 .iter()
177 .take(8)
178 .map(|byte| format!("{byte:02x}"))
179 .collect::<String>();
180 format!("{artifact_id}#{hex}")
181}
182
/// Estimate the token count of `text` as one token per 4 bytes, rounded up.
/// Empty text estimates to zero tokens.
pub fn estimate_chunk_tokens(text: &str) -> usize {
    const BYTES_PER_TOKEN: usize = 4;
    let byte_len = text.len();
    byte_len.div_ceil(BYTES_PER_TOKEN)
}
188
/// Split `text` into pieces that target `target_tokens` tokens each, at
/// 4 bytes per token (a target of 0 is treated as 1).
///
/// Splitting prefers the coarsest boundary that fits:
/// 1. blank-line separated paragraphs are packed together, rejoined with
///    `"\n\n"`, until the next one would overflow the target;
/// 2. a paragraph that is too large on its own is split on line breaks;
/// 3. a single line that is still too large is cut at the target length,
///    moved forward to the next UTF-8 character boundary.
///
/// Empty text yields no chunks; text that already fits is returned whole.
pub fn chunk_text(text: &str, target_tokens: usize) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    let limit = target_tokens.max(1).saturating_mul(4);
    if text.len() <= limit {
        return vec![text.to_string()];
    }

    let mut chunks: Vec<String> = Vec::new();
    let mut pending = String::new();
    for paragraph in split_paragraphs(text) {
        // +2 accounts for the "\n\n" joiner between packed paragraphs.
        let would_overflow = pending.len() + paragraph.len() + 2 > limit;
        if would_overflow && !pending.is_empty() {
            chunks.push(std::mem::take(&mut pending));
        }
        if paragraph.len() > limit {
            // `pending` is empty here: a non-empty buffer plus an oversized
            // paragraph always overflows and was flushed just above.
            split_oversized_paragraph(paragraph, limit, &mut chunks);
            continue;
        }
        if !pending.is_empty() {
            pending.push_str("\n\n");
        }
        pending.push_str(paragraph);
    }
    if !pending.is_empty() {
        chunks.push(pending);
    }
    chunks
}

/// Append the pieces of a paragraph longer than `limit` bytes to `chunks`,
/// packing whole lines (newline included) and hard-cutting lines that are
/// themselves longer than `limit`.
fn split_oversized_paragraph(paragraph: &str, limit: usize, chunks: &mut Vec<String>) {
    let mut pending = String::new();
    for line in paragraph.split_inclusive('\n') {
        if !pending.is_empty() && pending.len() + line.len() > limit {
            chunks.push(std::mem::take(&mut pending));
        }
        if line.len() <= limit {
            pending.push_str(line);
            continue;
        }
        // Oversized line (`pending` was flushed above): emit fixed-size cuts.
        let mut rest = line;
        while !rest.is_empty() {
            let mut cut = limit.min(rest.len());
            // Never cut inside a multi-byte character; extend to its end.
            while !rest.is_char_boundary(cut) {
                cut += 1;
            }
            let (piece, tail) = rest.split_at(cut);
            chunks.push(piece.to_string());
            rest = tail;
        }
    }
    if !pending.is_empty() {
        chunks.push(pending);
    }
}

/// Split `text` on runs of two or more newlines, returning the non-empty
/// paragraphs with surrounding newlines trimmed. Text that consists only of
/// newlines is returned as a single unmodified entry.
fn split_paragraphs(text: &str) -> Vec<&str> {
    let mut paragraphs: Vec<&str> = text
        .split("\n\n")
        .map(|piece| piece.trim_matches('\n'))
        .filter(|piece| !piece.is_empty())
        .collect();
    if paragraphs.is_empty() && !text.is_empty() {
        paragraphs.push(text);
    }
    paragraphs
}
286
/// Collect the distinct 3-byte shingles of a normalized form of `text`.
///
/// Normalization keeps alphanumeric characters (ASCII letters lowercased),
/// maps every whitespace character to a single space byte, and drops all
/// other characters. Fewer than three normalized bytes yield an empty set.
fn trigrams(text: &str) -> BTreeSet<[u8; 3]> {
    let mut folded: Vec<u8> = Vec::with_capacity(text.len());
    for ch in text.chars() {
        if ch.is_alphanumeric() {
            // NOTE(review): `as u8` keeps only the low 8 bits of the code
            // point, so non-ASCII alphanumerics alias onto unrelated bytes
            // (e.g. U+0161 maps to b'a'). Looks like a deliberate cheap
            // fingerprint — confirm before relying on it for non-Latin text.
            folded.push(ch.to_ascii_lowercase() as u8);
        } else if ch.is_whitespace() {
            folded.push(b' ');
        }
    }
    // `windows(3)` is empty for inputs shorter than three bytes.
    folded
        .windows(3)
        .map(|shingle| [shingle[0], shingle[1], shingle[2]])
        .collect()
}
312
/// Jaccard similarity `|a ∩ b| / |a ∪ b|` of two trigram sets.
///
/// Two empty sets are defined to be identical (`1.0`).
fn jaccard(a: &BTreeSet<[u8; 3]>, b: &BTreeSet<[u8; 3]>) -> f64 {
    let shared = a.intersection(b).count();
    // Inclusion–exclusion: |a ∪ b| = |a| + |b| − |a ∩ b|.
    let combined = a.len() + b.len() - shared;
    if combined == 0 {
        return 1.0;
    }
    shared as f64 / combined as f64
}
325
/// Score how well `text` matches `query`, in `[0.0, 1.0]`.
///
/// The query is split on whitespace; terms of two bytes or fewer are
/// ignored and the rest are ASCII-lowercased and de-duplicated. A term
/// matches when it occurs anywhere in the ASCII-lowercased text. The score
/// blends term coverage (70%) with a length-damped match density (30%).
/// A query without usable terms scores `0.0`.
fn keyword_overlap_score(text: &str, query: &str) -> f64 {
    let terms: BTreeSet<String> = query
        .split_whitespace()
        .filter(|term| term.len() > 2)
        .map(str::to_ascii_lowercase)
        .collect();
    // Also covers blank queries: they produce no terms at all.
    if terms.is_empty() {
        return 0.0;
    }

    let haystack = text.to_ascii_lowercase();
    let hits = terms
        .iter()
        .filter(|term| haystack.contains(term.as_str()))
        .count() as f64;

    let coverage = hits / terms.len() as f64;
    // Longer texts need more matches to reach the same density.
    let density = (hits / (text.len() as f64 / 400.0 + 1.0)).min(1.0);
    coverage * 0.7 + density * 0.3
}
352
353pub fn build_candidate_chunks(
357 artifacts: &[ArtifactRecord],
358 options: &AssembleOptions,
359 dropped: &mut Vec<AssembledExclusion>,
360) -> Vec<AssembledChunk> {
361 let mut candidates = Vec::new();
362 for artifact in artifacts {
363 let Some(text) = artifact.text.as_ref() else {
364 dropped.push(AssembledExclusion {
365 artifact_id: artifact.id.clone(),
366 chunk_id: None,
367 reason: "no_text",
368 detail: None,
369 });
370 continue;
371 };
372 let trimmed = text.trim();
373 if trimmed.is_empty() {
374 dropped.push(AssembledExclusion {
375 artifact_id: artifact.id.clone(),
376 chunk_id: None,
377 reason: "empty_text",
378 detail: None,
379 });
380 continue;
381 }
382 let estimated = artifact
383 .estimated_tokens
384 .unwrap_or_else(|| estimate_chunk_tokens(text));
385 let pieces: Vec<String> = if estimated > options.microcompact_threshold {
386 chunk_text(text, options.microcompact_threshold)
387 } else {
388 vec![text.clone()]
389 };
390 let count = pieces.len();
391 for (idx, piece) in pieces.into_iter().enumerate() {
392 let id = stable_chunk_id(&artifact.id, &piece);
393 let tokens = estimate_chunk_tokens(&piece);
394 candidates.push(AssembledChunk {
395 id,
396 artifact_id: artifact.id.clone(),
397 artifact_kind: artifact.kind.clone(),
398 title: artifact.title.clone(),
399 source: artifact.source.clone(),
400 text: piece,
401 estimated_tokens: tokens,
402 chunk_index: idx,
403 chunk_count: count,
404 score: 0.0,
405 });
406 }
407 }
408 candidates
409}
410
411pub fn dedup_chunks(
415 mut chunks: Vec<AssembledChunk>,
416 mode: AssembleDedup,
417 semantic_overlap: f64,
418) -> (Vec<AssembledChunk>, Vec<AssembledExclusion>) {
419 let mut dropped = Vec::new();
420 match mode {
421 AssembleDedup::None => (chunks, dropped),
422 AssembleDedup::Chunked => {
423 let mut seen: BTreeSet<String> = BTreeSet::new();
424 chunks.retain(|chunk| {
425 let key = normalized_text_key(&chunk.text);
426 if seen.insert(key) {
427 true
428 } else {
429 dropped.push(AssembledExclusion {
430 artifact_id: chunk.artifact_id.clone(),
431 chunk_id: Some(chunk.id.clone()),
432 reason: "duplicate",
433 detail: Some("chunked".to_string()),
434 });
435 false
436 }
437 });
438 (chunks, dropped)
439 }
440 AssembleDedup::Semantic => {
441 let mut kept: Vec<(AssembledChunk, BTreeSet<[u8; 3]>)> = Vec::new();
442 for chunk in chunks.drain(..) {
443 let trigrams_new = trigrams(&chunk.text);
444 let mut duplicate = false;
445 for (existing, existing_trigrams) in &kept {
446 if jaccard(&trigrams_new, existing_trigrams) >= semantic_overlap {
447 dropped.push(AssembledExclusion {
448 artifact_id: chunk.artifact_id.clone(),
449 chunk_id: Some(chunk.id.clone()),
450 reason: "duplicate",
451 detail: Some(format!("semantic≈{}", existing.id)),
452 });
453 duplicate = true;
454 break;
455 }
456 }
457 if !duplicate {
458 kept.push((chunk, trigrams_new));
459 }
460 }
461 (kept.into_iter().map(|(chunk, _)| chunk).collect(), dropped)
462 }
463 }
464}
465
/// Collapse every run of whitespace in `text` to a single space and strip
/// leading/trailing whitespace, producing the key used for exact dedup.
fn normalized_text_key(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        // Words are never empty, so an empty key means this is the first one.
        if !key.is_empty() {
            key.push(' ');
        }
        key.push_str(word);
    }
    key
}
469
470pub fn score_chunks(
475 chunks: &mut [AssembledChunk],
476 artifacts: &[ArtifactRecord],
477 options: &AssembleOptions,
478 custom_scores: Option<&[f64]>,
479) {
480 match options.strategy {
481 AssembleStrategy::Recency => {
482 let order: std::collections::BTreeMap<&str, (String, usize)> = artifacts
484 .iter()
485 .enumerate()
486 .map(|(idx, artifact)| (artifact.id.as_str(), (artifact.created_at.clone(), idx)))
487 .collect();
488 for chunk in chunks.iter_mut() {
489 let (created_at, input_idx) = order
490 .get(chunk.artifact_id.as_str())
491 .cloned()
492 .unwrap_or_else(|| (String::new(), 0));
493 let recency_rank = created_at
497 .chars()
498 .fold(0u64, |acc, c| acc.wrapping_mul(131).wrapping_add(c as u64));
499 chunk.score = recency_rank as f64 / u64::MAX as f64
500 - (input_idx as f64) * 1e-9
501 - (chunk.chunk_index as f64) * 1e-12;
502 }
503 }
504 AssembleStrategy::Relevance => {
505 if let Some(scores) = custom_scores {
506 for (chunk, score) in chunks.iter_mut().zip(scores.iter()) {
507 chunk.score = *score;
508 }
509 } else {
511 let query = options.query.as_deref().unwrap_or("");
512 for chunk in chunks.iter_mut() {
513 chunk.score = keyword_overlap_score(&chunk.text, query);
514 }
515 }
516 }
517 AssembleStrategy::RoundRobin => {
518 for (idx, chunk) in chunks.iter_mut().enumerate() {
521 chunk.score = 1.0 - (idx as f64) * 1e-6;
522 }
523 }
524 }
525}
526
527pub fn pack_budget(
529 chunks: Vec<AssembledChunk>,
530 options: &AssembleOptions,
531) -> (Vec<AssembledChunk>, Vec<AssembledChunk>) {
532 let mut sorted = chunks;
533 match options.strategy {
534 AssembleStrategy::RoundRobin => {
535 let mut groups: Vec<Vec<AssembledChunk>> = Vec::new();
537 let mut group_index: std::collections::BTreeMap<String, usize> =
538 std::collections::BTreeMap::new();
539 for chunk in sorted.drain(..) {
541 let key = chunk.artifact_id.clone();
542 let idx = match group_index.get(&key) {
543 Some(idx) => *idx,
544 None => {
545 let idx = groups.len();
546 group_index.insert(key.clone(), idx);
547 groups.push(Vec::new());
548 idx
549 }
550 };
551 groups[idx].push(chunk);
552 }
553 for group in &mut groups {
555 group.sort_by_key(|chunk| chunk.chunk_index);
556 }
557 let mut interleaved = Vec::new();
558 let max_len = groups.iter().map(Vec::len).max().unwrap_or(0);
559 for i in 0..max_len {
560 for group in &mut groups {
561 if i < group.len() {
562 interleaved.push(group[i].clone());
563 }
564 }
565 }
566 sorted = interleaved;
567 }
568 _ => {
569 sorted.sort_by(|a, b| {
570 b.score
571 .partial_cmp(&a.score)
572 .unwrap_or(std::cmp::Ordering::Equal)
573 .then_with(|| a.artifact_id.cmp(&b.artifact_id))
574 .then_with(|| a.chunk_index.cmp(&b.chunk_index))
575 });
576 }
577 }
578
579 let mut selected = Vec::new();
580 let mut rejected = Vec::new();
581 let mut used = 0usize;
582 for chunk in sorted {
583 if used + chunk.estimated_tokens > options.budget_tokens {
584 rejected.push(chunk);
585 continue;
586 }
587 used += chunk.estimated_tokens;
588 selected.push(chunk);
589 }
590 (selected, rejected)
591}
592
593pub fn assemble_context(
596 artifacts: &[ArtifactRecord],
597 options: &AssembleOptions,
598 custom_scores: Option<&[f64]>,
599) -> AssembledContext {
600 let mut dropped = Vec::new();
601 let candidates = build_candidate_chunks(artifacts, options, &mut dropped);
602 let custom_map: Option<std::collections::BTreeMap<String, f64>> = custom_scores.map(|scores| {
606 candidates
607 .iter()
608 .zip(scores.iter().copied())
609 .map(|(chunk, score)| (chunk.id.clone(), score))
610 .collect()
611 });
612 let (mut deduped, dedup_dropped) =
613 dedup_chunks(candidates, options.dedup, options.semantic_overlap);
614 dropped.extend(dedup_dropped);
615
616 if let Some(map) = custom_map.as_ref() {
617 for chunk in deduped.iter_mut() {
618 chunk.score = map.get(&chunk.id).copied().unwrap_or(0.0);
619 }
620 } else {
621 score_chunks(&mut deduped, artifacts, options, None);
622 }
623
624 let (selected, rejected) = pack_budget(deduped, options);
625
626 let mut reasons = Vec::new();
627 let mut included_tokens: std::collections::BTreeMap<String, (String, usize, usize, usize)> =
628 std::collections::BTreeMap::new();
629 let mut total_counts: std::collections::BTreeMap<String, usize> =
631 std::collections::BTreeMap::new();
632 for chunk in selected.iter().chain(rejected.iter()) {
633 *total_counts.entry(chunk.artifact_id.clone()).or_insert(0) += 1;
634 }
635
636 for chunk in &selected {
637 reasons.push(AssembledReason {
638 chunk_id: chunk.id.clone(),
639 artifact_id: chunk.artifact_id.clone(),
640 strategy: options.strategy.as_str(),
641 score: chunk.score,
642 included: true,
643 reason: "selected",
644 });
645 let entry = included_tokens
646 .entry(chunk.artifact_id.clone())
647 .or_insert_with(|| {
648 (
649 chunk.artifact_kind.clone(),
650 0,
651 *total_counts.get(&chunk.artifact_id).unwrap_or(&0),
652 0,
653 )
654 });
655 entry.1 += 1;
656 entry.3 += chunk.estimated_tokens;
657 }
658 for chunk in &rejected {
659 reasons.push(AssembledReason {
660 chunk_id: chunk.id.clone(),
661 artifact_id: chunk.artifact_id.clone(),
662 strategy: options.strategy.as_str(),
663 score: chunk.score,
664 included: false,
665 reason: "budget_exceeded",
666 });
667 dropped.push(AssembledExclusion {
668 artifact_id: chunk.artifact_id.clone(),
669 chunk_id: Some(chunk.id.clone()),
670 reason: "budget_exceeded",
671 detail: None,
672 });
673 }
674
675 let total_tokens = selected.iter().map(|chunk| chunk.estimated_tokens).sum();
676 let included: Vec<AssembledArtifactSummary> = included_tokens
677 .into_iter()
678 .map(
679 |(artifact_id, (kind, included, total, tokens))| AssembledArtifactSummary {
680 artifact_id,
681 artifact_kind: kind,
682 chunks_included: included,
683 chunks_total: total,
684 tokens_included: tokens,
685 },
686 )
687 .collect();
688
689 AssembledContext {
690 chunks: selected,
691 included,
692 dropped,
693 reasons,
694 total_tokens,
695 budget_tokens: options.budget_tokens,
696 strategy: options.strategy,
697 dedup: options.dedup,
698 }
699}
700
701pub fn render_assembled_chunks(assembled: &AssembledContext) -> String {
707 let mut parts = Vec::with_capacity(assembled.chunks.len() + 1);
708 for chunk in &assembled.chunks {
709 let title = chunk
710 .title
711 .clone()
712 .unwrap_or_else(|| format!("{} {}", chunk.artifact_kind, chunk.artifact_id));
713 parts.push(format!(
714 "<artifact>\n<title>{}</title>\n<kind>{}</kind>\n<source>{}</source>\n\
715<chunk_id>{}</chunk_id>\n<chunk_index>{} of {}</chunk_index>\n<body>\n{}\n</body>\n</artifact>",
716 escape_xml_text(&title),
717 escape_xml_text(&chunk.artifact_kind),
718 escape_xml_text(chunk.source.as_deref().unwrap_or("unknown")),
719 escape_xml_text(&chunk.id),
720 chunk.chunk_index + 1,
721 chunk.chunk_count,
722 chunk.text,
723 ));
724 }
725 parts.push(format!(
726 "<context_budget>\n<used_tokens>{}</used_tokens>\n<budget_tokens>{}</budget_tokens>\n<strategy>{}</strategy>\n<dedup>{}</dedup>\n</context_budget>",
727 assembled.total_tokens,
728 assembled.budget_tokens,
729 assembled.strategy.as_str(),
730 assembled.dedup.as_str(),
731 ));
732 parts.join("\n\n")
733}
734
// Unit tests for the assembly pipeline. Budgets below are tuned to the
// 4-bytes-per-token estimate, so changing a fixture text usually means
// re-deriving the budget next to it.
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a text artifact whose id doubles as its title and as the day of
    /// its April 2026 `created_at`, so a larger id is a later timestamp.
    fn artifact(id: &str, text: &str) -> ArtifactRecord {
        ArtifactRecord {
            type_name: "artifact".to_string(),
            id: id.to_string(),
            kind: "resource".to_string(),
            title: Some(id.to_string()),
            text: Some(text.to_string()),
            data: None,
            source: None,
            created_at: format!("2026-04-{id:0>2}T00:00:00Z"),
            freshness: None,
            priority: Some(50),
            lineage: Vec::new(),
            relevance: None,
            estimated_tokens: None,
            stage: None,
            metadata: Default::default(),
        }
        .normalize()
    }

    /// Same artifact id + same text gives the same chunk id; different text
    /// under the same artifact id gives a different one.
    #[test]
    fn chunk_ids_are_stable_and_content_addressed() {
        let a = artifact("01", "alpha bravo charlie");
        let options = AssembleOptions::default();
        let mut dropped = Vec::new();
        let first = build_candidate_chunks(&[a.clone()], &options, &mut dropped);
        let second = build_candidate_chunks(&[a], &options, &mut dropped);
        assert_eq!(first[0].id, second[0].id);
        assert!(first[0].id.starts_with("01#"));
        // Same artifact id, different body.
        let different = artifact("01", "delta echo foxtrot");
        let different_chunks = build_candidate_chunks(&[different], &options, &mut dropped);
        assert_ne!(first[0].id, different_chunks[0].id);
    }

    /// Two artifacts with identical bodies collapse to one chunk under
    /// chunked dedup, and the loss is reported.
    #[test]
    fn chunked_dedup_drops_exact_duplicates() {
        let a = artifact("01", "shared body");
        let b = artifact("02", "shared body");
        let options = AssembleOptions {
            budget_tokens: 10_000,
            dedup: AssembleDedup::Chunked,
            strategy: AssembleStrategy::Recency,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"));
    }

    /// Reworded text that shares most trigrams is dropped under semantic
    /// dedup, and the exclusion detail names the semantic match.
    #[test]
    fn semantic_dedup_catches_near_duplicates() {
        let a = artifact(
            "01",
            "The parser drift issue was diagnosed by tracing token spans.",
        );
        let b = artifact(
            "02",
            "The parser drift issue, diagnosed by tracing token spans, appeared in the tokenizer.",
        );
        let options = AssembleOptions {
            dedup: AssembleDedup::Semantic,
            strategy: AssembleStrategy::Recency,
            // Lowered from the default so the reworded sentence qualifies.
            semantic_overlap: 0.5,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        // The detail string starts with "semantic" for this dedup mode.
        assert!(result.dropped.iter().any(|d| d.reason == "duplicate"
            && d.detail
                .as_deref()
                .is_some_and(|s| s.starts_with("semantic"))));
    }

    /// An artifact far larger than the budget is chunked, and the chunks
    /// that do not fit are reported as budget rejections.
    #[test]
    fn budget_enforcement_trims_excess_chunks() {
        // 25 000 bytes of text: well above both the chunking threshold and
        // the budget below.
        let text = "word ".repeat(5_000); let a = artifact("01", &text);
        let options = AssembleOptions {
            budget_tokens: 500,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Recency,
            microcompact_threshold: 200,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a], &options, None);
        assert!(result.total_tokens <= options.budget_tokens);
        assert!(result
            .reasons
            .iter()
            .any(|r| !r.included && r.reason == "budget_exceeded"));
    }

    /// With room for only one chunk, the one matching the query wins.
    #[test]
    fn relevance_strategy_prefers_query_matches() {
        let a = artifact("01", "completely unrelated content about weather");
        let b = artifact("02", "parser drift diagnostics token spans hotspot");
        let options = AssembleOptions {
            // Both bodies are 42-44 bytes (11 estimated tokens, assuming
            // `normalize()` leaves the text as-is), so only one fits.
            budget_tokens: 12,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Relevance,
            query: Some("parser drift diagnostics".to_string()),
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert_eq!(result.chunks.len(), 1);
        assert_eq!(result.chunks[0].artifact_id, "02");
    }

    /// Chunks alternate between the two artifacts under round-robin.
    #[test]
    fn round_robin_interleaves_artifacts() {
        // A 3-token threshold is a 12-byte chunk target, so each paragraph
        // becomes its own chunk.
        let a = artifact("01", "alpha aaaa\n\nbeta bbbb\n\ngamma ccc");
        let b = artifact("02", "delta dddd\n\nepsilon ee\n\nzeta ff");
        let options = AssembleOptions {
            budget_tokens: 10_000,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::RoundRobin,
            microcompact_threshold: 3,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        let order: Vec<&str> = result
            .chunks
            .iter()
            .map(|c| c.artifact_id.as_str())
            .collect();
        assert!(order.len() >= 4);
        // Expected interleaving: 01, 02, 01, 02, ...
        assert_eq!(order[0], "01");
        assert_eq!(order[1], "02");
        assert_eq!(order[2], "01");
        assert_eq!(order[3], "02");
    }

    /// Caller-supplied scores take precedence over keyword scoring: the
    /// query favours "01" but the custom scores favour "02".
    #[test]
    fn custom_scores_override_default_ranker() {
        let a = artifact("01", "first body content");
        let b = artifact("02", "second body content");
        let options = AssembleOptions {
            // Each body is 18-19 bytes (5 estimated tokens, assuming
            // `normalize()` leaves the text as-is), so only one fits.
            budget_tokens: 6,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Relevance,
            query: Some("first".to_string()),
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let mut dropped = Vec::new();
        let candidates = build_candidate_chunks(&[a.clone(), b.clone()], &options, &mut dropped);
        assert_eq!(candidates.len(), 2);
        // Scores are positional over the candidate chunks: one per artifact.
        let scores = vec![0.1, 0.9];
        let result = assemble_context(&[a, b], &options, Some(&scores));
        assert_eq!(result.chunks.len(), 1);
        assert_eq!(result.chunks[0].artifact_id, "02");
    }

    /// Every reason carries the strategy name, and both included and
    /// excluded chunks are reported.
    #[test]
    fn reasons_name_strategy_and_inclusion() {
        let a = artifact("01", "included body");
        let b = artifact("02", "dropped body because budget");
        let options = AssembleOptions {
            // Fits the 13-byte body (4 tokens) but not the 27-byte one (7).
            budget_tokens: 5,
            dedup: AssembleDedup::None,
            strategy: AssembleStrategy::Recency,
            microcompact_threshold: 10_000,
            ..AssembleOptions::default()
        };
        let result = assemble_context(&[a, b], &options, None);
        assert!(result.reasons.iter().any(|r| r.included));
        assert!(result.reasons.iter().any(|r| !r.included));
        for reason in &result.reasons {
            assert_eq!(reason.strategy, "recency");
        }
    }

    /// An artifact with empty text yields no chunks and is reported as
    /// dropped.
    #[test]
    fn empty_artifact_reports_dropped() {
        let mut empty = artifact("01", "");
        // Set the text again after `normalize()` so it is `Some("")`
        // whatever normalization does with empty text.
        empty.text = Some(String::new());
        let options = AssembleOptions::default();
        let result = assemble_context(&[empty], &options, None);
        assert!(result.chunks.is_empty());
        assert!(result
            .dropped
            .iter()
            .any(|d| d.reason == "empty_text" || d.reason == "no_text"));
    }
}