1use std::collections::BTreeSet;
10use std::fmt;
11
12use serde::{Deserialize, Deserializer, Serialize, Serializer};
13use sha2::{Digest, Sha256};
14
15use super::atom::{Atom, AtomError, AtomId};
16
/// Byte length of an `IntentId`; `derive_intent_id` fills it from a SHA-256 digest.
const INTENT_ID_BYTES: usize = 32;
/// Value `IntentClusterOptions::default()` uses for `max_event_gap`.
const DEFAULT_MAX_EVENT_GAP: u64 = 8;
/// Confidence every new cluster starts with and that gap-based merges carry.
const DEFAULT_CONFIDENCE: f32 = 0.75;
/// Confidence reported for a merge approved by an `IntentBoundaryClassifier`.
const SEMANTIC_CONFIDENCE: f32 = 0.9;
21
/// Fixed-size identifier of an intent.
///
/// Wraps `INTENT_ID_BYTES` raw bytes. `Intent::new` fills it through
/// `derive_intent_id`; the serde, `Debug` and `Display` impls in this file
/// all render it as a hex string.
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct IntentId(pub [u8; INTENT_ID_BYTES]);
25
26impl IntentId {
27 pub fn to_hex(&self) -> String {
29 hex::encode(self.0)
30 }
31
32 pub fn from_hex(raw: &str) -> Result<Self, IntentError> {
34 let bytes = hex::decode(raw)
35 .map_err(|error| IntentError::Invalid(format!("invalid IntentId hex: {error}")))?;
36 if bytes.len() != INTENT_ID_BYTES {
37 return Err(IntentError::Invalid(format!(
38 "IntentId must be {INTENT_ID_BYTES} bytes, got {}",
39 bytes.len()
40 )));
41 }
42 let mut out = [0u8; INTENT_ID_BYTES];
43 out.copy_from_slice(&bytes);
44 Ok(Self(out))
45 }
46}
47
48impl fmt::Debug for IntentId {
49 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
50 write!(f, "IntentId({})", self.to_hex())
51 }
52}
53
54impl fmt::Display for IntentId {
55 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
56 write!(f, "{}", self.to_hex())
57 }
58}
59
60impl Serialize for IntentId {
61 fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
62 serializer.serialize_str(&self.to_hex())
63 }
64}
65
66impl<'de> Deserialize<'de> for IntentId {
67 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
68 let raw = String::deserialize(deserializer)?;
69 IntentId::from_hex(&raw).map_err(serde::de::Error::custom)
70 }
71}
72
/// Errors produced while constructing or parsing intent data.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so callers and
/// tests can compare errors directly instead of matching on formatted text.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum IntentError {
    /// The supplied data failed validation; the payload describes why.
    Invalid(String),
}
79
80impl fmt::Display for IntentError {
81 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82 match self {
83 IntentError::Invalid(message) => write!(f, "intent invalid: {message}"),
84 }
85 }
86}
87
// Marker impl: relies on the `Debug` and `Display` impls above and keeps the
// trait's default methods.
impl std::error::Error for IntentError {}
89
/// A range of events within one transcript.
///
/// `TranscriptSpan::new` rejects spans whose end precedes their start. The
/// fields are public and the type derives `Deserialize`, so values built by
/// other means are not checked here.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct TranscriptSpan {
    /// Reference to the transcript the events belong to.
    pub transcript_ref: String,
    /// Index of the first event covered by the span.
    pub start_event_index: u64,
    /// Index of the last event covered by the span; a single-event span has
    /// `end_event_index == start_event_index`.
    pub end_event_index: u64,
}
101
102impl TranscriptSpan {
103 pub fn new(
105 transcript_ref: impl Into<String>,
106 start_event_index: u64,
107 end_event_index: u64,
108 ) -> Result<Self, IntentError> {
109 if end_event_index < start_event_index {
110 return Err(IntentError::Invalid(format!(
111 "end_event_index {end_event_index} precedes start_event_index {start_event_index}"
112 )));
113 }
114 Ok(Self {
115 transcript_ref: transcript_ref.into(),
116 start_event_index,
117 end_event_index,
118 })
119 }
120
121 fn extend_to(&mut self, event_index: u64) {
122 self.start_event_index = self.start_event_index.min(event_index);
123 self.end_event_index = self.end_event_index.max(event_index);
124 }
125}
126
/// A group of atoms attributed to one goal.
///
/// `Intent::new` checks the atom list and confidence and derives `id`. The
/// fields are public and the type derives `Deserialize`, so values obtained
/// by other means are not re-validated here.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Intent {
    /// Identifier derived from the atoms, goal description and origin span.
    pub id: IntentId,
    /// Ids of the atoms that belong to this intent.
    pub atoms: Vec<AtomId>,
    /// Description of the goal the atoms work toward.
    pub goal_description: String,
    /// Transcript events the intent was observed in.
    pub origin_transcript_span: TranscriptSpan,
    /// Confidence in the grouping; `Intent::new` requires a finite value in
    /// `0.0..=1.0`.
    pub confidence: f32,
}
136
137impl Intent {
138 pub fn new(
140 atoms: Vec<AtomId>,
141 goal_description: impl Into<String>,
142 origin_transcript_span: TranscriptSpan,
143 confidence: f32,
144 ) -> Result<Self, IntentError> {
145 if atoms.is_empty() {
146 return Err(IntentError::Invalid(
147 "intent must contain at least one atom".to_string(),
148 ));
149 }
150 let goal_description = goal_description.into();
151 let confidence = normalize_confidence(confidence)?;
152 let id = derive_intent_id(&atoms, &goal_description, &origin_transcript_span);
153 Ok(Self {
154 id,
155 atoms,
156 goal_description,
157 origin_transcript_span,
158 confidence,
159 })
160 }
161
162 pub fn seal(&self) -> SealedIntent {
166 SealedIntent {
167 id: self.id,
168 atoms: self.atoms.clone(),
169 goal_description: self.goal_description.clone(),
170 origin_transcript_span: self.origin_transcript_span.clone(),
171 confidence: self.confidence,
172 }
173 }
174}
175
/// Copy of an [`Intent`]'s fields as produced by `Intent::seal`.
///
/// Field meanings mirror those of [`Intent`].
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct SealedIntent {
    /// Id of the intent this value was sealed from.
    pub id: IntentId,
    /// Atom ids at the time of sealing.
    pub atoms: Vec<AtomId>,
    /// Goal description at the time of sealing.
    pub goal_description: String,
    /// Origin span at the time of sealing.
    pub origin_transcript_span: TranscriptSpan,
    /// Confidence at the time of sealing.
    pub confidence: f32,
}
185
/// An atom together with the transcript position it was observed at; the
/// input unit of [`IntentClusterer`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct ObservedAtom {
    /// Id of the observed atom.
    pub atom_id: AtomId,
    /// Agent run the atom belongs to; atoms from different runs never share an intent.
    pub agent_run_id: String,
    /// Transcript the atom was observed in; atoms from different transcripts
    /// never share an intent.
    pub transcript_ref: String,
    /// Position of the observation within the transcript.
    pub transcript_event_index: u64,
    /// Tool call that produced the atom, if any. Omitted from serialized
    /// output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
    /// Optional goal text used when describing the resulting intent. Omitted
    /// from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub goal_description: Option<String>,
}
199
200impl ObservedAtom {
201 pub fn from_atom(atom: &Atom, transcript_event_index: u64) -> Self {
203 Self {
204 atom_id: atom.id,
205 agent_run_id: atom.provenance.agent_run_id.clone(),
206 transcript_ref: atom.provenance.transcript_ref.clone(),
207 transcript_event_index,
208 tool_call_id: atom.provenance.tool_call_id.clone(),
209 goal_description: None,
210 }
211 }
212
213 pub fn with_goal_description(mut self, goal_description: impl Into<String>) -> Self {
216 self.goal_description = Some(goal_description.into());
217 self
218 }
219}
220
/// Tuning knobs for [`IntentClusterer`].
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct IntentClusterOptions {
    /// Largest difference between consecutive event indices (same run and
    /// transcript) that is merged without consulting a classifier.
    pub max_event_gap: u64,
    /// Maximum number of classifier consultations per clustering call. Once
    /// it is used up, larger gaps always split.
    pub semantic_boundary_budget: usize,
}
230
231impl Default for IntentClusterOptions {
232 fn default() -> Self {
233 Self {
234 max_event_gap: DEFAULT_MAX_EVENT_GAP,
235 semantic_boundary_budget: 0,
236 }
237 }
238}
239
/// Groups [`ObservedAtom`]s into [`Intent`]s according to its
/// [`IntentClusterOptions`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct IntentClusterer {
    /// Options applied to every clustering call.
    options: IntentClusterOptions,
}
245
246impl IntentClusterer {
247 pub fn new(options: IntentClusterOptions) -> Self {
248 Self { options }
249 }
250
251 pub fn options(&self) -> &IntentClusterOptions {
252 &self.options
253 }
254
255 pub fn cluster<I>(&self, observations: I) -> Vec<Intent>
257 where
258 I: IntoIterator<Item = ObservedAtom>,
259 {
260 self.cluster_internal(observations, None)
261 }
262
263 pub fn cluster_with_classifier<I, C>(&self, observations: I, classifier: &mut C) -> Vec<Intent>
266 where
267 I: IntoIterator<Item = ObservedAtom>,
268 C: IntentBoundaryClassifier,
269 {
270 self.cluster_internal(
271 observations,
272 Some(classifier as &mut (dyn IntentBoundaryClassifier + '_)),
273 )
274 }
275
276 fn cluster_internal<I>(
277 &self,
278 observations: I,
279 mut classifier: Option<&mut (dyn IntentBoundaryClassifier + '_)>,
280 ) -> Vec<Intent>
281 where
282 I: IntoIterator<Item = ObservedAtom>,
283 {
284 let mut observations: Vec<ObservedAtom> = observations.into_iter().collect();
285 observations.sort_by(|left, right| {
286 left.transcript_ref
287 .cmp(&right.transcript_ref)
288 .then_with(|| left.agent_run_id.cmp(&right.agent_run_id))
289 .then_with(|| {
290 left.transcript_event_index
291 .cmp(&right.transcript_event_index)
292 })
293 .then_with(|| left.atom_id.0.cmp(&right.atom_id.0))
294 });
295
296 let mut builder: Option<IntentBuilder> = None;
297 let mut intents = Vec::new();
298 let mut semantic_budget_remaining = self.options.semantic_boundary_budget;
299
300 for observation in observations {
301 if builder.is_none() {
302 builder = Some(IntentBuilder::new(observation));
303 continue;
304 }
305
306 let decision = {
307 let active = builder.as_ref().expect("active builder has observations");
308 let previous = active.last().expect("active builder has observations");
309 self.boundary_decision(
310 previous,
311 &observation,
312 classifier.as_deref_mut(),
313 &mut semantic_budget_remaining,
314 )
315 };
316
317 match decision {
318 BoundaryDecision::Merge { confidence } => builder
319 .as_mut()
320 .expect("active builder has observations")
321 .push(observation, confidence),
322 BoundaryDecision::Split => {
323 intents.push(
324 builder
325 .take()
326 .expect("active builder has observations")
327 .finish(),
328 );
329 builder = Some(IntentBuilder::new(observation));
330 }
331 }
332 }
333
334 if let Some(active) = builder {
335 intents.push(active.finish());
336 }
337
338 intents
339 }
340
341 fn boundary_decision(
342 &self,
343 previous: &ObservedAtom,
344 next: &ObservedAtom,
345 classifier: Option<&mut (dyn IntentBoundaryClassifier + '_)>,
346 semantic_budget_remaining: &mut usize,
347 ) -> BoundaryDecision {
348 if previous.agent_run_id != next.agent_run_id
349 || previous.transcript_ref != next.transcript_ref
350 {
351 return BoundaryDecision::Split;
352 }
353
354 let gap = next
355 .transcript_event_index
356 .saturating_sub(previous.transcript_event_index);
357 if gap <= self.options.max_event_gap {
358 return BoundaryDecision::Merge {
359 confidence: DEFAULT_CONFIDENCE,
360 };
361 }
362
363 let Some(classifier) = classifier else {
364 return BoundaryDecision::Split;
365 };
366 if *semantic_budget_remaining == 0 {
367 return BoundaryDecision::Split;
368 }
369
370 *semantic_budget_remaining -= 1;
371 let dispute = IntentBoundaryDispute {
372 previous,
373 next,
374 gap,
375 };
376 match classifier.classify(&dispute) {
377 IntentBoundaryDecision::Merge => BoundaryDecision::Merge {
378 confidence: SEMANTIC_CONFIDENCE,
379 },
380 IntentBoundaryDecision::Split => BoundaryDecision::Split,
381 }
382 }
383}
384
385impl Default for IntentClusterer {
386 fn default() -> Self {
387 Self::new(IntentClusterOptions::default())
388 }
389}
390
/// A boundary the gap rule could not settle, passed to an
/// [`IntentBoundaryClassifier`].
#[derive(Clone, Copy, Debug)]
pub struct IntentBoundaryDispute<'a> {
    /// Last observation of the cluster that is currently open.
    pub previous: &'a ObservedAtom,
    /// Observation that would either join that cluster or start a new one.
    pub next: &'a ObservedAtom,
    /// Event-index distance between `previous` and `next` (saturating
    /// subtraction); it exceeded `max_event_gap`.
    pub gap: u64,
}
398
/// Answer returned by an [`IntentBoundaryClassifier`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IntentBoundaryDecision {
    /// Keep `next` in the same intent as `previous`.
    Merge,
    /// Start a new intent at `next`.
    Split,
}
405
/// Decides boundary disputes the event-gap rule cannot settle.
///
/// The clusterer calls it only for observations that share an agent run and
/// transcript and whose gap exceeds `max_event_gap`, and only while semantic
/// budget remains.
pub trait IntentBoundaryClassifier {
    /// Returns whether the two observations in `dispute` belong to the same intent.
    fn classify(&mut self, dispute: &IntentBoundaryDispute<'_>) -> IntentBoundaryDecision;
}
410
/// Internal outcome of `IntentClusterer::boundary_decision`.
enum BoundaryDecision {
    /// Add the observation to the open cluster, with the confidence attached
    /// to this particular merge.
    Merge { confidence: f32 },
    /// Close the open cluster and start a new one.
    Split,
}
415
/// Accumulates the observations of one cluster until it is turned into an
/// [`Intent`].
struct IntentBuilder {
    /// Observations collected so far; never empty, because `new` takes the first one.
    observations: Vec<ObservedAtom>,
    /// Event range covered by `observations`.
    span: TranscriptSpan,
    /// Lowest confidence seen so far, starting at `DEFAULT_CONFIDENCE`.
    confidence: f32,
}
421
422impl IntentBuilder {
423 fn new(observation: ObservedAtom) -> Self {
424 let span = TranscriptSpan {
425 transcript_ref: observation.transcript_ref.clone(),
426 start_event_index: observation.transcript_event_index,
427 end_event_index: observation.transcript_event_index,
428 };
429 Self {
430 observations: vec![observation],
431 span,
432 confidence: DEFAULT_CONFIDENCE,
433 }
434 }
435
436 fn last(&self) -> Option<&ObservedAtom> {
437 self.observations.last()
438 }
439
440 fn push(&mut self, observation: ObservedAtom, confidence: f32) {
441 self.span.extend_to(observation.transcript_event_index);
442 self.confidence = self.confidence.min(confidence);
443 self.observations.push(observation);
444 }
445
446 fn finish(self) -> Intent {
447 let atoms: Vec<AtomId> = self
448 .observations
449 .iter()
450 .map(|observation| observation.atom_id)
451 .collect();
452 Intent::new(
453 atoms,
454 goal_description(&self.observations, &self.span),
455 self.span,
456 self.confidence,
457 )
458 .expect("builder always contains at least one observation with valid confidence")
459 }
460}
461
462fn goal_description(observations: &[ObservedAtom], span: &TranscriptSpan) -> String {
463 let mut goals = BTreeSet::new();
464 for observation in observations {
465 if let Some(goal) = observation
466 .goal_description
467 .as_deref()
468 .map(str::trim)
469 .filter(|goal| !goal.is_empty())
470 {
471 goals.insert(goal.to_string());
472 }
473 }
474
475 if !goals.is_empty() {
476 return goals.into_iter().collect::<Vec<_>>().join("; ");
477 }
478
479 let first = observations
480 .first()
481 .expect("goal_description requires observations");
482 let tool_calls: BTreeSet<&str> = observations
483 .iter()
484 .filter_map(|observation| observation.tool_call_id.as_deref())
485 .collect();
486 if tool_calls.len() == 1 {
487 return format!(
488 "tool call {} in {} events {}..{}",
489 tool_calls.iter().next().unwrap(),
490 span.transcript_ref,
491 span.start_event_index,
492 span.end_event_index
493 );
494 }
495
496 format!(
497 "agent run {} in {} events {}..{}",
498 first.agent_run_id, span.transcript_ref, span.start_event_index, span.end_event_index
499 )
500}
501
502fn derive_intent_id(
503 atoms: &[AtomId],
504 goal_description: &str,
505 origin_transcript_span: &TranscriptSpan,
506) -> IntentId {
507 let mut hasher = Sha256::new();
508 hasher.update(b"FINT");
509 hasher.update(origin_transcript_span.transcript_ref.as_bytes());
510 hasher.update(origin_transcript_span.start_event_index.to_le_bytes());
511 hasher.update(origin_transcript_span.end_event_index.to_le_bytes());
512 hasher.update(goal_description.as_bytes());
513 for atom in atoms {
514 hasher.update(atom.0);
515 }
516 IntentId(hasher.finalize().into())
517}
518
519fn normalize_confidence(confidence: f32) -> Result<f32, IntentError> {
520 if !confidence.is_finite() {
521 return Err(IntentError::Invalid(
522 "confidence must be a finite number".to_string(),
523 ));
524 }
525 if !(0.0..=1.0).contains(&confidence) {
526 return Err(IntentError::Invalid(format!(
527 "confidence must be between 0.0 and 1.0, got {confidence}"
528 )));
529 }
530 Ok(confidence)
531}
532
533impl From<AtomError> for IntentError {
534 fn from(error: AtomError) -> Self {
535 IntentError::Invalid(error.to_string())
536 }
537}
538
// Unit tests for clustering, sealing and `IntentId` serialization.
#[cfg(test)]
mod tests {
    use super::*;
    use ed25519_dalek::SigningKey;
    use time::format_description::well_known::Rfc3339;
    use time::OffsetDateTime;

    /// Builds a signing key whose 32 seed bytes all equal `seed`, so test
    /// atoms are reproducible.
    fn deterministic_signing_key(seed: u8) -> SigningKey {
        let mut bytes = [0u8; 32];
        for slot in bytes.iter_mut() {
            *slot = seed;
        }
        SigningKey::from_bytes(&bytes)
    }

    /// Signs a one-op atom whose content, trace id and timestamp offset are
    /// derived from `suffix`, with the given run, transcript and tool call.
    fn atom(suffix: &str, run_id: &str, transcript_ref: &str, tool_call_id: Option<&str>) -> Atom {
        let principal = deterministic_signing_key(1);
        let persona = deterministic_signing_key(2);
        let mut provenance = crate::flow::Provenance {
            principal: "user:alice".to_string(),
            persona: "ship-captain".to_string(),
            agent_run_id: run_id.to_string(),
            tool_call_id: tool_call_id.map(ToString::to_string),
            trace_id: format!("trace-{suffix}"),
            transcript_ref: transcript_ref.to_string(),
            timestamp: OffsetDateTime::parse("2026-04-24T12:34:56Z", &Rfc3339).unwrap(),
        };
        // Shift the timestamp by the suffix length in seconds.
        provenance.timestamp += time::Duration::seconds(suffix.len() as i64);
        Atom::sign(
            vec![crate::flow::TextOp::Insert {
                offset: suffix.len() as u64,
                content: suffix.to_string(),
            }],
            Vec::new(),
            provenance,
            None,
            &principal,
            &persona,
        )
        .unwrap()
    }

    /// Convenience wrapper: signs an atom and wraps it in an `ObservedAtom`
    /// at `event_index`, optionally attaching a goal description.
    fn observed(
        suffix: &str,
        run_id: &str,
        transcript_ref: &str,
        event_index: u64,
        tool_call_id: Option<&str>,
        goal: Option<&str>,
    ) -> ObservedAtom {
        let atom = atom(suffix, run_id, transcript_ref, tool_call_id);
        let observed = ObservedAtom::from_atom(&atom, event_index);
        match goal {
            Some(goal) => observed.with_goal_description(goal),
            None => observed,
        }
    }

    // Events 10 and 13 fall within the default gap of 8 and merge; event 40
    // is too far from 13 and starts a second intent.
    #[test]
    fn default_clustering_groups_same_run_atoms_with_close_transcript_events() {
        let observations = vec![
            observed(
                "a",
                "run-1",
                "transcript:1",
                10,
                Some("tc-1"),
                Some("edit README"),
            ),
            observed(
                "b",
                "run-1",
                "transcript:1",
                13,
                Some("tc-2"),
                Some("edit README"),
            ),
            observed(
                "c",
                "run-1",
                "transcript:1",
                40,
                Some("tc-3"),
                Some("add tests"),
            ),
        ];

        let intents = IntentClusterer::default().cluster(observations);

        assert_eq!(intents.len(), 2);
        assert_eq!(intents[0].atoms.len(), 2);
        assert_eq!(intents[0].origin_transcript_span.start_event_index, 10);
        assert_eq!(intents[0].origin_transcript_span.end_event_index, 13);
        assert_eq!(intents[0].goal_description, "edit README");
        assert_eq!(intents[1].atoms.len(), 1);
        assert_eq!(intents[1].goal_description, "add tests");
    }

    // Adjacent event indices do not merge when the agent run or the
    // transcript differs.
    #[test]
    fn clustering_respects_agent_run_and_transcript_boundaries() {
        let observations = vec![
            observed("a", "run-1", "transcript:1", 10, None, None),
            observed("b", "run-2", "transcript:1", 11, None, None),
            observed("c", "run-1", "transcript:2", 12, None, None),
        ];

        let intents = IntentClusterer::default().cluster(observations);

        assert_eq!(intents.len(), 3);
        assert!(intents
            .iter()
            .all(|intent| intent.atoms.len() == 1 && intent.confidence == DEFAULT_CONFIDENCE));
    }

    // Input order does not matter: observations are sorted by event index
    // before clustering, so `b` (event 1) precedes `a` (event 2).
    #[test]
    fn clustering_is_stable_for_unsorted_transcript_tool_logs() {
        let a = observed("a", "run-1", "transcript:1", 2, Some("tc-1"), None);
        let b = observed("b", "run-1", "transcript:1", 1, Some("tc-1"), None);

        let intents = IntentClusterer::default().cluster(vec![a.clone(), b.clone()]);

        assert_eq!(intents.len(), 1);
        assert_eq!(intents[0].atoms, vec![b.atom_id, a.atom_id]);
        assert_eq!(
            intents[0].goal_description,
            "tool call tc-1 in transcript:1 events 1..2"
        );
    }

    // With a budget of one, only the 0 -> 20 gap reaches the classifier; the
    // 20 -> 40 gap splits because the budget is spent. The merged intent
    // keeps DEFAULT_CONFIDENCE because the builder keeps the minimum.
    #[test]
    fn semantic_classifier_can_merge_budgeted_boundary_disputes() {
        #[derive(Default)]
        struct MergeOnce {
            calls: usize,
        }
        impl IntentBoundaryClassifier for MergeOnce {
            fn classify(&mut self, dispute: &IntentBoundaryDispute<'_>) -> IntentBoundaryDecision {
                self.calls += 1;
                assert_eq!(dispute.previous.agent_run_id, "run-1");
                assert_eq!(dispute.next.agent_run_id, "run-1");
                assert_eq!(dispute.gap, 20);
                IntentBoundaryDecision::Merge
            }
        }

        let clusterer = IntentClusterer::new(IntentClusterOptions {
            max_event_gap: 5,
            semantic_boundary_budget: 1,
        });
        let observations = vec![
            observed("a", "run-1", "transcript:1", 0, None, Some("rename API")),
            observed("b", "run-1", "transcript:1", 20, None, Some("rename API")),
            observed("c", "run-1", "transcript:1", 40, None, Some("rename API")),
        ];
        let mut classifier = MergeOnce::default();

        let intents = clusterer.cluster_with_classifier(observations, &mut classifier);

        assert_eq!(classifier.calls, 1);
        assert_eq!(intents.len(), 2);
        assert_eq!(intents[0].atoms.len(), 2);
        assert_eq!(intents[0].confidence, DEFAULT_CONFIDENCE);
        assert_eq!(intents[1].atoms.len(), 1);
    }

    // Run and transcript changes split before the classifier is consulted,
    // even with budget available; the classifier panics if it is called.
    #[test]
    fn semantic_classifier_never_crosses_hard_boundaries() {
        #[derive(Default)]
        struct NeverCalled;
        impl IntentBoundaryClassifier for NeverCalled {
            fn classify(&mut self, _: &IntentBoundaryDispute<'_>) -> IntentBoundaryDecision {
                panic!("hard agent/transcript boundaries must not invoke semantic classifier");
            }
        }

        let clusterer = IntentClusterer::new(IntentClusterOptions {
            max_event_gap: 0,
            semantic_boundary_budget: 10,
        });
        let observations = vec![
            observed("a", "run-1", "transcript:1", 0, None, None),
            observed("b", "run-2", "transcript:1", 1, None, None),
            observed("c", "run-1", "transcript:2", 2, None, None),
        ];
        let mut classifier = NeverCalled;

        let intents = clusterer.cluster_with_classifier(observations, &mut classifier);

        assert_eq!(intents.len(), 3);
    }

    // A sealed intent is an independent copy: mutating the source afterwards
    // leaves the sealed atom list untouched.
    #[test]
    fn sealing_captures_current_atom_set() {
        let observations = vec![
            observed("a", "run-1", "transcript:1", 0, None, Some("ship feature")),
            observed("b", "run-1", "transcript:1", 1, None, Some("ship feature")),
        ];
        let mut intent = IntentClusterer::default()
            .cluster(observations)
            .pop()
            .expect("one intent");

        let sealed = intent.seal();
        intent.atoms.pop();

        assert_eq!(sealed.atoms.len(), 2);
        assert_eq!(intent.atoms.len(), 1);
        assert_eq!(sealed.goal_description, "ship feature");
    }

    // An intent survives a JSON round trip unchanged, and its id survives a
    // hex round trip.
    #[test]
    fn intent_id_round_trips_through_json() {
        let observations = vec![observed(
            "a",
            "run-1",
            "transcript:1",
            0,
            None,
            Some("ship feature"),
        )];
        let intent = IntentClusterer::default()
            .cluster(observations)
            .pop()
            .expect("one intent");

        let raw = serde_json::to_string(&intent).unwrap();
        let decoded: Intent = serde_json::from_str(&raw).unwrap();

        assert_eq!(decoded, intent);
        assert_eq!(IntentId::from_hex(&intent.id.to_hex()).unwrap(), intent.id);
    }
}