1use super::{OnionSlice, OnionSliceConfig, SliceLayer, extract_keywords};
2use serde_json::Value;
3
4#[derive(Debug, Clone)]
5pub struct SemanticBlock {
6 pub role_heading: String,
7 pub primary_label: &'static str,
8 pub content: String,
9 pub summary: String,
10 pub facets: Vec<SemanticFacet>,
11}
12
/// A labelled piece of information extracted from a turn.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SemanticFacet {
    /// Facet category, such as "Decision", "Next" or "Entities".
    pub label: &'static str,
    /// Text associated with the label.
    pub text: String,
}
18
/// A turn as split out of the input, before any semantic enrichment.
#[derive(Debug, Clone)]
struct RawBlock {
    /// Role name as found in the heading or metadata (not yet normalized).
    role: String,
    /// Text belonging to the turn.
    content: String,
}
24
25pub fn is_structured_conversation(metadata: &Value) -> bool {
26 metadata.as_object().is_some_and(|object| {
27 matches!(
28 object.get("type").and_then(|value| value.as_str()),
29 Some("conversation" | "transcript_turn")
30 ) || matches!(
31 object.get("format").and_then(|value| value.as_str()),
32 Some("markdown_transcript" | "sessions" | "claude_web" | "chatgpt")
33 )
34 })
35}
36
37pub fn parse_blocks(content: &str, metadata: &Value) -> Vec<SemanticBlock> {
38 let raw_blocks = if metadata
39 .get("format")
40 .and_then(|value| value.as_str())
41 .is_some_and(|format| format == "markdown_transcript")
42 || content
43 .lines()
44 .any(|line| parse_markdown_heading(line).is_some())
45 {
46 parse_markdown_transcript_blocks(content)
47 } else {
48 vec![RawBlock {
49 role: metadata
50 .get("role")
51 .and_then(|value| value.as_str())
52 .unwrap_or("message")
53 .to_string(),
54 content: content.trim().to_string(),
55 }]
56 };
57
58 raw_blocks
59 .into_iter()
60 .filter_map(|block| {
61 let content = block.content.trim();
62 if content.is_empty() {
63 return None;
64 }
65
66 let role_key = normalize_role_key(&block.role);
67 let primary_label = primary_label(&role_key);
68 let summary = summarize_text(content, 96);
69
70 let mut facets = vec![SemanticFacet {
71 label: primary_label,
72 text: summary.clone(),
73 }];
74
75 if let Some(decision) = infer_decision(content) {
76 facets.push(SemanticFacet {
77 label: "Decision",
78 text: decision,
79 });
80 }
81 if let Some(next) = infer_next_action(content) {
82 facets.push(SemanticFacet {
83 label: "Next",
84 text: next,
85 });
86 }
87 if let Some(entities) = infer_entities(metadata, content) {
88 facets.push(SemanticFacet {
89 label: "Entities",
90 text: entities,
91 });
92 }
93
94 dedupe_facets(&mut facets);
95
96 Some(SemanticBlock {
97 role_heading: role_heading(&role_key, &block.role),
98 primary_label,
99 content: content.to_string(),
100 summary,
101 facets,
102 })
103 })
104 .collect()
105}
106
107fn parse_markdown_transcript_blocks(content: &str) -> Vec<RawBlock> {
108 let mut blocks = Vec::new();
109 let mut current_role: Option<String> = None;
110 let mut current_lines = Vec::new();
111 let mut in_fence = false;
116
117 for line in content.lines() {
118 if is_fence_marker(line) {
119 in_fence = !in_fence;
120 current_lines.push(line.to_string());
121 continue;
122 }
123
124 if !in_fence && let Some(role) = parse_markdown_heading(line) {
125 if let Some(existing_role) = current_role.take() {
126 push_raw_block(&mut blocks, existing_role, ¤t_lines.join("\n"));
127 }
128 current_role = Some(role.to_string());
129 current_lines.clear();
130 continue;
131 }
132
133 current_lines.push(line.to_string());
134 }
135
136 if let Some(existing_role) = current_role {
137 push_raw_block(&mut blocks, existing_role, ¤t_lines.join("\n"));
138 }
139
140 if blocks.is_empty() {
141 push_raw_block(&mut blocks, "transcript".to_string(), content);
142 }
143
144 blocks
145}
146
/// Returns `true` when `line`, ignoring leading whitespace, begins with a
/// triple-backtick or triple-tilde code fence.
fn is_fence_marker(line: &str) -> bool {
    let candidate = line.trim_start();
    ["```", "~~~"]
        .iter()
        .any(|fence| candidate.starts_with(fence))
}
156
157fn push_raw_block(blocks: &mut Vec<RawBlock>, role: String, content: &str) {
158 let trimmed = content.trim();
159 if trimmed.is_empty() {
160 return;
161 }
162
163 blocks.push(RawBlock {
164 role,
165 content: trimmed.to_string(),
166 });
167}
168
/// Recognizes a transcript role heading and returns its canonical role.
///
/// Matching is ASCII case-insensitive and ignores surrounding whitespace.
/// Two forms are accepted:
///
/// * the exact phrases `User request:`, `Assistant response:` and
///   `Reasoning focus:`;
/// * a role alias (`user`, `human`, `assistant`, `model`, `ai`, `system`,
///   `system context`, `tool`, `tool output`, `tool result`, `reasoning`,
///   `thought`) optionally decorated with leading `#`, `[` or `*` and trailing
///   `:`, `]` or `*` — e.g. `## user`, `[Assistant]:` or `**User:**`.
///
/// Returns one of `"user"`, `"assistant"`, `"system"`, `"tool"` or
/// `"reasoning"`, or `None` when the line is not a role heading.
fn parse_markdown_heading(line: &str) -> Option<&'static str> {
    const EXACT_HEADINGS: [(&str, &str); 3] = [
        ("User request:", "user"),
        ("Assistant response:", "assistant"),
        ("Reasoning focus:", "reasoning"),
    ];
    const ROLE_ALIASES: [(&str, &str); 12] = [
        ("user", "user"),
        ("human", "user"),
        ("assistant", "assistant"),
        ("model", "assistant"),
        ("ai", "assistant"),
        ("system", "system"),
        ("system context", "system"),
        ("tool", "tool"),
        ("tool output", "tool"),
        ("tool result", "tool"),
        ("reasoning", "reasoning"),
        ("thought", "reasoning"),
    ];

    fn lookup(table: &[(&str, &'static str)], candidate: &str) -> Option<&'static str> {
        table
            .iter()
            .find(|(alias, _)| candidate.eq_ignore_ascii_case(alias))
            .map(|&(_, role)| role)
    }

    let trimmed = line.trim();
    if let Some(role) = lookup(&EXACT_HEADINGS, trimmed) {
        return Some(role);
    }

    // Strip markdown decoration on both sides. Trailing `*` is removed as well
    // so that bold/italic headings (`**User:**`, `*assistant*`) are recognized,
    // mirroring the `*` already stripped from the front.
    let stripped = trimmed
        .trim_start_matches('#')
        .trim_start_matches(['[', '*'])
        .trim_end_matches([':', ']', '*'])
        .trim();

    lookup(&ROLE_ALIASES, stripped)
}
215
/// Produces the canonical lowercase key for a role name.
///
/// The input is trimmed and ASCII-lowercased; `"human"` becomes `"user"` and
/// `"bot"` becomes `"assistant"`. Every other value is returned as lowered.
fn normalize_role_key(role: &str) -> String {
    let lowered = role.trim().to_ascii_lowercase();
    if lowered == "human" {
        "user".to_string()
    } else if lowered == "bot" {
        "assistant".to_string()
    } else {
        lowered
    }
}
223
/// Maps a normalized role key to the label of its primary facet.
///
/// Unknown roles map to `"Message"`. The comparison is exact, so callers are
/// expected to pass an already normalized (lowercase) key.
fn primary_label(role: &str) -> &'static str {
    const LABELS_BY_ROLE: [(&str, &str); 5] = [
        ("user", "Request"),
        ("assistant", "Response"),
        ("reasoning", "Reasoning"),
        ("system", "Context"),
        ("tool", "Tool"),
    ];

    LABELS_BY_ROLE
        .iter()
        .find(|(key, _)| *key == role)
        .map_or("Message", |&(_, label)| label)
}
234
235fn role_heading(role: &str, fallback: &str) -> String {
236 match role {
237 "user" => "User request".to_string(),
238 "assistant" => "Assistant response".to_string(),
239 "reasoning" => "Reasoning focus".to_string(),
240 "system" => "System context".to_string(),
241 "tool" => "Tool output".to_string(),
242 _ => title_case(fallback),
243 }
244}
245
/// Upper-cases the first character of the trimmed `input`.
///
/// The rest of the text is left untouched. Blank input yields `"Message"`.
fn title_case(input: &str) -> String {
    let mut rest = input.trim().chars();
    match rest.next() {
        None => "Message".to_string(),
        // `to_uppercase` may expand to several characters, so chain rather than push.
        Some(initial) => initial.to_uppercase().chain(rest).collect(),
    }
}
261
262fn summarize_text(text: &str, max_chars: usize) -> String {
263 let candidate = first_non_empty_line(text)
264 .or_else(|| sentence_candidates(text).into_iter().next())
265 .unwrap_or_else(|| collapse_whitespace(text));
266 truncate_at_word_boundary(&candidate, max_chars)
267}
268
269fn first_non_empty_line(text: &str) -> Option<String> {
270 text.lines()
271 .map(str::trim)
272 .find(|line| !line.is_empty())
273 .map(collapse_whitespace)
274}
275
276fn sentence_candidates(text: &str) -> Vec<String> {
277 let normalized = text.replace('\n', " ");
278 normalized
279 .split(['.', '!', '?'])
280 .map(str::trim)
281 .filter(|segment| !segment.is_empty())
282 .map(collapse_whitespace)
283 .collect()
284}
285
286const INLINE_SEMANTIC_LABELS: [&str; 9] = [
287 "decision:",
288 "decided:",
289 "resolution:",
290 "next action:",
291 "next steps:",
292 "next:",
293 "todo:",
294 "action item:",
295 "follow-up:",
296];
297
298fn find_labeled_fragment(text: &str, labels: &[&str]) -> Option<String> {
299 text.lines().map(str::trim).find_map(|line| {
300 let lower = line.to_ascii_lowercase();
301 labels.iter().find_map(|label| {
302 let start = lower.find(label)?;
303 let remainder = &line[start + label.len()..];
304 let remainder_lower = remainder.to_ascii_lowercase();
305 let cut_idx = INLINE_SEMANTIC_LABELS
306 .iter()
307 .filter_map(|other_label| remainder_lower.find(other_label))
308 .min()
309 .unwrap_or(remainder.len());
310 let fragment = remainder[..cut_idx].trim();
311
312 if fragment.is_empty() {
313 None
314 } else {
315 Some(truncate_at_word_boundary(
316 &collapse_whitespace(fragment),
317 96,
318 ))
319 }
320 })
321 })
322}
323
324fn find_candidate_by_keywords(text: &str, keywords: &[&str]) -> Option<String> {
325 text.lines()
326 .chain(text.split(['.', '!', '?']))
327 .map(str::trim)
328 .filter(|segment| !segment.is_empty())
329 .find(|segment| {
330 let lower = segment.to_ascii_lowercase();
331 keywords.iter().any(|keyword| lower.contains(keyword))
332 })
333 .map(|segment| truncate_at_word_boundary(&collapse_whitespace(segment), 96))
334}
335
336fn infer_decision(text: &str) -> Option<String> {
337 find_labeled_fragment(text, &["decision:", "decided:", "resolution:"]).or_else(|| {
338 find_candidate_by_keywords(
339 text,
340 &[
341 "decid",
342 "agreed",
343 "going with",
344 "chosen",
345 "we will use",
346 "resolved",
347 ],
348 )
349 })
350}
351
352fn infer_next_action(text: &str) -> Option<String> {
353 find_labeled_fragment(
354 text,
355 &[
356 "next action:",
357 "next steps:",
358 "next:",
359 "todo:",
360 "action item:",
361 "follow-up:",
362 ],
363 )
364 .or_else(|| {
365 find_candidate_by_keywords(
366 text,
367 &[
368 "next",
369 "todo",
370 "follow up",
371 "follow-up",
372 "need to",
373 "i'll",
374 "we'll",
375 "will add",
376 "will wire",
377 "plan to",
378 ],
379 )
380 })
381}
382
383fn infer_entities(metadata: &Value, content: &str) -> Option<String> {
384 let mut entities = Vec::new();
385
386 if let Some(object) = metadata.as_object() {
387 for key in ["project", "title", "conversation", "session", "agent"] {
388 if let Some(value) = object.get(key).and_then(|value| value.as_str()) {
389 let trimmed = value.trim();
390 if !trimmed.is_empty() && trimmed != "unknown" {
391 entities.push(trimmed.to_string());
392 }
393 }
394 }
395 }
396
397 for keyword in extract_keywords(content, 6) {
398 if keyword.len() > 3 {
399 entities.push(keyword);
400 }
401 }
402
403 entities.dedup();
404 if entities.is_empty() {
405 None
406 } else {
407 Some(truncate_at_word_boundary(&entities.join(", "), 96))
408 }
409}
410
411fn dedupe_facets(facets: &mut Vec<SemanticFacet>) {
412 let mut unique = Vec::with_capacity(facets.len());
413 for facet in facets.drain(..) {
414 let is_duplicate = unique.iter().any(|existing: &SemanticFacet| {
415 existing.label == facet.label || existing.text.eq_ignore_ascii_case(&facet.text)
416 });
417 if !is_duplicate {
418 unique.push(facet);
419 }
420 }
421 *facets = unique;
422}
423
424fn collect_facets(blocks: &[SemanticBlock], labels: &[&'static str]) -> Vec<String> {
425 let mut segments = Vec::new();
426 for label in labels {
427 for block in blocks {
428 if let Some(facet) = block.facets.iter().find(|facet| facet.label == *label) {
429 let segment = format!("{}: {}", facet.label, facet.text);
430 if !segments.iter().any(|existing| existing == &segment) {
431 segments.push(segment);
432 }
433 }
434 }
435 }
436 segments
437}
438
439fn pack_segments(segments: &[String], target_chars: usize) -> String {
440 let mut result = String::new();
441
442 for segment in segments {
443 let candidate = if result.is_empty() {
444 segment.clone()
445 } else {
446 format!("{result} | {segment}")
447 };
448
449 if candidate.chars().count() <= target_chars {
450 result = candidate;
451 continue;
452 }
453
454 if result.is_empty() {
455 return truncate_at_word_boundary(segment, target_chars);
456 }
457
458 if result.chars().count() + 4 <= target_chars {
459 result.push_str(" | …");
460 }
461 break;
462 }
463
464 if result.is_empty() {
465 truncate_at_word_boundary(&segments.join(" | "), target_chars)
466 } else {
467 result
468 }
469}
470
/// Replaces every run of whitespace in `text` with a single space and drops
/// leading and trailing whitespace.
fn collapse_whitespace(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
474
/// Shortens `text` to at most `max_chars` characters plus a `"..."` suffix.
///
/// Text that already fits is returned unchanged. Otherwise the first
/// `max_chars` characters are taken, cut back to the last space when there is
/// one, and `"..."` is appended — so the result can be up to three characters
/// longer than `max_chars`. Lengths are counted in `char`s, not bytes.
fn truncate_at_word_boundary(text: &str, max_chars: usize) -> String {
    // A character at index `max_chars` exists only when the text is too long.
    let Some((cut, _)) = text.char_indices().nth(max_chars) else {
        return text.to_string();
    };

    let head = &text[..cut];
    let kept = match head.rfind(' ') {
        Some(space) => &head[..space],
        None => head,
    };

    format!("{kept}...")
}
493
494pub fn create_structured_outer(blocks: &[SemanticBlock], target_chars: usize) -> String {
495 let mut segments: Vec<String> = blocks
496 .iter()
497 .map(|block| format!("{}: {}", block.primary_label, block.summary))
498 .collect();
499 segments.extend(collect_facets(blocks, &["Decision", "Next", "Entities"]));
500 pack_segments(&segments, target_chars)
501}
502
503pub fn create_structured_middle(blocks: &[SemanticBlock], target_chars: usize) -> String {
504 let mut sections = Vec::new();
505
506 for block in blocks {
507 sections.push(format!(
508 "{}: {}",
509 block.primary_label,
510 truncate_at_word_boundary(
511 &block.summary,
512 match block.primary_label {
513 "Request" => 36,
514 "Response" => 44,
515 "Reasoning" => 40,
516 _ => 40,
517 }
518 )
519 ));
520 }
521
522 for facet in collect_facets(blocks, &["Decision"]) {
523 sections.push(truncate_at_word_boundary(&facet, 44));
524 }
525 for facet in collect_facets(blocks, &["Next"]) {
526 sections.push(truncate_at_word_boundary(&facet, 44));
527 }
528 for facet in collect_facets(blocks, &["Entities"]) {
529 sections.push(truncate_at_word_boundary(&facet, 36));
530 }
531
532 truncate_at_word_boundary(§ions.join("\n"), target_chars)
533}
534
535pub fn create_structured_inner(blocks: &[SemanticBlock], target_chars: usize) -> String {
536 let chars_per_block = (target_chars / blocks.len().max(1)).max(120);
537 let mut sections = Vec::new();
538
539 for block in blocks {
540 let excerpt =
541 truncate_at_word_boundary(&collapse_whitespace(&block.content), chars_per_block);
542 sections.push(format!("{}:\n{}", block.role_heading, excerpt));
543 }
544
545 let details = collect_facets(blocks, &["Decision", "Next", "Entities"]);
546 if !details.is_empty() {
547 sections.push(details.join("\n"));
548 }
549
550 truncate_at_word_boundary(§ions.join("\n\n"), target_chars)
551}
552
553pub fn create_structured_onion_slices(
554 content: &str,
555 metadata: &Value,
556 config: &OnionSliceConfig,
557) -> Vec<OnionSlice> {
558 let content = content.trim();
559 let blocks = parse_blocks(content, metadata);
560
561 if content.len() < config.min_content_for_slicing {
562 return create_structured_outer_core_slices(content, &blocks, config);
563 }
564
565 let core_id = OnionSlice::generate_id(content, SliceLayer::Core);
566 let core_keywords = extract_keywords(content, 10);
567
568 let inner_content = create_structured_inner(&blocks, config.inner_target);
569 let inner_id = OnionSlice::generate_id(&inner_content, SliceLayer::Inner);
570 let inner_keywords = extract_keywords(&inner_content, 7);
571
572 let middle_content = create_structured_middle(&blocks, config.middle_target);
573 let middle_id = OnionSlice::generate_id(&middle_content, SliceLayer::Middle);
574 let middle_keywords = extract_keywords(&middle_content, 5);
575
576 let outer_content = create_structured_outer(&blocks, config.outer_target);
577 let outer_id = OnionSlice::generate_id(&outer_content, SliceLayer::Outer);
578 let outer_keywords = extract_keywords(&outer_content, 3);
579
580 vec![
581 OnionSlice {
582 id: outer_id.clone(),
583 layer: SliceLayer::Outer,
584 content: outer_content,
585 parent_id: Some(middle_id.clone()),
586 children_ids: vec![],
587 keywords: outer_keywords,
588 },
589 OnionSlice {
590 id: middle_id.clone(),
591 layer: SliceLayer::Middle,
592 content: middle_content,
593 parent_id: Some(inner_id.clone()),
594 children_ids: vec![outer_id],
595 keywords: middle_keywords,
596 },
597 OnionSlice {
598 id: inner_id.clone(),
599 layer: SliceLayer::Inner,
600 content: inner_content,
601 parent_id: Some(core_id.clone()),
602 children_ids: vec![middle_id],
603 keywords: inner_keywords,
604 },
605 OnionSlice {
606 id: core_id.clone(),
607 layer: SliceLayer::Core,
608 content: content.to_string(),
609 parent_id: None,
610 children_ids: vec![inner_id],
611 keywords: core_keywords,
612 },
613 ]
614}
615
616pub fn create_structured_onion_slices_fast(
617 content: &str,
618 metadata: &Value,
619 config: &OnionSliceConfig,
620) -> Vec<OnionSlice> {
621 let content = content.trim();
622 let blocks = parse_blocks(content, metadata);
623
624 if content.len() < config.min_content_for_slicing {
625 return create_structured_outer_core_slices(content, &blocks, config);
626 }
627
628 let core_id = OnionSlice::generate_id(content, SliceLayer::Core);
629 let core_keywords = extract_keywords(content, 10);
630
631 let outer_content = create_structured_outer(&blocks, config.outer_target);
632 let outer_id = OnionSlice::generate_id(&outer_content, SliceLayer::Outer);
633 let outer_keywords = extract_keywords(&outer_content, 3);
634
635 vec![
636 OnionSlice {
637 id: outer_id.clone(),
638 layer: SliceLayer::Outer,
639 content: outer_content,
640 parent_id: Some(core_id.clone()),
641 children_ids: vec![],
642 keywords: outer_keywords,
643 },
644 OnionSlice {
645 id: core_id,
646 layer: SliceLayer::Core,
647 content: content.to_string(),
648 parent_id: None,
649 children_ids: vec![outer_id],
650 keywords: core_keywords,
651 },
652 ]
653}
654
655fn create_structured_outer_core_slices(
656 content: &str,
657 blocks: &[SemanticBlock],
658 config: &OnionSliceConfig,
659) -> Vec<OnionSlice> {
660 let core_id = OnionSlice::generate_id(content, SliceLayer::Core);
661 let core_keywords = extract_keywords(content, 10);
662
663 let outer_content = create_structured_outer(blocks, config.outer_target);
664 let outer_id = OnionSlice::generate_id(&outer_content, SliceLayer::Outer);
665 let outer_keywords = extract_keywords(&outer_content, 3);
666
667 vec![
668 OnionSlice {
669 id: outer_id.clone(),
670 layer: SliceLayer::Outer,
671 content: outer_content,
672 parent_id: Some(core_id.clone()),
673 children_ids: vec![],
674 keywords: outer_keywords,
675 },
676 OnionSlice {
677 id: core_id,
678 layer: SliceLayer::Core,
679 content: content.to_string(),
680 parent_id: None,
681 children_ids: vec![outer_id],
682 keywords: core_keywords,
683 },
684 ]
685}
686
#[cfg(test)]
mod tests {
    use super::{create_structured_outer, is_fence_marker, parse_blocks};
    use serde_json::json;

    /// The outer layer must read as a semantic card ("Response: … | Decision: …")
    /// rather than a bracketed keyword prefix.
    #[test]
    fn structured_outer_prefers_semantic_card_over_keyword_prefix() {
        let content = "Decision: use semantic cards for outer retrieval. Next action: add JSON regression tests and keep plain-text fallback.";
        let metadata = json!({
            "type": "conversation",
            "format": "claude_web",
            "role": "assistant",
            "title": "Pipeline progress",
            "project": "Loctree/rust-memex"
        });

        let card = create_structured_outer(&parse_blocks(content, &metadata), 260);

        assert!(card.contains("Response:"));
        assert!(card.contains("Decision:"));
        assert!(card.contains("Next:"));
        assert!(!card.starts_with('['));
    }

    /// Both fence styles are detected, with or without an info string or indent.
    #[test]
    fn fence_marker_detects_backtick_and_tilde_openers() {
        for fence in ["```", "```rust", "   ```bash", "~~~", "~~~markdown"] {
            assert!(is_fence_marker(fence));
        }
        for not_a_fence in ["`single`", "``two``", "## user", ""] {
            assert!(!is_fence_marker(not_a_fence));
        }
    }

    /// Headings quoted inside a backtick fence belong to the surrounding turn.
    #[test]
    fn parse_blocks_keeps_fenced_pseudo_headings_inside_user_turn() {
        let content = "## user\n\
                       Look at this snippet from yesterday's chat:\n\
                       ```\n\
                       ## assistant\n\
                       fenced pseudo-response\n\
                       ## user\n\
                       fenced pseudo-followup\n\
                       ```\n\
                       Why does it look weird?\n\
                       \n\
                       ## assistant\n\
                       Because the model echoed an example transcript verbatim.\n";
        let metadata = json!({
            "type": "transcript_turn",
            "format": "markdown_transcript",
        });

        let blocks = parse_blocks(content, &metadata);

        assert_eq!(
            blocks.len(),
            2,
            "expected exactly two blocks (user, assistant); fenced pseudo-headings must not split the user turn"
        );

        let user_turn = &blocks[0];
        assert_eq!(user_turn.primary_label, "Request");
        for fenced_line in ["fenced pseudo-response", "fenced pseudo-followup"] {
            assert!(
                user_turn.content.contains(fenced_line),
                "user block lost its fenced example content: {:?}",
                user_turn.content
            );
        }
        assert!(
            user_turn.content.contains("Why does it look weird?"),
            "user block dropped the trailing prose: {:?}",
            user_turn.content
        );

        let assistant_turn = &blocks[1];
        assert_eq!(assistant_turn.primary_label, "Response");
        assert!(
            assistant_turn.content.contains("echoed an example transcript"),
            "assistant block missing real response: {:?}",
            assistant_turn.content
        );
    }

    /// Headings quoted inside a tilde fence belong to the surrounding turn.
    #[test]
    fn parse_blocks_keeps_fenced_pseudo_headings_inside_tilde_fence() {
        let content = "## user\n\
                       Tilde-fenced sample:\n\
                       ~~~\n\
                       ## assistant\n\
                       still inside the fence\n\
                       ~~~\n\
                       \n\
                       ## assistant\n\
                       The real reply.\n";
        let metadata = json!({
            "type": "transcript_turn",
            "format": "markdown_transcript",
        });

        let blocks = parse_blocks(content, &metadata);

        assert_eq!(blocks.len(), 2);

        let (user_turn, assistant_turn) = (&blocks[0], &blocks[1]);
        assert_eq!(user_turn.primary_label, "Request");
        assert!(user_turn.content.contains("still inside the fence"));
        assert_eq!(assistant_turn.primary_label, "Response");
        assert!(assistant_turn.content.contains("The real reply."));
    }
}
805}