trusty_memory/kg_extract.rs
1//! Deterministic KG triple extraction from drawer content.
2//!
3//! Why: Issue #97 — `memory_remember` should populate the knowledge graph
4//! automatically so palaces with drawers always have a non-empty KG. Calling an
5//! LLM on every write would blow up latency and require network access; a
6//! deterministic heuristic stays fast and offline while still producing useful
7//! triples for tag membership, key-phrase mentions, and obvious is-a / has-a /
8//! works-at patterns. The visual graph view (the other half of #97) renders
9//! whatever shows up here, so this pass is the data source for "every palace
10//! has a graph".
11//! What: A pure function `extract_triples` that takes drawer content + tags +
12//! drawer id and returns a `Vec<Triple>` with `provenance = "auto:remember"`.
13//! The current heuristics are tag→drawer, room→drawer, hashtag→drawer, and a
14//! short pattern table (`X is a Y`, `X works at Y`, `X uses Y`, `X depends on
15//! Y`). Drawer ids are encoded as `drawer:<uuid>` so the subject keeps a
16//! stable, palace-unique identity that the graph view can dereference back
17//! to the source drawer.
18//! Test: `extract_triples_emits_tag_triples`,
19//! `extract_triples_emits_hashtag_mentions`,
20//! `extract_triples_extracts_is_a_pattern`,
21//! `extract_triples_never_panics_on_empty_input`.
22
23use chrono::Utc;
24use std::collections::HashSet;
25use trusty_common::memory_core::store::kg::Triple;
26use uuid::Uuid;
27
/// Tags that, by default, exclude a drawer from auto-extraction.
///
/// Why: A drawer carrying one of these labels holds test fixtures, QA
/// scaffolding, or other synthetic content rather than project knowledge, so
/// deriving triples from it would only add noise to the KG.
/// What: A static slice of lower-case tag strings. The extractor trims and
/// lower-cases each drawer tag before looking it up here.
/// Test: `extract_triples_skips_denied_tags`.
pub const DEFAULT_DENY_TAGS: &[&str] = &[
    "cross-project-qa",
    "test",
    "fixture",
];
37
/// Per-run settings for an extraction pass.
///
/// Why: Keeping the knobs in one struct lets callers swap the deny-list (the
/// production default, or an empty list in integration tests) without the
/// extractor's signature having to change.
/// What: Holds a borrowed `deny_tags` slice; a drawer whose tags intersect
/// that slice is skipped entirely.
/// Test: `extract_triples_skips_denied_tags`,
/// `extract_triples_empty_deny_list_passes_through`.
#[derive(Debug, Clone)]
pub struct KgExtractConfig<'a> {
    /// Tags that suppress extraction for the whole drawer. Each drawer tag is
    /// trimmed and lower-cased before being looked up here, so entries are
    /// best written in lower case.
    pub deny_tags: &'a [&'a str],
}
52
53impl Default for KgExtractConfig<'_> {
54 fn default() -> Self {
55 Self {
56 deny_tags: DEFAULT_DENY_TAGS,
57 }
58 }
59}
60
/// Provenance label attached to every triple this module emits.
///
/// Why: A single, stable string lets operators filter or retract the
/// auto-extracted subset without inspecting drawer content, and keeping it in
/// one constant stops emitters from drifting apart.
/// What: The literal `auto:remember`.
/// Test: `extract_triples_stamps_provenance`.
pub const AUTO_PROVENANCE: &str = "auto:remember";
69
/// Confidence score stamped on every auto-extracted triple.
///
/// Why: Heuristic output is not authoritative, so it carries a reduced score
/// that downstream rankers can use to prefer explicit `kg_assert` triples.
/// What: `0.6` — strictly between 0 and 1.
/// Test: `extract_triples_uses_reduced_confidence`.
pub const AUTO_CONFIDENCE: f32 = 0.6_f32;
79
/// Namespace prefix for drawer-identity nodes.
///
/// Why: A stable, prefixed identifier lets a consumer map a graph node back
/// to the drawer it came from.
/// What: `drawer:` — `drawer_subject` prepends it to the drawer UUID.
/// Test: the extractor tests compare against `drawer_subject(id)`, which is
/// built from this prefix.
pub const DRAWER_SUBJECT_PREFIX: &str = "drawer:";
87
/// Namespace prefix for tag nodes.
///
/// Why: The KG keeps at most one active triple per `(subject, predicate)`, so
/// `drawer:X has-tag t1` followed by `drawer:X has-tag t2` would close the
/// first edge. Making each tag its own subject (`tag:t1`, `tag:t2`) keeps
/// every tag as a separate edge pointing at the drawer.
/// What: `tag:` — followed by the lower-cased tag string.
/// Test: `extract_triples_emits_tag_triples`.
pub const TAG_SUBJECT_PREFIX: &str = "tag:";
98
/// Namespace prefix for free-text mention nodes.
///
/// Why: Same one-active-triple-per-`(subject, predicate)` reasoning as
/// `TAG_SUBJECT_PREFIX`: each hashtag term becomes its own subject so several
/// mentions in one drawer can coexist.
/// What: `topic:` — followed by the lower-cased term.
/// Test: `extract_triples_emits_hashtag_mentions`.
pub const TOPIC_SUBJECT_PREFIX: &str = "topic:";
107
/// Namespace prefix for room nodes.
///
/// Why: Although a drawer sits in a single room, modelling the room as its
/// own subject keeps all discovered-metadata entities under prefixed
/// namespaces and lets drawers from the same room share one room node.
/// What: `room:` — followed by the trimmed room label (case is preserved).
/// Test: `extract_triples_emits_tag_triples`.
pub const ROOM_SUBJECT_PREFIX: &str = "room:";
117
118/// Build the drawer subject string used as the (s) for every per-drawer
119/// triple emitted by this module.
120///
121/// Why: Centralises the `drawer:<uuid>` encoding so call sites cannot drift.
122/// What: Returns `format!("{DRAWER_SUBJECT_PREFIX}{id}")`.
123/// Test: covered by every extractor test.
124pub fn drawer_subject(id: Uuid) -> String {
125 format!("{DRAWER_SUBJECT_PREFIX}{id}")
126}
127
128/// Inputs to a single extraction pass.
129///
130/// Why: Bundling the inputs keeps `extract_triples` signature small and lets
131/// us add new fields (e.g. drawer_type) without breaking call sites.
132/// What: Plain data struct; all fields are borrowed so the caller keeps
133/// ownership.
134/// Test: indirectly via every test that constructs one.
135#[derive(Debug, Clone)]
136pub struct ExtractInput<'a> {
137 pub drawer_id: Uuid,
138 pub content: &'a str,
139 pub tags: &'a [String],
140 pub room: Option<&'a str>,
141}
142
143/// Run the deterministic heuristic extractor with default config.
144///
145/// Why: Convenience wrapper that uses [`KgExtractConfig::default`] (the
146/// production deny-list) so call sites that do not need a custom config
147/// remain unchanged.
148/// What: Delegates to [`extract_triples_with_config`] with a default config.
149/// Test: All existing tests call this helper and implicitly exercise the default
150/// deny-list path.
151pub fn extract_triples(input: &ExtractInput<'_>) -> Vec<Triple> {
152 extract_triples_with_config(input, &KgExtractConfig::default())
153}
154
155/// Run the deterministic heuristic extractor.
156///
157/// Why: Single entry point so `memory_remember`, `memory_note`, and the
158/// back-fill CLI all share the same logic. Pure function — no I/O, no async —
159/// so it can be unit-tested cheaply. Accepts a [`KgExtractConfig`] so callers
160/// can override the deny-list without touching the function signature.
161/// What: First checks whether any of the drawer's tags appear in
162/// `config.deny_tags` (case-insensitive); when a match is found the function
163/// returns immediately with an empty vec and logs a debug message. Otherwise
164/// walks `tags`, content tokens, and a small pattern list to emit `Triple`s;
165/// deduplicates so the same `(subject, predicate, object)` never appears twice
166/// in a single pass.
167/// Test: `extract_triples_skips_denied_tags`, `extract_triples_emits_tag_triples`,
168/// plus all other tests in this file.
169pub fn extract_triples_with_config(
170 input: &ExtractInput<'_>,
171 config: &KgExtractConfig<'_>,
172) -> Vec<Triple> {
173 // Deny-list check: if any tag on this drawer is in the deny set, skip
174 // extraction entirely. The check is case-insensitive to tolerate mixed-
175 // case tags from different clients.
176 let denied = input.tags.iter().any(|t| {
177 let lower = t.trim().to_lowercase();
178 config.deny_tags.contains(&lower.as_str())
179 });
180 if denied {
181 tracing::debug!(
182 drawer_id = %input.drawer_id,
183 tags = ?input.tags,
184 "kg_extract: skipping drawer — tag matches deny-list"
185 );
186 return Vec::new();
187 }
188 let now = Utc::now();
189 let subject = drawer_subject(input.drawer_id);
190 let mut out: Vec<Triple> = Vec::new();
191 let mut seen: HashSet<(String, String, String)> = HashSet::new();
192
193 let push = |out: &mut Vec<Triple>,
194 seen: &mut HashSet<(String, String, String)>,
195 s: String,
196 p: String,
197 o: String| {
198 let key = (s.clone(), p.clone(), o.clone());
199 if seen.insert(key) {
200 out.push(Triple {
201 subject: s,
202 predicate: p,
203 object: o,
204 valid_from: now,
205 valid_to: None,
206 confidence: AUTO_CONFIDENCE,
207 provenance: Some(AUTO_PROVENANCE.to_string()),
208 });
209 }
210 };
211
212 // Tag membership — each tag becomes its own subject so multiple tags on
213 // the same drawer don't collide under the "one active triple per
214 // (s, p)" invariant. Edge direction is `tag:<t> tags drawer:<id>` so the
215 // graph clusters drawers under their shared tag nodes.
216 for tag in input.tags {
217 let clean = tag.trim();
218 if clean.is_empty() {
219 continue;
220 }
221 push(
222 &mut out,
223 &mut seen,
224 format!("{TAG_SUBJECT_PREFIX}{}", clean.to_lowercase()),
225 "tags".to_string(),
226 subject.clone(),
227 );
228 }
229
230 // Room membership — `room:<r> contains drawer:<id>` for the same reason
231 // (multiple drawers per room must coexist).
232 if let Some(room) = input.room {
233 let clean = room.trim();
234 if !clean.is_empty() {
235 push(
236 &mut out,
237 &mut seen,
238 format!("{ROOM_SUBJECT_PREFIX}{clean}"),
239 "contains".to_string(),
240 subject.clone(),
241 );
242 }
243 }
244
245 // Hashtag-style mentions — `topic:<term> mentioned-in drawer:<id>` so
246 // multiple terms per drawer can coexist as distinct active edges.
247 for term in extract_hashtags(input.content) {
248 push(
249 &mut out,
250 &mut seen,
251 format!("{TOPIC_SUBJECT_PREFIX}{term}"),
252 "mentioned-in".to_string(),
253 subject.clone(),
254 );
255 }
256
257 // Simple natural-language patterns. Each yields a free-form
258 // `<subject> <predicate> <object>` triple anchored to entities found in
259 // the content (not the drawer subject), so the graph develops topical
260 // edges over time.
261 for (s, p, o) in extract_patterns(input.content) {
262 push(&mut out, &mut seen, s, p, o);
263 }
264
265 out
266}
267
/// Collect `#hashtag`-style terms from free-form content.
///
/// Why: A hashtag is a cheap, deliberate signal — when a user writes `#rust`
/// or `#design-doc`, the mention should end up in the graph.
/// What: For every `#` in the text, takes the run of ASCII letters, digits,
/// `_` and `-` that immediately follows it, lower-cases it, and keeps the
/// first occurrence of each distinct term in order of appearance. A `#` with
/// no such run after it (a lone `#`, or `# heading`) contributes nothing.
/// Test: `extract_triples_emits_hashtag_mentions`.
fn extract_hashtags(content: &str) -> Vec<String> {
    let mut already = HashSet::new();
    content
        // Every piece after the first starts right behind a `#`.
        .split('#')
        .skip(1)
        .map(|rest| {
            rest.chars()
                .take_while(|ch| ch.is_ascii_alphanumeric() || matches!(*ch, '_' | '-'))
                .map(|ch| ch.to_ascii_lowercase())
                .collect::<String>()
        })
        // `insert` returns false for a repeat, which drops the duplicate.
        .filter(|term| !term.is_empty() && already.insert(term.clone()))
        .collect()
}
301
/// Phrase table consulted by `extract_patterns`.
///
/// Why: A short, fixed list of predicates and their marker phrases keeps the
/// extractor explainable and deterministic.
/// What: A static slice of `(predicate, markers)` rows. Every marker is
/// lower-case and carries a literal space on each side, so it only matches
/// between two space-separated words of the lower-cased content; the tokens
/// directly left and right of the marker become subject and object.
/// Test: `extract_triples_extracts_is_a_pattern`.
const PATTERN_TABLE: &[(&str, &[&str])] = &[
    (
        "is-a",
        &[" is a ", " is an "],
    ),
    (
        "works-at",
        &[" works at "],
    ),
    (
        "uses",
        &[" uses ", " using "],
    ),
    (
        "depends-on",
        &[" depends on ", " requires "],
    ),
];
319
320/// Apply the pattern table to a single content blob.
321///
322/// Why: Keeps the matching loop out of `extract_triples` so the dispatcher
323/// stays readable.
324/// What: For every `(predicate, markers)` row, scan every marker against the
325/// lower-cased content; on the first hit emit `(left_token, predicate,
326/// right_token)` and move on to the next predicate. Only the first hit per
327/// predicate is taken to avoid combinatorial output on long texts.
328/// Test: `extract_triples_extracts_is_a_pattern`.
329fn extract_patterns(content: &str) -> Vec<(String, String, String)> {
330 let lower = content.to_lowercase();
331 let mut out: Vec<(String, String, String)> = Vec::new();
332 for (predicate, markers) in PATTERN_TABLE {
333 for marker in *markers {
334 if let Some(idx) = lower.find(marker) {
335 let left = lower[..idx].trim();
336 let right_start = idx + marker.len();
337 let right = lower[right_start..].trim();
338 let subject_tok = last_token(left);
339 let object_tok = first_token(right);
340 if !subject_tok.is_empty() && !object_tok.is_empty() {
341 out.push((subject_tok, (*predicate).to_string(), object_tok));
342 }
343 break;
344 }
345 }
346 }
347 out
348}
349
/// Pull the final whitespace-delimited token from a fragment.
///
/// Why: The left side of a pattern hit can contain arbitrary preamble; the
/// entity we care about is the word immediately before the marker. That word
/// may be wrapped in quotes (`"rustc" is a compiler`), so stripping only the
/// trailing side would leave an unbalanced `"rustc` as the entity name.
/// What: Takes the last whitespace-delimited token and strips the punctuation
/// characters `, . ; : ! ? " '` from both its start and its end. Returns an
/// empty string when the fragment has no token or the token is punctuation
/// only.
/// Test: indirectly via `extract_triples_extracts_is_a_pattern`.
fn last_token(s: &str) -> String {
    const PUNCT: &[char] = &[',', '.', ';', ':', '!', '?', '"', '\''];
    s.split_whitespace()
        .last()
        .map(|t| t.trim_matches(PUNCT))
        .unwrap_or("")
        .to_string()
}
363
/// Pull the first whitespace-delimited token from a fragment.
///
/// Why: Mirror of `last_token` for the right side of a pattern hit. The
/// object word is often followed by sentence punctuation (`is a compiler.`)
/// and sometimes wrapped in quotes, so both ends have to be cleaned.
/// What: Takes the first whitespace-delimited token and strips the
/// punctuation characters `, . ; : ! ? " '` from both its start and its end.
/// Returns an empty string when the fragment has no token or the token is
/// punctuation only.
/// Test: indirectly via `extract_triples_extracts_is_a_pattern`.
fn first_token(s: &str) -> String {
    const PUNCT: &[char] = &[',', '.', ';', ':', '!', '?', '"', '\''];
    s.split_whitespace()
        .next()
        .map(|t| t.trim_matches(PUNCT))
        .unwrap_or("")
        .to_string()
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Why: `ExtractInput::tags` borrows a `&[String]`, while tests are most
    /// readable with `&str` literals.
    /// What: Copies each literal into an owned `String`.
    /// Test: used by the tag-based tests below.
    fn owned_tags(tags: &[&str]) -> Vec<String> {
        tags.iter().map(|s| (*s).to_string()).collect()
    }

    /// Why: Tag-derived triples are the lowest-hanging extraction and the
    /// graph view's first signal when no patterns fire. The KG's temporal
    /// model only allows one active triple per `(subject, predicate)`, so
    /// each tag becomes its own subject (`tag:<name>`) with a `tags`
    /// predicate pointing at the drawer.
    /// What: One `tag:<t> tags drawer:<id>` per non-empty tag, plus
    /// `room:<r> contains drawer:<id>` when a room is supplied.
    /// Test: This test.
    #[test]
    fn extract_triples_emits_tag_triples() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["rust", "design"]);
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "hello world",
            tags: &tags,
            room: Some("Backend"),
        });
        let object = drawer_subject(id);
        assert!(triples
            .iter()
            .any(|t| t.subject == "tag:rust" && t.predicate == "tags" && t.object == object));
        assert!(triples
            .iter()
            .any(|t| t.subject == "tag:design" && t.predicate == "tags" && t.object == object));
        assert!(triples.iter().any(|t| t.subject == "room:Backend"
            && t.predicate == "contains"
            && t.object == object));
    }

    /// Why: Hashtag tokens are a cheap user signal; the extractor must catch
    /// them so the graph picks up topical entities.
    /// What: `#rust` and `#design-doc` both become `topic:<term>
    /// mentioned-in drawer:<id>` triples, lower-cased and deduplicated.
    /// Test: This test.
    #[test]
    fn extract_triples_emits_hashtag_mentions() {
        let id = Uuid::new_v4();
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "see #Rust and #design-doc and #rust again",
            tags: &[],
            room: None,
        });
        let mention_subjects: Vec<&str> = triples
            .iter()
            .filter(|t| t.predicate == "mentioned-in")
            .map(|t| t.subject.as_str())
            .collect();
        assert!(mention_subjects.contains(&"topic:rust"));
        assert!(mention_subjects.contains(&"topic:design-doc"));
        // Dedupe — `#rust` and `#Rust` collapse.
        assert_eq!(
            mention_subjects
                .iter()
                .filter(|s| **s == "topic:rust")
                .count(),
            1
        );
    }

    /// Why: `is a` is the simplest NL pattern and the most common idiom in
    /// quick notes ("rustc is a compiler").
    /// What: Pattern fires once per content blob; subject and object are the
    /// nouns either side of the marker.
    /// Test: This test.
    #[test]
    fn extract_triples_extracts_is_a_pattern() {
        let id = Uuid::new_v4();
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "rustc is a compiler for rust",
            tags: &[],
            room: None,
        });
        assert!(triples
            .iter()
            .any(|t| t.subject == "rustc" && t.predicate == "is-a" && t.object == "compiler"));
    }

    /// Why: Confidence and provenance are guard-rails — extracted triples
    /// must be recognisable and over-ridable.
    /// What: Every triple carries `provenance = Some("auto:remember")` and
    /// `confidence == AUTO_CONFIDENCE`.
    /// Test: This test.
    #[test]
    fn extract_triples_stamps_provenance() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["x"]);
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "anything",
            tags: &tags,
            room: None,
        });
        assert!(!triples.is_empty());
        for t in &triples {
            assert_eq!(t.provenance.as_deref(), Some(AUTO_PROVENANCE));
            assert!((t.confidence - AUTO_CONFIDENCE).abs() < f32::EPSILON);
        }
    }

    /// Why: Reduced confidence is the contract a manual `kg_assert` of the
    /// same `(subject, predicate)` needs in order to "win" against the
    /// auto-extracted edge.
    /// What: Checks the `AUTO_CONFIDENCE` constant itself (currently 0.6): it
    /// must stay strictly between 0.0 and 1.0. That every emitted triple
    /// carries the constant is covered by `extract_triples_stamps_provenance`.
    /// Test: This test.
    #[test]
    // Why: both bounds are static facts about the AUTO_CONFIDENCE constant;
    // the assertions are documentation for future tweakers, so the lint that
    // flags constant assertions is silenced on purpose.
    #[allow(clippy::assertions_on_constants)]
    fn extract_triples_uses_reduced_confidence() {
        assert!(AUTO_CONFIDENCE < 1.0);
        assert!(AUTO_CONFIDENCE > 0.0);
    }

    /// Why: Empty / whitespace-only content must not panic or emit garbage.
    /// What: No tags, no room, no content → empty vec.
    /// Test: This test.
    #[test]
    fn extract_triples_never_panics_on_empty_input() {
        let id = Uuid::new_v4();
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "",
            tags: &[],
            room: None,
        });
        assert!(triples.is_empty());
    }

    /// Why: Edge-case test — content with no patterns but tags should still
    /// produce the tag triples (the graph view's primary signal).
    /// What: Single tag, no room, prose with no pattern hits → exactly one
    /// triple shaped as `tag:meeting tags drawer:<id>`.
    /// Test: This test.
    #[test]
    fn extract_triples_tags_only_path() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["meeting"]);
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "Discussed roadmap.",
            tags: &tags,
            room: None,
        });
        assert_eq!(triples.len(), 1);
        assert_eq!(triples[0].subject, "tag:meeting");
        assert_eq!(triples[0].predicate, "tags");
        assert_eq!(triples[0].object, drawer_subject(id));
    }

    /// Why: Drawers tagged with deny-listed labels (test fixtures, QA scaffolding)
    /// must not pollute the KG with non-factual content.
    /// What: A drawer with the `test` tag must produce zero triples even when
    /// it also has a room and content with extractable patterns.
    /// Test: This test.
    #[test]
    fn extract_triples_skips_denied_tags() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["test", "rust"]);
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "rustc is a compiler",
            tags: &tags,
            room: Some("Backend"),
        });
        assert!(
            triples.is_empty(),
            "a drawer with a deny-list tag must produce zero triples, got {triples:?}"
        );
    }

    /// Why: Deny-list matching is case-insensitive so `TEST` and `Test` are
    /// blocked the same as `test`.
    /// What: A drawer tagged `FIXTURE` (upper-case) must still produce zero
    /// triples.
    /// Test: This test.
    #[test]
    fn extract_triples_deny_list_is_case_insensitive() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["FIXTURE"]);
        let triples = extract_triples(&ExtractInput {
            drawer_id: id,
            content: "some content",
            tags: &tags,
            room: None,
        });
        assert!(
            triples.is_empty(),
            "upper-cased deny tag must still be blocked"
        );
    }

    /// Why: An empty deny-list (e.g. in integration tests that want to exercise
    /// extraction regardless of tags) must not suppress any triples.
    /// What: Calling `extract_triples_with_config` with `deny_tags = &[]` on a
    /// drawer tagged `test` must produce the normal tag triple.
    /// Test: This test.
    #[test]
    fn extract_triples_empty_deny_list_passes_through() {
        let id = Uuid::new_v4();
        let tags = owned_tags(&["test"]);
        let config = KgExtractConfig { deny_tags: &[] };
        let triples = extract_triples_with_config(
            &ExtractInput {
                drawer_id: id,
                content: "anything",
                tags: &tags,
                room: None,
            },
            &config,
        );
        // "test" tag should produce a tag triple when the deny-list is empty.
        assert!(
            !triples.is_empty(),
            "empty deny-list must not suppress extraction"
        );
    }
}