Skip to main content

trusty_memory/
kg_extract.rs

1//! Deterministic KG triple extraction from drawer content.
2//!
3//! Why: Issue #97 — `memory_remember` should populate the knowledge graph
4//! automatically so palaces with drawers always have a non-empty KG. Calling an
5//! LLM on every write would blow up latency and require network access; a
6//! deterministic heuristic stays fast and offline while still producing useful
//! triples for tag membership, hashtag mentions, and obvious is-a / works-at /
//! uses / depends-on patterns. The visual graph view (the other half of #97) renders
9//! whatever shows up here, so this pass is the data source for "every palace
10//! has a graph".
11//! What: A pure function `extract_triples` that takes drawer content + tags +
12//! drawer id and returns a `Vec<Triple>` with `provenance = "auto:remember"`.
13//! The current heuristics are tag→drawer, room→drawer, hashtag→drawer, and a
14//! short pattern table (`X is a Y`, `X works at Y`, `X uses Y`, `X depends on
15//! Y`). Drawer ids are encoded as `drawer:<uuid>` so the subject keeps a
16//! stable, palace-unique identity that the graph view can dereference back
17//! to the source drawer.
18//! Test: `extract_triples_emits_tag_triples`,
19//! `extract_triples_emits_hashtag_mentions`,
20//! `extract_triples_extracts_is_a_pattern`,
21//! `extract_triples_never_panics_on_empty_input`.
22
23use chrono::Utc;
24use std::collections::HashSet;
25use trusty_common::memory_core::store::kg::Triple;
26use uuid::Uuid;
27
/// Default tags that cause a drawer to be skipped during auto-extraction.
///
/// Why: Drawers tagged with these labels are by definition non-factual project
/// knowledge (test fixtures, QA scaffolding, synthetic content) and should not
/// pollute the KG with noise triples.
/// What: A static slice of lowercase tag strings (`cross-project-qa`, `test`,
/// `fixture`). It is the `deny_tags` value produced by
/// `KgExtractConfig::default()`; a drawer carrying any of these tags (after
/// trimming, in any letter case) yields no triples at all.
/// Test: `extract_triples_skips_denied_tags`,
/// `extract_triples_deny_list_is_case_insensitive`.
pub const DEFAULT_DENY_TAGS: &[&str] = &["cross-project-qa", "test", "fixture"];
37
/// Configuration for a single extraction pass.
///
/// Why: Bundles per-run configuration so `extract_triples` can be called with
/// different deny-lists (e.g. the default prod list vs. an empty list in
/// integration tests) without changing the function signature.
/// What: Holds a borrowed `deny_tags` slice; `extract_triples_with_config`
/// returns an empty vec for any drawer that has at least one tag in this set.
/// The `Default` impl uses `DEFAULT_DENY_TAGS`.
/// Test: `extract_triples_skips_denied_tags`,
/// `extract_triples_empty_deny_list_passes_through`.
#[derive(Debug, Clone)]
pub struct KgExtractConfig<'a> {
    /// Tags that cause extraction to be skipped entirely. Drawer tags are
    /// trimmed and lower-cased before being looked up in this list.
    pub deny_tags: &'a [&'a str],
}
52
53impl Default for KgExtractConfig<'_> {
54    fn default() -> Self {
55        Self {
56            deny_tags: DEFAULT_DENY_TAGS,
57        }
58    }
59}
60
/// Provenance tag stamped on every auto-extracted triple.
///
/// Why: Operators need a stable string to filter / retract the auto-extracted
/// subset without scanning content. Centralising the constant keeps every
/// emitter and the back-fill CLI in sync.
/// What: A `&'static str` containing the literal `auto:remember`. The
/// extractor stores it as `Some(AUTO_PROVENANCE.to_string())` in the
/// `provenance` field of each triple it builds.
/// Test: `extract_triples_stamps_provenance`.
pub const AUTO_PROVENANCE: &str = "auto:remember";
69
/// Confidence applied to auto-extracted triples.
///
/// Why: Heuristic extraction is not authoritative; downstream rankers can use
/// the confidence to prefer explicit `kg_assert` triples over auto-extracted
/// noise.
/// What: `0.6`, written into the `confidence` field of every triple the
/// extractor emits. The value is intended to be high enough to surface in
/// queries yet low enough to lose to a manual `kg_assert` of the same
/// `(subject, predicate)`; that ranking happens outside this module.
/// Test: `extract_triples_uses_reduced_confidence`,
/// `extract_triples_stamps_provenance`.
pub const AUTO_CONFIDENCE: f32 = 0.6;
79
/// Subject prefix used for drawer-identity triples.
///
/// Why: A stable, palace-unique identifier lets the graph view dereference a
/// node back to the source drawer (and the back-fill CLI dedupe by drawer).
/// What: `drawer:` — concatenated with the drawer UUID by `drawer_subject`.
/// Test: indirectly, via the tests that compare triple objects against
/// `drawer_subject(id)`.
pub const DRAWER_SUBJECT_PREFIX: &str = "drawer:";
87
/// Subject prefix used for tag entities.
///
/// Why: The KG enforces at most one active triple per `(subject, predicate)`,
/// so we can't emit `drawer:X has-tag t1; drawer:X has-tag t2` — the second
/// assert would close the first. By promoting each tag to its own subject
/// (`tag:t1`, `tag:t2`) we keep multiple tags as distinct edges and the graph
/// view gets natural tag-clusters around each drawer.
/// What: `tag:` — concatenated with the trimmed, lower-cased tag string and
/// emitted as `tag:<t> tags drawer:<id>`.
/// Test: `extract_triples_emits_tag_triples`.
pub const TAG_SUBJECT_PREFIX: &str = "tag:";
98
/// Subject prefix used for free-text mention entities.
///
/// Why: Same temporal-invariant reasoning as `TAG_SUBJECT_PREFIX`. Hashtag
/// mentions and other discovered topical terms become their own subjects so
/// multiple mentions per drawer survive the assert pipeline.
/// What: `topic:` — concatenated with the lower-cased hashtag term and
/// emitted as `topic:<term> mentioned-in drawer:<id>`.
/// Test: `extract_triples_emits_hashtag_mentions`.
pub const TOPIC_SUBJECT_PREFIX: &str = "topic:";
107
/// Subject prefix used for room entities.
///
/// Why: A drawer can only sit in one room, but encoding the room as its own
/// subject keeps the graph topology consistent (all "discovered metadata"
/// entities live under prefixed namespaces) and lets multiple drawers from
/// the same room cluster around a shared room node.
/// What: `room:` — concatenated with the trimmed room label (letter case is
/// preserved, unlike tags) and emitted as `room:<r> contains drawer:<id>`.
/// Test: `extract_triples_emits_tag_triples`.
pub const ROOM_SUBJECT_PREFIX: &str = "room:";
117
118/// Build the drawer subject string used as the (s) for every per-drawer
119/// triple emitted by this module.
120///
121/// Why: Centralises the `drawer:<uuid>` encoding so call sites cannot drift.
122/// What: Returns `format!("{DRAWER_SUBJECT_PREFIX}{id}")`.
123/// Test: covered by every extractor test.
124pub fn drawer_subject(id: Uuid) -> String {
125    format!("{DRAWER_SUBJECT_PREFIX}{id}")
126}
127
/// Inputs to a single extraction pass.
///
/// Why: Bundling the inputs keeps `extract_triples` signature small and lets
/// us add new fields (e.g. drawer_type) without breaking call sites.
/// What: Plain data struct; all fields are borrowed so the caller keeps
/// ownership.
/// Test: indirectly via every test that constructs one.
#[derive(Debug, Clone)]
pub struct ExtractInput<'a> {
    /// Id of the source drawer; rendered through `drawer_subject` and used as
    /// the object of the tag / room / topic triples.
    pub drawer_id: Uuid,
    /// Free-form drawer text, scanned for `#hashtag` tokens and for the
    /// phrase patterns in `PATTERN_TABLE`.
    pub content: &'a str,
    /// Drawer tags. They are checked against the deny-list first; each
    /// non-blank tag is then trimmed, lower-cased and emitted as a `tag:` node.
    pub tags: &'a [String],
    /// Optional room label. When present and non-blank it is trimmed and
    /// emitted as a `room:` node.
    pub room: Option<&'a str>,
}
142
143/// Run the deterministic heuristic extractor with default config.
144///
145/// Why: Convenience wrapper that uses [`KgExtractConfig::default`] (the
146/// production deny-list) so call sites that do not need a custom config
147/// remain unchanged.
148/// What: Delegates to [`extract_triples_with_config`] with a default config.
149/// Test: All existing tests call this helper and implicitly exercise the default
150/// deny-list path.
151pub fn extract_triples(input: &ExtractInput<'_>) -> Vec<Triple> {
152    extract_triples_with_config(input, &KgExtractConfig::default())
153}
154
155/// Run the deterministic heuristic extractor.
156///
157/// Why: Single entry point so `memory_remember`, `memory_note`, and the
158/// back-fill CLI all share the same logic. Pure function — no I/O, no async —
159/// so it can be unit-tested cheaply. Accepts a [`KgExtractConfig`] so callers
160/// can override the deny-list without touching the function signature.
161/// What: First checks whether any of the drawer's tags appear in
162/// `config.deny_tags` (case-insensitive); when a match is found the function
163/// returns immediately with an empty vec and logs a debug message. Otherwise
164/// walks `tags`, content tokens, and a small pattern list to emit `Triple`s;
165/// deduplicates so the same `(subject, predicate, object)` never appears twice
166/// in a single pass.
167/// Test: `extract_triples_skips_denied_tags`, `extract_triples_emits_tag_triples`,
168/// plus all other tests in this file.
169pub fn extract_triples_with_config(
170    input: &ExtractInput<'_>,
171    config: &KgExtractConfig<'_>,
172) -> Vec<Triple> {
173    // Deny-list check: if any tag on this drawer is in the deny set, skip
174    // extraction entirely. The check is case-insensitive to tolerate mixed-
175    // case tags from different clients.
176    let denied = input.tags.iter().any(|t| {
177        let lower = t.trim().to_lowercase();
178        config.deny_tags.contains(&lower.as_str())
179    });
180    if denied {
181        tracing::debug!(
182            drawer_id = %input.drawer_id,
183            tags = ?input.tags,
184            "kg_extract: skipping drawer — tag matches deny-list"
185        );
186        return Vec::new();
187    }
188    let now = Utc::now();
189    let subject = drawer_subject(input.drawer_id);
190    let mut out: Vec<Triple> = Vec::new();
191    let mut seen: HashSet<(String, String, String)> = HashSet::new();
192
193    let push = |out: &mut Vec<Triple>,
194                seen: &mut HashSet<(String, String, String)>,
195                s: String,
196                p: String,
197                o: String| {
198        let key = (s.clone(), p.clone(), o.clone());
199        if seen.insert(key) {
200            out.push(Triple {
201                subject: s,
202                predicate: p,
203                object: o,
204                valid_from: now,
205                valid_to: None,
206                confidence: AUTO_CONFIDENCE,
207                provenance: Some(AUTO_PROVENANCE.to_string()),
208            });
209        }
210    };
211
212    // Tag membership — each tag becomes its own subject so multiple tags on
213    // the same drawer don't collide under the "one active triple per
214    // (s, p)" invariant. Edge direction is `tag:<t> tags drawer:<id>` so the
215    // graph clusters drawers under their shared tag nodes.
216    for tag in input.tags {
217        let clean = tag.trim();
218        if clean.is_empty() {
219            continue;
220        }
221        push(
222            &mut out,
223            &mut seen,
224            format!("{TAG_SUBJECT_PREFIX}{}", clean.to_lowercase()),
225            "tags".to_string(),
226            subject.clone(),
227        );
228    }
229
230    // Room membership — `room:<r> contains drawer:<id>` for the same reason
231    // (multiple drawers per room must coexist).
232    if let Some(room) = input.room {
233        let clean = room.trim();
234        if !clean.is_empty() {
235            push(
236                &mut out,
237                &mut seen,
238                format!("{ROOM_SUBJECT_PREFIX}{clean}"),
239                "contains".to_string(),
240                subject.clone(),
241            );
242        }
243    }
244
245    // Hashtag-style mentions — `topic:<term> mentioned-in drawer:<id>` so
246    // multiple terms per drawer can coexist as distinct active edges.
247    for term in extract_hashtags(input.content) {
248        push(
249            &mut out,
250            &mut seen,
251            format!("{TOPIC_SUBJECT_PREFIX}{term}"),
252            "mentioned-in".to_string(),
253            subject.clone(),
254        );
255    }
256
257    // Simple natural-language patterns. Each yields a free-form
258    // `<subject> <predicate> <object>` triple anchored to entities found in
259    // the content (not the drawer subject), so the graph develops topical
260    // edges over time.
261    for (s, p, o) in extract_patterns(input.content) {
262        push(&mut out, &mut seen, s, p, o);
263    }
264
265    out
266}
267
/// Collect the distinct `#hashtag` terms that appear in free-form content.
///
/// Why: A hashtag is a deliberate, low-cost signal from the author — `#rust`
/// or `#design-doc` should surface as a topic node in the graph.
/// What: Every `#` starts a candidate; the term is the run of ASCII letters,
/// digits, `_` and `-` that immediately follows it, lower-cased. Empty runs
/// (a lone `#`, or `#` followed by any other character) are dropped, and
/// repeated terms are reported once, in order of first appearance.
/// Test: `extract_triples_emits_hashtag_mentions`.
fn extract_hashtags(content: &str) -> Vec<String> {
    let mut emitted: HashSet<String> = HashSet::new();
    content
        .split('#')
        // The piece before the first `#` is not preceded by a hash sign.
        .skip(1)
        .map(|after_hash| {
            after_hash
                .chars()
                .take_while(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-'))
                .map(|ch| ch.to_ascii_lowercase())
                .collect::<String>()
        })
        .filter(|term| !term.is_empty() && emitted.insert(term.clone()))
        .collect()
}
301
/// Pattern dictionary used by `extract_patterns`.
///
/// Why: A small, predictable set of (predicate, marker phrases) keeps the
/// extractor explicable and deterministic. Each entry maps a predicate to one
/// or more marker phrases; when a marker appears in the lower-cased content
/// we split on it and read the entity tokens immediately to each side.
/// What: A static slice of `(predicate, &[marker, ...])`. Markers are stored
/// lower-case and already include one leading and one trailing ASCII space;
/// `extract_patterns` searches for them verbatim, so a marker only matches
/// when it is delimited by plain spaces in the content. Within a row, markers
/// are tried in the order listed.
/// Test: `extract_triples_extracts_is_a_pattern`.
const PATTERN_TABLE: &[(&str, &[&str])] = &[
    ("is-a", &[" is a ", " is an "]),
    ("works-at", &[" works at "]),
    ("uses", &[" uses ", " using "]),
    ("depends-on", &[" depends on ", " requires "]),
];
319
320/// Apply the pattern table to a single content blob.
321///
322/// Why: Keeps the matching loop out of `extract_triples` so the dispatcher
323/// stays readable.
324/// What: For every `(predicate, markers)` row, scan every marker against the
325/// lower-cased content; on the first hit emit `(left_token, predicate,
326/// right_token)` and move on to the next predicate. Only the first hit per
327/// predicate is taken to avoid combinatorial output on long texts.
328/// Test: `extract_triples_extracts_is_a_pattern`.
329fn extract_patterns(content: &str) -> Vec<(String, String, String)> {
330    let lower = content.to_lowercase();
331    let mut out: Vec<(String, String, String)> = Vec::new();
332    for (predicate, markers) in PATTERN_TABLE {
333        for marker in *markers {
334            if let Some(idx) = lower.find(marker) {
335                let left = lower[..idx].trim();
336                let right_start = idx + marker.len();
337                let right = lower[right_start..].trim();
338                let subject_tok = last_token(left);
339                let object_tok = first_token(right);
340                if !subject_tok.is_empty() && !object_tok.is_empty() {
341                    out.push((subject_tok, (*predicate).to_string(), object_tok));
342                }
343                break;
344            }
345        }
346    }
347    out
348}
349
/// Pull the final whitespace-delimited token from a fragment.
///
/// Why: The left side of a pattern hit can contain arbitrary preamble; the
/// entity we care about is the noun immediately before the marker. Quotes or
/// sentence punctuation hugging that noun must not leak into the KG subject
/// (`"rustc"` should become `rustc`, not `"rustc`).
/// What: Takes the last whitespace-delimited token and strips the characters
/// `, . ; : ! ? " '` from both of its edges; punctuation inside the token is
/// kept. Returns an empty string when the fragment has no token or the token
/// consists only of those characters.
/// Test: indirectly via `extract_triples_extracts_is_a_pattern`.
fn last_token(s: &str) -> String {
    const EDGE_PUNCT: [char; 8] = [',', '.', ';', ':', '!', '?', '"', '\''];
    s.split_whitespace()
        // `next_back` reads from the end instead of walking every token.
        .next_back()
        .map(|tok| tok.trim_matches(|c: char| EDGE_PUNCT.contains(&c)))
        .unwrap_or("")
        .to_string()
}
363
/// Pull the first whitespace-delimited token from a fragment.
///
/// Why: Mirror of `last_token` for the right side of a pattern hit. The
/// object noun is often followed by sentence punctuation (`compiler.`) or
/// wrapped in quotes (`"compiler"`); neither should end up in the KG object.
/// What: Takes the first whitespace-delimited token and strips the characters
/// `, . ; : ! ? " '` from both of its edges; punctuation inside the token is
/// kept. Returns an empty string when the fragment has no token or the token
/// consists only of those characters.
/// Test: indirectly via `extract_triples_extracts_is_a_pattern`.
fn first_token(s: &str) -> String {
    const EDGE_PUNCT: [char; 8] = [',', '.', ';', ':', '!', '?', '"', '\''];
    s.split_whitespace()
        .next()
        .map(|tok| tok.trim_matches(|c: char| EDGE_PUNCT.contains(&c)))
        .unwrap_or("")
        .to_string()
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Why: Every test needs a fresh drawer id plus owned tag strings; doing
    /// that in one place means each input literal is written exactly once.
    /// What: Builds an `ExtractInput` from the arguments, runs
    /// `extract_triples` (default deny-list) and returns the generated drawer
    /// id together with the emitted triples.
    /// Test: used by most tests below.
    fn run(content: &str, tags: &[&str], room: Option<&str>) -> (Uuid, Vec<Triple>) {
        let drawer_id = Uuid::new_v4();
        let owned_tags: Vec<String> = tags.iter().map(|t| (*t).to_string()).collect();
        let triples = extract_triples(&ExtractInput {
            drawer_id,
            content,
            tags: &owned_tags,
            room,
        });
        (drawer_id, triples)
    }

    /// Why: Most assertions are "does this exact edge exist".
    /// What: True when some triple matches subject, predicate and object.
    /// Test: used by the tests below.
    fn has_edge(triples: &[Triple], subject: &str, predicate: &str, object: &str) -> bool {
        triples
            .iter()
            .any(|t| t.subject == subject && t.predicate == predicate && t.object == object)
    }

    /// Why: Tag-derived triples are the lowest-hanging extraction and the
    /// graph view's first signal when no patterns fire. The KG's temporal
    /// model only allows one active triple per `(subject, predicate)`, so
    /// each tag becomes its own subject (`tag:<name>`) with a `tags`
    /// predicate pointing at the drawer.
    /// What: One `tag:<t> tags drawer:<id>` per non-empty tag, plus
    /// `room:<r> contains drawer:<id>` when a room is supplied.
    /// Test: This test.
    #[test]
    fn extract_triples_emits_tag_triples() {
        let (id, triples) = run("hello world", &["rust", "design"], Some("Backend"));
        let drawer = drawer_subject(id);
        assert!(has_edge(&triples, "tag:rust", "tags", &drawer));
        assert!(has_edge(&triples, "tag:design", "tags", &drawer));
        assert!(has_edge(&triples, "room:Backend", "contains", &drawer));
    }

    /// Why: Hashtag tokens are a cheap user signal; the extractor must catch
    /// them so the graph picks up topical entities.
    /// What: `#rust` and `#design-doc` both become `topic:<term>
    /// mentioned-in drawer:<id>` triples, lower-cased and deduplicated.
    /// Test: This test.
    #[test]
    fn extract_triples_emits_hashtag_mentions() {
        let (_, triples) = run("see #Rust and #design-doc and #rust again", &[], None);
        let mention_subjects: Vec<&str> = triples
            .iter()
            .filter(|t| t.predicate == "mentioned-in")
            .map(|t| t.subject.as_str())
            .collect();
        assert!(mention_subjects.contains(&"topic:rust"));
        assert!(mention_subjects.contains(&"topic:design-doc"));
        // Dedupe — `#rust` and `#Rust` collapse.
        assert_eq!(
            mention_subjects
                .iter()
                .filter(|s| **s == "topic:rust")
                .count(),
            1
        );
    }

    /// Why: `is a` is the simplest NL pattern and the most common idiom in
    /// quick notes ("rustc is a compiler").
    /// What: Pattern fires once per content blob; subject and object are the
    /// nouns either side of the marker.
    /// Test: This test.
    #[test]
    fn extract_triples_extracts_is_a_pattern() {
        let (_, triples) = run("rustc is a compiler for rust", &[], None);
        assert!(has_edge(&triples, "rustc", "is-a", "compiler"));
    }

    /// Why: Confidence and provenance are guard-rails — extracted triples
    /// must be recognisable and over-ridable.
    /// What: Every triple carries `provenance = Some("auto:remember")` and
    /// `confidence == AUTO_CONFIDENCE`.
    /// Test: This test.
    #[test]
    fn extract_triples_stamps_provenance() {
        let (_, triples) = run("anything", &["x"], None);
        assert!(!triples.is_empty());
        for t in &triples {
            assert_eq!(t.provenance.as_deref(), Some(AUTO_PROVENANCE));
            assert!((t.confidence - AUTO_CONFIDENCE).abs() < f32::EPSILON);
        }
    }

    /// Why: Reduced confidence is the contract a manual `kg_assert` of the
    /// same `(subject, predicate)` needs in order to "win" against the
    /// auto-extracted edge.
    /// What: The constant is asserted to stay strictly between 0.0 and 1.0 so
    /// manual asserts always rank higher.
    /// Test: This test.
    #[test]
    // Both bounds are static facts about AUTO_CONFIDENCE; the assertions are
    // documentation for future tweakers, hence the lint exemption.
    #[allow(clippy::assertions_on_constants)]
    fn extract_triples_uses_reduced_confidence() {
        assert!(AUTO_CONFIDENCE < 1.0);
        assert!(AUTO_CONFIDENCE > 0.0);
    }

    /// Why: Empty / whitespace-only content must not panic or emit garbage.
    /// What: No tags, no room, empty or whitespace-only content → empty vec.
    /// Test: This test.
    #[test]
    fn extract_triples_never_panics_on_empty_input() {
        for content in ["", "   ", " \n\t "] {
            let (_, triples) = run(content, &[], None);
            assert!(
                triples.is_empty(),
                "content {content:?} must not produce triples"
            );
        }
    }

    /// Why: Edge-case test — content with no patterns but tags should still
    /// produce the tag triples (the graph view's primary signal).
    /// What: Single tag, no room, prose with no pattern hits → exactly one
    /// triple shaped as `tag:meeting tags drawer:<id>`.
    /// Test: This test.
    #[test]
    fn extract_triples_tags_only_path() {
        let (id, triples) = run("Discussed roadmap.", &["meeting"], None);
        assert_eq!(triples.len(), 1);
        assert_eq!(triples[0].subject, "tag:meeting");
        assert_eq!(triples[0].predicate, "tags");
        assert_eq!(triples[0].object, drawer_subject(id));
    }

    /// Why: Drawers tagged with deny-listed labels (test fixtures, QA scaffolding)
    /// must not pollute the KG with non-factual content.
    /// What: A drawer with the `test` tag must produce zero triples even when
    /// it also has a room and content with extractable patterns.
    /// Test: This test.
    #[test]
    fn extract_triples_skips_denied_tags() {
        let (_, triples) = run("rustc is a compiler", &["test", "rust"], Some("Backend"));
        assert!(
            triples.is_empty(),
            "a drawer with a deny-list tag must produce zero triples, got {triples:?}"
        );
    }

    /// Why: Deny-list matching is case-insensitive so `TEST` and `Test` are
    /// blocked the same as `test`.
    /// What: A drawer tagged `FIXTURE` (upper-case) must still produce zero
    /// triples.
    /// Test: This test.
    #[test]
    fn extract_triples_deny_list_is_case_insensitive() {
        let (_, triples) = run("some content", &["FIXTURE"], None);
        assert!(
            triples.is_empty(),
            "upper-cased deny tag must still be blocked"
        );
    }

    /// Why: An empty deny-list (e.g. in integration tests that want to exercise
    /// extraction regardless of tags) must not suppress any triples.
    /// What: Calling `extract_triples_with_config` with `deny_tags = &[]` on a
    /// drawer tagged `test` must produce the normal tag triple.
    /// Test: This test.
    #[test]
    fn extract_triples_empty_deny_list_passes_through() {
        let id = Uuid::new_v4();
        let tags = vec!["test".to_string()];
        let config = KgExtractConfig { deny_tags: &[] };
        let triples = extract_triples_with_config(
            &ExtractInput {
                drawer_id: id,
                content: "anything",
                tags: &tags,
                room: None,
            },
            &config,
        );
        // "test" tag should produce a tag triple when the deny-list is empty.
        assert!(
            !triples.is_empty(),
            "empty deny-list must not suppress extraction"
        );
    }
}