use chrono::Utc;
use std::collections::HashSet;
use trusty_common::memory_core::store::kg::Triple;
use uuid::Uuid;
/// Deny-list used by [`KgExtractConfig::default`].
///
/// A drawer carrying any of these tags yields no triples at all. Input tags
/// are trimmed and lowercased before being compared against this list.
pub const DEFAULT_DENY_TAGS: &[&str] = &["cross-project-qa", "test", "fixture"];
/// Settings that control triple extraction for a drawer.
#[derive(Debug, Clone)]
pub struct KgExtractConfig<'a> {
/// Tags that suppress extraction entirely: if any tag on the drawer matches
/// an entry here, the extractor returns an empty list. Drawer tags are
/// trimmed and lowercased before comparison, so prefer lowercase entries.
/// An empty slice disables the deny-list.
pub deny_tags: &'a [&'a str],
}
impl Default for KgExtractConfig<'_> {
    /// Builds the stock configuration, whose deny-list is [`DEFAULT_DENY_TAGS`].
    fn default() -> Self {
        let deny_tags = DEFAULT_DENY_TAGS;
        Self { deny_tags }
    }
}
/// Provenance label stamped on every triple produced by this extractor.
pub const AUTO_PROVENANCE: &str = "auto:remember";
/// Confidence value assigned to every triple produced by this extractor.
pub const AUTO_CONFIDENCE: f32 = 0.6;
/// Prefix of the string that identifies a drawer (see [`drawer_subject`]).
pub const DRAWER_SUBJECT_PREFIX: &str = "drawer:";
/// Prefix for tag-derived subjects; followed by the trimmed, lowercased tag.
pub const TAG_SUBJECT_PREFIX: &str = "tag:";
/// Prefix for subjects derived from `#hashtag` mentions in drawer content.
pub const TOPIC_SUBJECT_PREFIX: &str = "topic:";
/// Prefix for room-derived subjects; followed by the trimmed room name with
/// its original casing.
pub const ROOM_SUBJECT_PREFIX: &str = "room:";
/// Returns the graph identifier for a drawer: [`DRAWER_SUBJECT_PREFIX`]
/// followed by the textual form of `id`.
pub fn drawer_subject(id: Uuid) -> String {
    let mut subject = String::from(DRAWER_SUBJECT_PREFIX);
    subject.push_str(&id.to_string());
    subject
}
/// Borrowed view of the drawer data that triple extraction reads.
#[derive(Debug, Clone)]
pub struct ExtractInput<'a> {
/// Identifier of the drawer; turned into the `drawer:<id>` string that the
/// tag, room and hashtag triples point at.
pub drawer_id: Uuid,
/// Free text of the drawer, scanned for `#hashtags` and phrase patterns.
pub content: &'a str,
/// Tags attached to the drawer. Each non-blank tag yields a tag triple, and
/// every tag is checked against the configured deny-list.
pub tags: &'a [String],
/// Optional room name; a non-blank value yields a room triple.
pub room: Option<&'a str>,
}
/// Extracts triples for one drawer using the default [`KgExtractConfig`].
///
/// Convenience wrapper around [`extract_triples_with_config`].
pub fn extract_triples(input: &ExtractInput<'_>) -> Vec<Triple> {
    let defaults = KgExtractConfig::default();
    extract_triples_with_config(input, &defaults)
}
/// Extracts knowledge-graph triples for a single drawer under `config`.
///
/// If any of the drawer's tags matches an entry of `config.deny_tags`, the
/// drawer is skipped and an empty vector is returned. Both the drawer tags and
/// the deny-list entries are trimmed and lowercased before comparison, so the
/// match is case-insensitive regardless of how either side is spelled.
///
/// Otherwise the following triples are emitted, in this order, de-duplicated
/// on `(subject, predicate, object)`:
///
/// 1. `tag:<tag> --tags--> drawer:<id>` for each non-blank tag (trimmed and
///    lowercased).
/// 2. `room:<room> --contains--> drawer:<id>` when `room` is non-blank
///    (trimmed, casing preserved).
/// 3. `topic:<term> --mentioned-in--> drawer:<id>` for each `#hashtag` found
///    in the content.
/// 4. Whatever `extract_patterns` derives from the content.
///
/// Every triple shares one `valid_from` timestamp taken at call time, has no
/// `valid_to`, and carries [`AUTO_CONFIDENCE`] and [`AUTO_PROVENANCE`].
pub fn extract_triples_with_config(
    input: &ExtractInput<'_>,
    config: &KgExtractConfig<'_>,
) -> Vec<Triple> {
    // Normalise the deny-list once so that entries such as "Fixture" or
    // " test " still match; previously only the drawer tags were normalised.
    let deny: HashSet<String> = config
        .deny_tags
        .iter()
        .map(|entry| entry.trim().to_lowercase())
        .collect();
    let denied = !deny.is_empty()
        && input
            .tags
            .iter()
            .any(|tag| deny.contains(tag.trim().to_lowercase().as_str()));
    if denied {
        tracing::debug!(
            drawer_id = %input.drawer_id,
            tags = ?input.tags,
            "kg_extract: skipping drawer — tag matches deny-list"
        );
        return Vec::new();
    }

    // One timestamp for the whole batch so all triples agree on `valid_from`.
    let now = Utc::now();
    let subject = drawer_subject(input.drawer_id);
    let mut out: Vec<Triple> = Vec::new();
    let mut seen: HashSet<(String, String, String)> = HashSet::new();

    // Appends a triple unless an identical (subject, predicate, object) was
    // already emitted during this call.
    let mut push = |s: String, p: String, o: String| {
        if seen.insert((s.clone(), p.clone(), o.clone())) {
            out.push(Triple {
                subject: s,
                predicate: p,
                object: o,
                valid_from: now,
                valid_to: None,
                confidence: AUTO_CONFIDENCE,
                provenance: Some(AUTO_PROVENANCE.to_string()),
            });
        }
    };

    for tag in input.tags {
        let clean = tag.trim();
        if clean.is_empty() {
            continue;
        }
        push(
            format!("{TAG_SUBJECT_PREFIX}{}", clean.to_lowercase()),
            "tags".to_string(),
            subject.clone(),
        );
    }

    if let Some(room) = input.room {
        let clean = room.trim();
        if !clean.is_empty() {
            push(
                format!("{ROOM_SUBJECT_PREFIX}{clean}"),
                "contains".to_string(),
                subject.clone(),
            );
        }
    }

    for term in extract_hashtags(input.content) {
        push(
            format!("{TOPIC_SUBJECT_PREFIX}{term}"),
            "mentioned-in".to_string(),
            subject.clone(),
        );
    }

    for (s, p, o) in extract_patterns(input.content) {
        push(s, p, o);
    }

    out
}
/// Collects the distinct `#hashtag` terms in `content`, in order of first
/// appearance.
///
/// A term is the run of ASCII letters, digits, `_` and `-` immediately after
/// a `#`; it is lowercased, and a `#` with no such run is ignored.
fn extract_hashtags(content: &str) -> Vec<String> {
    let mut already: HashSet<String> = HashSet::new();
    let mut topics: Vec<String> = Vec::new();
    // Every piece after the first starts right behind a '#'. A term can never
    // contain '#', so the leading run of each piece is exactly one hashtag.
    for piece in content.split('#').skip(1) {
        let word: String = piece
            .chars()
            .take_while(|&ch| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-'))
            .map(|ch| ch.to_ascii_lowercase())
            .collect();
        if !word.is_empty() && !already.contains(&word) {
            already.insert(word.clone());
            topics.push(word);
        }
    }
    topics
}
/// Phrase patterns recognised in drawer content.
///
/// Each entry pairs a predicate with the marker phrases that signal it.
/// Markers are searched in lowercased content in the order listed, and the
/// leading and trailing spaces are part of the match.
const PATTERN_TABLE: &[(&str, &[&str])] = &[
("is-a", &[" is a ", " is an "]),
("works-at", &[" works at "]),
("uses", &[" uses ", " using "]),
("depends-on", &[" depends on ", " requires "]),
];
/// Derives `(subject, predicate, object)` triples from phrase patterns in
/// `content`.
///
/// The content is lowercased, then for each row of [`PATTERN_TABLE`] the first
/// listed marker that occurs anywhere in the text is used (first occurrence
/// only). The subject is the word just before the marker and the object the
/// word just after it. A row yields at most one triple, and none when either
/// word is empty.
fn extract_patterns(content: &str) -> Vec<(String, String, String)> {
    let haystack = content.to_lowercase();
    let mut found: Vec<(String, String, String)> = Vec::new();
    for &(relation, phrases) in PATTERN_TABLE {
        // Byte range of the first listed phrase that is present, if any.
        let hit = phrases
            .iter()
            .find_map(|phrase| haystack.find(phrase).map(|at| (at, at + phrase.len())));
        if let Some((start, end)) = hit {
            // The tokenisers split on whitespace, so no pre-trimming is needed.
            let subject = last_token(&haystack[..start]);
            let object = first_token(&haystack[end..]);
            if !(subject.is_empty() || object.is_empty()) {
                found.push((subject, relation.to_string(), object));
            }
        }
    }
    found
}
/// Returns the last whitespace-separated word of `s`, or an empty string when
/// `s` contains no word.
///
/// Trailing `, . ; : ! ? " '` are stripped, and so are leading quote
/// characters, so a quoted word such as `"rustc"` comes back as `rustc`
/// rather than with a dangling opening quote.
fn last_token(s: &str) -> String {
    s.split_whitespace()
        // The iterator is double-ended; take the final word directly instead
        // of walking every word with `last()`.
        .next_back()
        .map(|t| {
            t.trim_end_matches([',', '.', ';', ':', '!', '?', '"', '\''])
                .trim_start_matches(['"', '\''])
        })
        .unwrap_or("")
        .to_string()
}
/// Returns the first whitespace-separated word of `s`, or an empty string
/// when `s` contains no word.
///
/// Trailing `, . ; : ! ? " '` are stripped, and so are leading quote
/// characters, so a quoted word such as `"tokio"` comes back as `tokio`
/// rather than with a dangling opening quote.
fn first_token(s: &str) -> String {
    s.split_whitespace()
        .next()
        .map(|t| {
            t.trim_end_matches([',', '.', ';', ':', '!', '?', '"', '\''])
                .trim_start_matches(['"', '\''])
        })
        .unwrap_or("")
        .to_string()
}
// Unit tests for triple extraction: tag/room/hashtag/pattern triples,
// provenance stamping, and deny-list handling.
#[cfg(test)]
mod tests {
use super::*;
// Test helper: returns a fresh random drawer id together with owned copies
// of `tags`. NOTE: `content` and `room` are accepted but deliberately
// ignored; callers pass those values to `ExtractInput` themselves.
fn input_for(content: &str, tags: &[&str], room: Option<&str>) -> (Uuid, Vec<String>) {
let id = Uuid::new_v4();
let owned_tags: Vec<String> = tags.iter().map(|s| s.to_string()).collect();
let _ = content; let _ = room;
(id, owned_tags)
}
// Each tag yields `tag:<tag> --tags--> drawer`, and the room yields
// `room:<room> --contains--> drawer` with the room's casing preserved.
#[test]
fn extract_triples_emits_tag_triples() {
let (id, tags) = input_for("hello world", &["rust", "design"], Some("Backend"));
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "hello world",
tags: &tags,
room: Some("Backend"),
});
let object = drawer_subject(id);
assert!(triples
.iter()
.any(|t| t.subject == "tag:rust" && t.predicate == "tags" && t.object == object));
assert!(triples
.iter()
.any(|t| t.subject == "tag:design" && t.predicate == "tags" && t.object == object));
assert!(triples.iter().any(|t| t.subject == "room:Backend"
&& t.predicate == "contains"
&& t.object == object));
}
// Hashtags are lowercased and de-duplicated: `#Rust` and `#rust` produce a
// single `topic:rust` mention.
#[test]
fn extract_triples_emits_hashtag_mentions() {
let (id, tags) = input_for("see #Rust and #design-doc and #rust again", &[], None);
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "see #Rust and #design-doc and #rust again",
tags: &tags,
room: None,
});
let mention_subjects: Vec<&str> = triples
.iter()
.filter(|t| t.predicate == "mentioned-in")
.map(|t| t.subject.as_str())
.collect();
assert!(mention_subjects.contains(&"topic:rust"));
assert!(mention_subjects.contains(&"topic:design-doc"));
assert_eq!(
mention_subjects
.iter()
.filter(|s| **s == "topic:rust")
.count(),
1
);
}
// The " is a " marker yields an `is-a` triple from the words on either side.
#[test]
fn extract_triples_extracts_is_a_pattern() {
let (id, _) = input_for("rustc is a compiler for rust", &[], None);
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "rustc is a compiler for rust",
tags: &[],
room: None,
});
assert!(triples
.iter()
.any(|t| t.subject == "rustc" && t.predicate == "is-a" && t.object == "compiler"));
}
// Every emitted triple carries AUTO_PROVENANCE and AUTO_CONFIDENCE.
#[test]
fn extract_triples_stamps_provenance() {
let (id, tags) = input_for("anything", &["x"], None);
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "anything",
tags: &tags,
room: None,
});
assert!(!triples.is_empty());
for t in &triples {
assert_eq!(t.provenance.as_deref(), Some(AUTO_PROVENANCE));
assert!((t.confidence - AUTO_CONFIDENCE).abs() < f32::EPSILON);
}
}
// Guards the constant itself: AUTO_CONFIDENCE must lie strictly between 0 and 1.
#[test]
#[allow(clippy::assertions_on_constants)]
fn extract_triples_uses_reduced_confidence() {
assert!(AUTO_CONFIDENCE < 1.0);
assert!(AUTO_CONFIDENCE > 0.0);
}
// Empty content, no tags and no room must produce no triples (and not panic).
#[test]
fn extract_triples_never_panics_on_empty_input() {
let id = Uuid::new_v4();
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "",
tags: &[],
room: None,
});
assert!(triples.is_empty());
}
// Content without hashtags or pattern markers: the single tag is the only
// source of triples, so exactly one is expected.
#[test]
fn extract_triples_tags_only_path() {
let id = Uuid::new_v4();
let tags = vec!["meeting".to_string()];
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "Discussed roadmap.",
tags: &tags,
room: None,
});
assert_eq!(triples.len(), 1);
assert_eq!(triples[0].subject, "tag:meeting");
assert_eq!(triples[0].predicate, "tags");
assert_eq!(triples[0].object, drawer_subject(id));
}
// One deny-listed tag ("test") suppresses all triples for the drawer,
// including those from its other tags, its room and its content.
#[test]
fn extract_triples_skips_denied_tags() {
let id = Uuid::new_v4();
let tags = vec!["test".to_string(), "rust".to_string()];
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "rustc is a compiler",
tags: &tags,
room: Some("Backend"),
});
assert!(
triples.is_empty(),
"a drawer with a deny-list tag must produce zero triples, got {triples:?}"
);
}
// Drawer tags are lowercased before the deny-list check, so "FIXTURE" is
// blocked by the default "fixture" entry.
#[test]
fn extract_triples_deny_list_is_case_insensitive() {
let id = Uuid::new_v4();
let tags = vec!["FIXTURE".to_string()];
let triples = extract_triples(&ExtractInput {
drawer_id: id,
content: "some content",
tags: &tags,
room: None,
});
assert!(
triples.is_empty(),
"upper-cased deny tag must still be blocked"
);
}
// With an empty deny-list, a tag that the default list would block ("test")
// is extracted normally.
#[test]
fn extract_triples_empty_deny_list_passes_through() {
let id = Uuid::new_v4();
let tags = vec!["test".to_string()];
let config = KgExtractConfig { deny_tags: &[] };
let triples = extract_triples_with_config(
&ExtractInput {
drawer_id: id,
content: "anything",
tags: &tags,
room: None,
},
&config,
);
assert!(
!triples.is_empty(),
"empty deny-list must not suppress extraction"
);
}
}