mnem_extract/traits.rs
//! Public traits and value types for mnem-extract.
//!
//! The [`Extractor`] trait is the single integration point between an
//! ingest pipeline and any statistical / LLM-backed extractor. The
//! default implementation is [`crate::keybert::KeyBertExtractor`].
6
7use serde::{Deserialize, Serialize};
8
9/// An entity mention located in a chunk of source text.
10///
11/// Fields are deliberately flat and serde-round-trippable so that an
12/// ingest pipeline can attach the list verbatim to a Node's `props`
13/// bag or persist it as an audit artefact without an intermediate DTO.
14#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
15pub struct Entity {
16 /// Surface form of the mention, exactly as it appears in the
17 /// source text (whitespace-normalised but otherwise untouched).
18 pub mention: String,
19 /// Extractor-assigned score in `[0.0, 1.0]`. For the KeyBERT
20 /// extractor this is the cosine similarity between the candidate
21 /// embedding and the chunk embedding after MMR diversification.
22 pub score: f32,
23 /// Byte span `(start, end)` of the mention in the original chunk
24 /// text. `end` is exclusive, matching `str::get(start..end)`.
25 pub span: (usize, usize),
26}
27
28/// A candidate relation between two previously-extracted entities.
29///
30/// The payload stays flat: a statistical miner emits raw
31/// `(subject_mention, object_mention, weight)` triples without
32/// predicate names. Callers that want to name the edge type map
33/// [`ExtractionSource`] → edge label at ingest time.
34#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
35pub struct Relation {
36 /// Subject entity mention.
37 pub src: String,
38 /// Object entity mention.
39 pub dst: String,
40 /// Extractor-assigned weight. For PMI-based mining this is the
41 /// pointwise mutual information in natural-log units.
42 pub weight: f32,
43 /// Provenance of the triple.
44 pub source: ExtractionSource,
45}
46
47/// How an [`Entity`] or [`Relation`] was produced.
48///
49/// The enum is `#[non_exhaustive]` so downstream crates can add new
50/// variants (e.g. a gazetteer source) without a semver break in the
51/// consumers that only match on the KeyBERT + authored cases.
52#[non_exhaustive]
53#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
54#[serde(rename_all = "snake_case")]
55pub enum ExtractionSource {
56 /// Hand-authored by the caller (e.g. entities from a front-matter
57 /// YAML block or an explicit CLI flag). Always trusted.
58 Authored,
59 /// Produced by a statistical extractor in this crate (KeyBERT,
60 /// co-occurrence PMI).
61 Statistical,
62 /// Produced by an LLM-backed extractor; the inner string is the
63 /// fully-qualified model identifier so provenance survives
64 /// round-tripping.
65 LlmModel(String),
66}
67
68/// Pluggable statistical entity + relation extractor.
69///
70/// Implementations MUST be `Send + Sync` so `mnem-ingest` can hand them
71/// across thread boundaries when a future batch driver parallelises
72/// ingest. They SHOULD be deterministic: the proptest harness under
73/// `tests/proptest_determinism.rs` enforces byte-identical output for
74/// the in-crate default.
75pub trait Extractor: Send + Sync {
76 /// Extract entity mentions from `text`.
77 ///
78 /// `chunk_embed` is the embedding of the enclosing chunk, produced
79 /// by the same [`mnem_embed_providers::Embedder`] the extractor
80 /// will use for candidates. Its length MUST match the embedder's
81 /// `dim()`; mismatches are a programming error and extractors may
82 /// return an empty vec in that case rather than panic.
83 fn extract_entities(&self, text: &str, chunk_embed: &[f32]) -> Vec<Entity>;
84
85 /// Mine candidate relations over an already-extracted entity set.
86 ///
87 /// The default implementation returns an empty vec, letting
88 /// callers opt into relation mining explicitly via the
89 /// [`crate::cooccurrence`] module.
90 fn extract_relations(&self, _text: &str, _entities: &[Entity]) -> Vec<Relation> {
91 Vec::new()
92 }
93
94 /// Optionally infer *typed* relations between already-extracted
95 /// entities, subject to the supplied [`InferenceBudget`].
96 ///
97 /// Gated behind the `typed-relations` Cargo feature. Default OFF
98 /// per solution.md R3: no extractor emits typed relations unless
99 /// the caller explicitly opts in at ingest time.
100 ///
101 /// The default implementation returns an empty vec - safe for
102 /// every existing extractor, no behaviour change on the default
103 /// build. Implementors must enforce:
104 ///
105 /// 1. Every emitted [`TypedRelation`] carries
106 /// `source_label = "inferred:<method>"` (auto-derived by
107 /// [`TypedRelation::new`]).
108 /// 2. Wall-clock work does not exceed `budget.effective_ms()`.
109 /// 3. Emitted edge count does not exceed `budget.max_types`.
110 ///
111 /// Downstream consumers (PPR, multihop) MUST gate admission with
112 /// [`crate::trust::TrustBoundary::admit`] before using any edge.
113 #[cfg(feature = "typed-relations")]
114 fn infer_typed_relations(
115 &self,
116 _text: &str,
117 _entities: &[Entity],
118 _budget: &crate::inference::InferenceBudget,
119 ) -> Vec<crate::inference::TypedRelation> {
120 Vec::new()
121 }
122}