// mnem_extract/traits.rs

//! Public traits and value types for mnem-extract.
//!
//! The [`Extractor`] trait is the single integration point between an
//! ingest pipeline and any statistical / LLM-backed extractor. The
//! default implementation is [`crate::keybert::KeyBertExtractor`].

use serde::{Deserialize, Serialize};

9/// An entity mention located in a chunk of source text.
10///
11/// Fields are deliberately flat and serde-round-trippable so that an
12/// ingest pipeline can attach the list verbatim to a Node's `props`
13/// bag or persist it as an audit artefact without an intermediate DTO.
14#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
15pub struct Entity {
16    /// Surface form of the mention, exactly as it appears in the
17    /// source text (whitespace-normalised but otherwise untouched).
18    pub mention: String,
19    /// Extractor-assigned score in `[0.0, 1.0]`. For the KeyBERT
20    /// extractor this is the cosine similarity between the candidate
21    /// embedding and the chunk embedding after MMR diversification.
22    pub score: f32,
23    /// Byte span `(start, end)` of the mention in the original chunk
24    /// text. `end` is exclusive, matching `str::get(start..end)`.
25    pub span: (usize, usize),
26}
27
28/// A candidate relation between two previously-extracted entities.
29///
30/// The payload stays flat: a statistical miner emits raw
31/// `(subject_mention, object_mention, weight)` triples without
32/// predicate names. Callers that want to name the edge type map
33/// [`ExtractionSource`] → edge label at ingest time.
34#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
35pub struct Relation {
36    /// Subject entity mention.
37    pub src: String,
38    /// Object entity mention.
39    pub dst: String,
40    /// Extractor-assigned weight. For PMI-based mining this is the
41    /// pointwise mutual information in natural-log units.
42    pub weight: f32,
43    /// Provenance of the triple.
44    pub source: ExtractionSource,
45}
46
47/// How an [`Entity`] or [`Relation`] was produced.
48///
49/// The enum is `#[non_exhaustive]` so downstream crates can add new
50/// variants (e.g. a gazetteer source) without a semver break in the
51/// consumers that only match on the KeyBERT + authored cases.
52#[non_exhaustive]
53#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
54#[serde(rename_all = "snake_case")]
55pub enum ExtractionSource {
56    /// Hand-authored by the caller (e.g. entities from a front-matter
57    /// YAML block or an explicit CLI flag). Always trusted.
58    Authored,
59    /// Produced by a statistical extractor in this crate (KeyBERT,
60    /// co-occurrence PMI).
61    Statistical,
62    /// Produced by an LLM-backed extractor; the inner string is the
63    /// fully-qualified model identifier so provenance survives
64    /// round-tripping.
65    LlmModel(String),
66}
67
68/// Pluggable statistical entity + relation extractor.
69///
70/// Implementations MUST be `Send + Sync` so `mnem-ingest` can hand them
71/// across thread boundaries when a future batch driver parallelises
72/// ingest. They SHOULD be deterministic: the proptest harness under
73/// `tests/proptest_determinism.rs` enforces byte-identical output for
74/// the in-crate default.
75pub trait Extractor: Send + Sync {
76    /// Extract entity mentions from `text`.
77    ///
78    /// `chunk_embed` is the embedding of the enclosing chunk, produced
79    /// by the same [`mnem_embed_providers::Embedder`] the extractor
80    /// will use for candidates. Its length MUST match the embedder's
81    /// `dim()`; mismatches are a programming error and extractors may
82    /// return an empty vec in that case rather than panic.
83    fn extract_entities(&self, text: &str, chunk_embed: &[f32]) -> Vec<Entity>;
84
85    /// Mine candidate relations over an already-extracted entity set.
86    ///
87    /// The default implementation returns an empty vec, letting
88    /// callers opt into relation mining explicitly via the
89    /// [`crate::cooccurrence`] module.
90    fn extract_relations(&self, _text: &str, _entities: &[Entity]) -> Vec<Relation> {
91        Vec::new()
92    }
93
94    /// Optionally infer *typed* relations between already-extracted
95    /// entities, subject to the supplied [`InferenceBudget`].
96    ///
97    /// Gated behind the `typed-relations` Cargo feature. Default OFF
98    /// per solution.md R3: no extractor emits typed relations unless
99    /// the caller explicitly opts in at ingest time.
100    ///
101    /// The default implementation returns an empty vec - safe for
102    /// every existing extractor, no behaviour change on the default
103    /// build. Implementors must enforce:
104    ///
105    /// 1. Every emitted [`TypedRelation`] carries
106    ///    `source_label = "inferred:<method>"` (auto-derived by
107    ///    [`TypedRelation::new`]).
108    /// 2. Wall-clock work does not exceed `budget.effective_ms()`.
109    /// 3. Emitted edge count does not exceed `budget.max_types`.
110    ///
111    /// Downstream consumers (PPR, multihop) MUST gate admission with
112    /// [`crate::trust::TrustBoundary::admit`] before using any edge.
113    #[cfg(feature = "typed-relations")]
114    fn infer_typed_relations(
115        &self,
116        _text: &str,
117        _entities: &[Entity],
118        _budget: &crate::inference::InferenceBudget,
119    ) -> Vec<crate::inference::TypedRelation> {
120        Vec::new()
121    }
122}