Skip to main content

sqlite_graphrag/
extraction.rs

1//! Entity and URL extraction pipeline (v1.0.76).
2//!
3//! v1.0.76: the default build is **LLM-only**. The legacy GLiNER NER
4//! pipeline moved to `extraction_gliner.rs` and is gated behind the
5//! `ner-legacy` feature. The default build extracts:
6//!
7//! - **URLs** via a plain substring scan (always available, no model needed).
8//! - **Entities** via the `ExtractionBackend` trait (LLM headless).
9//!   The default backend is `LlmBackend` (claude / codex), which produces
10//!   structured entities and relationships via tool-use JSON.
11//!
12//! The `extract_graph_auto` function below is the entry point used by
13//! `remember`, `ingest`, and `enrich`. With the default feature set, it
14//! runs only the URL scan and returns an empty entity list (no LLM call
15//! is made here). Operators who want the legacy GLiNER NER can build with
16//! `--features ner-legacy` (transition window only; removed in v1.1.0).
17
18use serde::{Deserialize, Serialize};
19
20/// One URL extracted from a body. Always produced by the regex path.
21#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
22pub struct ExtractedUrl {
23    pub url: String,
24    pub start: usize,
25    pub end: usize,
26}
27
28/// One named-entity mention. The default build produces these via the
29/// LLM extraction backend; the ner-legacy build produces them via GLiNER.
30#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
31pub struct ExtractedEntity {
32    pub name: String,
33    pub entity_type: String,
34    pub start: usize,
35    pub end: usize,
36}
37
38/// Full extraction result: URLs (regex), entities (LLM), and the
39/// relationships between them. The LLM backend also returns typed
40/// relationships directly in `ExtractionOutput`; this struct is the
41/// regex-only baseline that `remember` and `ingest` consume.
42#[derive(Debug, Clone, Default, PartialEq, Serialize, Deserialize)]
43pub struct ExtractionResult {
44    pub entities: Vec<ExtractedEntity>,
45    pub urls: Vec<ExtractedUrl>,
46    /// Wall-clock latency in milliseconds.
47    pub elapsed_ms: u64,
48}
49
50/// GLiNER model variant enum. Only meaningful with the `ner-legacy`
51/// feature. In the default build, the variant is ignored and extraction
52/// is delegated to the LLM.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54pub enum GlinerVariant {
55    Fp32,
56    Int8,
57}
58
59impl GlinerVariant {
60    pub fn as_filename(self) -> &'static str {
61        match self {
62            Self::Fp32 => "model.onnx",
63            Self::Int8 => "model_int8.onnx",
64        }
65    }
66    pub fn display_size(self) -> &'static str {
67        match self {
68            Self::Fp32 => "1.1 GB",
69            Self::Int8 => "349 MB",
70        }
71    }
72}
73
74/// Trait abstraction for any extractor. The LLM backend and the
75/// GLiNER backend (ner-legacy) both implement it.
76pub trait Extractor: Send + Sync {
77    fn name(&self) -> &'static str;
78    fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError>;
79}
80
/// URL-only extractor: reports the URLs found by `extract_urls` and never
/// produces entities.
///
/// Despite the name, no regular-expression engine is involved; the scan is
/// a plain substring search. The type is a stateless unit struct, so it is
/// free to copy, compare and default-construct.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RegexExtractor;
84
85impl Extractor for RegexExtractor {
86    fn name(&self) -> &'static str {
87        "regex"
88    }
89    fn extract(&self, body: &str) -> Result<ExtractionResult, crate::errors::AppError> {
90        Ok(ExtractionResult {
91            entities: Vec::new(),
92            urls: extract_urls(body),
93            elapsed_ms: 0,
94        })
95    }
96}
97
98/// Extracts URLs from `body` using a substring scan. UTF-8 safe; offsets
99/// are byte indices into the input.
100pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
101    let mut out = Vec::new();
102    let mut cursor = 0usize;
103    while cursor < body.len() {
104        let hay = &body[cursor..];
105        // Find the next URL boundary, considering both schemes.
106        let http_at = hay.find("http://");
107        let https_at = hay.find("https://");
108        let (rel_start, scheme_len) = match (http_at, https_at) {
109            (Some(a), Some(b)) => {
110                if a <= b {
111                    (a, 7)
112                } else {
113                    (b, 8)
114                }
115            }
116            (Some(a), None) => (a, 7),
117            (None, Some(b)) => (b, 8),
118            (None, None) => break,
119        };
120        let abs_start = cursor + rel_start;
121        let after_scheme = abs_start + scheme_len;
122        let mut end = after_scheme;
123        for (i, c) in body[after_scheme..].char_indices() {
124            if c.is_whitespace() || matches!(c, ')' | ']' | '}' | '"' | '\'' | '<') {
125                end = after_scheme + i;
126                break;
127            }
128            end = after_scheme + i + c.len_utf8();
129        }
130        out.push(ExtractedUrl {
131            url: body[abs_start..end].to_string(),
132            start: abs_start,
133            end,
134        });
135        cursor = end;
136    }
137    out
138}
139
/// Top-level extraction entry point for builds with the `ner-legacy`
/// feature enabled.
///
/// Delegates to `crate::extraction_gliner::extract_graph_auto` and adapts
/// its output to this module's [`ExtractionResult`]:
///
/// - each legacy entity becomes an [`ExtractedEntity`] with its name and
///   its type rendered via `to_string()`;
/// - each legacy URL becomes an [`ExtractedUrl`] whose `start` is the legacy
///   `offset` and whose `end` is `offset + url.len()`;
/// - `elapsed_ms` is the wall-clock time spent in this call.
///
/// # Errors
///
/// Returns the legacy pipeline's error, converted into `AppError`.
#[cfg(feature = "ner-legacy")]
pub fn extract_graph_auto(
    body: &str,
    paths: &crate::paths::AppPaths,
    gliner_variant: GlinerVariant,
) -> Result<ExtractionResult, crate::errors::AppError> {
    let started = std::time::Instant::now();

    // The legacy module declares its own variant enum; map ours onto it.
    let legacy_variant = match gliner_variant {
        GlinerVariant::Fp32 => crate::extraction_gliner::GlinerVariant::Fp32,
        GlinerVariant::Int8 => crate::extraction_gliner::GlinerVariant::Int8,
    };
    let extracted = crate::extraction_gliner::extract_graph_auto(body, paths, legacy_variant)
        .map_err(crate::errors::AppError::from)?;

    let entities: Vec<ExtractedEntity> = extracted
        .entities
        .into_iter()
        .map(|entity| ExtractedEntity {
            name: entity.name,
            entity_type: entity.entity_type.to_string(),
            // NOTE(review): mention offsets are not carried over from the
            // legacy entity; both are reported as 0. Confirm whether the
            // legacy type exposes a span that should be forwarded here.
            start: 0,
            end: 0,
        })
        .collect();

    let urls: Vec<ExtractedUrl> = extracted
        .urls
        .into_iter()
        .map(|url| {
            let start = url.offset;
            // Compute `end` before `url.url` is moved into the new struct.
            let end = start + url.url.len();
            ExtractedUrl {
                url: url.url,
                start,
                end,
            }
        })
        .collect();

    // Saturate instead of truncating if the u128 millisecond count ever
    // exceeded u64 (practically unreachable, but avoids a lossy `as` cast).
    let elapsed_ms = u64::try_from(started.elapsed().as_millis()).unwrap_or(u64::MAX);

    Ok(ExtractionResult {
        entities,
        urls,
        elapsed_ms,
    })
}
179
180#[cfg(not(feature = "ner-legacy"))]
181pub fn extract_graph_auto(
182    body: &str,
183    _paths: &crate::paths::AppPaths,
184    _gliner_variant: GlinerVariant,
185) -> Result<ExtractionResult, crate::errors::AppError> {
186    let start = std::time::Instant::now();
187    let urls = extract_urls(body);
188    Ok(ExtractionResult {
189        entities: Vec::new(),
190        urls,
191        elapsed_ms: start.elapsed().as_millis() as u64,
192    })
193}
194
#[cfg(test)]
mod tests {
    use super::*;

    // Both schemes are picked up, in the order they appear in the text.
    #[test]
    fn extract_urls_finds_http_and_https() {
        let body = "see https://example.com/foo and http://bar.baz/qux end";
        let urls = extract_urls(body);
        assert_eq!(urls.len(), 2, "got {urls:?} for body {body:?}");
        let texts: Vec<&str> = urls.iter().map(|u| u.url.as_str()).collect();
        assert_eq!(texts, ["https://example.com/foo", "http://bar.baz/qux"]);
    }

    // A closing parenthesis right after the URL must not be captured.
    #[test]
    fn extract_urls_handles_trailing_punctuation() {
        let urls = extract_urls("see https://example.com/foo).");
        assert_eq!(urls.len(), 1);
        let only = &urls[0];
        assert_eq!(only.url, "https://example.com/foo");
    }

    // No input, no URLs.
    #[test]
    fn extract_urls_empty_body() {
        let urls = extract_urls("");
        assert!(urls.is_empty());
    }

    // The size labels are fixed strings per variant.
    #[test]
    fn gliner_variant_size_strings() {
        let expectations = [
            (GlinerVariant::Fp32, "1.1 GB"),
            (GlinerVariant::Int8, "349 MB"),
        ];
        for (variant, label) in expectations {
            assert_eq!(variant.display_size(), label);
        }
    }

    // The regex extractor reports URLs and leaves the entity list empty.
    #[test]
    fn regex_extractor_returns_only_urls() {
        let ExtractionResult { entities, urls, .. } =
            RegexExtractor.extract("see https://example.com").unwrap();
        assert!(entities.is_empty());
        assert_eq!(urls.len(), 1);
    }
}