sqlite_graphrag/
extraction.rs

1//! Entity and URL extraction pipeline (NER + regex prefilter).
2//!
3//! Runs named-entity recognition and regex heuristics to extract structured
4//! entities and hyperlinks from raw memory bodies before embedding.
5
6use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use anyhow::{Context, Result};
10use ort::session::{builder::GraphOptimizationLevel, Session};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use unicode_normalization::UnicodeNormalization;
14
15use crate::entity_type::EntityType;
16use crate::paths::AppPaths;
17use crate::storage::entities::{NewEntity, NewRelationship};
18
/// Upper bound on the number of entities considered per memory body: it caps
/// the GLiNER output and the number of entities fed to relationship building.
const MAX_ENTS: usize = 30;
/// Minimum accepted entity length. The callers in this module compare it
/// against `str::len()`, i.e. a byte count rather than a character count.
const MIN_ENTITY_CHARS: usize = 2;
/// Relation label attached to automatically generated edges.
const DEFAULT_RELATION: &str = "mentions";
/// Per-entity fan-out limit of the legacy `build_relationships`.
///
/// v1.0.31 A9: that function is only compiled for unit tests that pin the cap
/// behaviour, so this constant is test-only as well.
#[cfg(test)]
const TOP_K_RELATIONS: usize = 5;
26
27static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
28static REGEX_URL: OnceLock<Regex> = OnceLock::new();
29static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
30static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
31// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
32static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
33// v1.0.25 P0-2: captures CamelCase brand names that NER model often misses (e.g. "OpenAI", "PostgreSQL").
34static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35
// Upper-case words that must never become entities.
//
// History:
// - v1.0.20: stopwords for common PT-BR/EN rule words captured by the ALL_CAPS
//   pattern. Technical PT-BR corpora with CAPS-formatted rules (NUNCA, PROIBIDO,
//   DEVE) were producing ~70% "garbage entities". Identifiers such as MAX_RETRY
//   (with underscore) are kept.
// - v1.0.22: expanded with terms observed in a 495-file flowaiper stress test:
//   verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns (BANCO,
//   CASO) and generic data formats (JSON, XML). HTTP methods are kept in the
//   separate `HTTP_METHODS` table below.
// - v1.0.24: 17 terms from the v1.0.23 audit: generic status words (COMPLETED,
//   DONE, FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE,
//   RECUSE), PT-BR modal/common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns
//   (BORDA, CHECKLIST, PLAN, TOKEN) and common abbreviations (ACK, ACL).
// - v1.0.25 P0-4: technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT, LLM,
//   REST, UI, URL) and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE, PASSO,
//   SEÇÃO), so section markers and generic tech terms are not extracted.
// - v1.0.31 A11: PT-BR upper-case noise seen while ingesting technical
//   Portuguese rule documents, i.e. common nouns/adjectives written in caps for
//   emphasis (ADAPTER, PROJETO, PASSIVA, ATIVA, SOMENTE, LEITURA, ESCRITA,
//   OBRIGATORIA, EXEMPLO, REGRA, DEFAULT). Each one kept leaking as a "concept"
//   entity and inflating the graph with non-entities.
//
// Lookups are exact, case-sensitive string matches.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACID",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADAPTER",
    "ADICIONADA",
    "ADICIONADAS",
    "ADICIONADO",
    "ADICIONADOS",
    "ADICIONAR",
    "AGENTS",
    "AINDA",
    "ALL",
    "ALTA",
    "ALWAYS",
    "APENAS",
    "API",
    "ARTEFATOS",
    "ATIVA",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CEO",
    "CHECKLIST",
    "CLARO",
    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRMARAM",
    "CONFIRME",
    "CONFIRMEI",
    "CONFIRMOU",
    "CONTRATO",
    "CRIE",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DDL",
    "DEFAULT",
    "DEFINIR",
    "DEPARTMENT",
    "DESC",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "DSL",
    "DTO",
    "EFEITO",
    "ENTRADA",
    "EOF",
    "EPERM",
    "ERROR",
    "ESCREVA",
    "ESCRITA",
    "ESRCH",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTADO",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXEMPLO",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FATO",
    "FIFO",
    "FIXED",
    "FIXME",
    "FLUXO",
    "FONTES",
    "FORBIDDEN",
    "FUNCIONA",
    "GNU",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LEITURA",
    "LLM",
    "MCP",
    "MESMO",
    "METADADOS",
    "MUST",
    "NDJSON",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATORIA",
    "OBRIGATÓRIO",
    "OBSERVEI",
    "PADRÃO",
    "PASSIVA",
    "PASSO",
    "PENDING",
    "PGID",
    "PID",
    "PLAN",
    "PODEMOS",
    "PONTEIROS",
    "PREFERIR",
    "PROIBIDO",
    "PROJETO",
    "RECUSE",
    "REGRA",
    "REGRAS",
    "REMOVIDAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SIGTERM",
    "SOMENTE",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "TUI",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];

// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps path).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];

/// Returns `true` when an upper-case `token` must be dropped instead of being
/// emitted as an entity.
///
/// A token is filtered when it is listed verbatim in [`ALL_CAPS_STOPWORDS`] or
/// [`HTTP_METHODS`]. Because both lookups are exact matches, identifiers that
/// contain an underscore (e.g. `MAX_RETRY`, `FLOWAIPER_API_KEY`) are preserved
/// unless they have been added to a list explicitly.
///
/// The previous implementation returned early for every token containing `_`,
/// which made explicitly listed identifiers such as
/// `CLAUDE_STREAM_IDLE_TIMEOUT_MS` impossible to filter.
fn is_filtered_all_caps(token: &str) -> bool {
    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
}
219
220fn regex_email() -> &'static Regex {
221    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
222    REGEX_EMAIL.get_or_init(|| {
223        Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224            .expect("compile-time validated email regex literal")
225    })
226}
227
228fn regex_url() -> &'static Regex {
229    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
230    REGEX_URL.get_or_init(|| {
231        Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232            .expect("compile-time validated URL regex literal")
233    })
234}
235
236fn regex_uuid() -> &'static Regex {
237    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
238    REGEX_UUID.get_or_init(|| {
239        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240            .expect("compile-time validated UUID regex literal")
241    })
242}
243
244fn regex_all_caps() -> &'static Regex {
245    REGEX_ALL_CAPS.get_or_init(|| {
246        Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247            .expect("compile-time validated all-caps regex literal")
248    })
249}
250
251fn regex_section_marker() -> &'static Regex {
252    REGEX_SECTION_MARKER.get_or_init(|| {
253        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1",
254        // "Camada 5", "Passo 2", etc. v1.0.36 (H5): added "Camada" after audit found
255        // "Camada 1".."Camada 5" leaking through into entity extraction with degree>=3.
256        // Accented characters expressed as escapes to keep this source file ASCII-only
257        // per the project language policy. Pattern is equivalent to:
258        //   \b(?:Etapa|Fase|Passo|Camada|Se\xe7\xe3o|Cap\xedtulo)\s+\d+\b
259        Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260            .expect("compile-time validated section marker regex literal")
261    })
262}
263
264fn regex_brand_camel() -> &'static Regex {
265    REGEX_BRAND_CAMEL.get_or_init(|| {
266        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
267        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
268        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269            .expect("compile-time validated CamelCase brand regex literal")
270    })
271}
272
273#[derive(Debug, Clone, PartialEq)]
274pub struct ExtractedEntity {
275    pub name: String,
276    pub entity_type: EntityType,
277}
278
/// URL with source offset extracted from the memory body.
///
/// Derives `PartialEq`/`Eq` (like [`ExtractedEntity`] derives `PartialEq`) so
/// extraction results can be compared directly, e.g. in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractedUrl {
    /// URL text after trailing punctuation has been trimmed.
    pub url: String,
    /// Byte position in the body where the URL match starts.
    pub offset: usize,
}
286
287#[derive(Debug, Clone)]
288pub struct ExtractionResult {
289    pub entities: Vec<NewEntity>,
290    pub relationships: Vec<NewRelationship>,
291    /// True when build_relationships hit the cap before covering all entity pairs.
292    /// Exposed in RememberResponse so callers can detect when relationships were cut.
293    pub relationships_truncated: bool,
294    /// Extraction method used: `"gliner-<variant>+regex"` or `"regex-only"`.
295    /// Useful for auditing, metrics and user reports.
296    pub extraction_method: String,
297    /// URLs extracted from the body — stored separately from graph entities.
298    pub urls: Vec<ExtractedUrl>,
299}
300
301pub trait Extractor: Send + Sync {
302    fn extract(&self, body: &str) -> Result<ExtractionResult>;
303}
304
305/// GLiNER ONNX model quantization variant.
306#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
307pub enum GlinerVariant {
308    Fp32,
309    Fp16,
310    Int8,
311    Q4,
312    Q4f16,
313}
314
315impl GlinerVariant {
316    /// ONNX filename for this variant in the HuggingFace repository.
317    pub fn as_filename(self) -> &'static str {
318        match self {
319            Self::Fp32 => "model.onnx",
320            Self::Fp16 => "model_fp16.onnx",
321            Self::Int8 => "model_quantized.onnx",
322            Self::Q4 => "model_q4.onnx",
323            Self::Q4f16 => "model_q4f16.onnx",
324        }
325    }
326
327    /// Approximate model size for user-facing messages.
328    pub fn display_size(self) -> &'static str {
329        match self {
330            Self::Fp32 => "1.1 GB",
331            Self::Fp16 => "580 MB",
332            Self::Int8 => "349 MB",
333            Self::Q4 => "894 MB",
334            Self::Q4f16 => "472 MB",
335        }
336    }
337}
338
339impl std::fmt::Display for GlinerVariant {
340    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341        match self {
342            Self::Fp32 => f.write_str("fp32"),
343            Self::Fp16 => f.write_str("fp16"),
344            Self::Int8 => f.write_str("int8"),
345            Self::Q4 => f.write_str("q4"),
346            Self::Q4f16 => f.write_str("q4f16"),
347        }
348    }
349}
350
351impl std::str::FromStr for GlinerVariant {
352    type Err = anyhow::Error;
353    fn from_str(s: &str) -> Result<Self> {
354        match s.to_lowercase().as_str() {
355            "fp32" => Ok(Self::Fp32),
356            "fp16" => Ok(Self::Fp16),
357            "int8" => Ok(Self::Int8),
358            "q4" => Ok(Self::Q4),
359            "q4f16" => Ok(Self::Q4f16),
360            other => {
361                anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362            }
363        }
364    }
365}
366
/// Longest candidate span, in words, enumerated per start position.
const GLINER_MAX_WIDTH: usize = 12;
/// Sequence budget used to cap how many words of a body are sent to the model.
const GLINER_MAX_SEQ_LEN: usize = 384;
/// Prompt piece placed in front of every entity label.
const GLINER_ENT_TOKEN: &str = "<<ENT>>";
/// Prompt piece separating the label list from the text words.
const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371
372const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
373    ("person", EntityType::Person),
374    ("organization", EntityType::Organization),
375    ("location", EntityType::Location),
376    ("date", EntityType::Date),
377    ("project", EntityType::Project),
378    ("tool", EntityType::Tool),
379    ("file", EntityType::File),
380    ("concept", EntityType::Concept),
381    ("decision", EntityType::Decision),
382    ("incident", EntityType::Incident),
383    ("dashboard", EntityType::Dashboard),
384    ("issue tracker", EntityType::IssueTracker),
385    ("memory", EntityType::Memory),
386];
387
388struct GlinerModel {
389    session: parking_lot::Mutex<Session>,
390    tokenizer: tokenizers::Tokenizer,
391    #[allow(dead_code)]
392    variant: GlinerVariant,
393}
394
395impl GlinerModel {
396    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
397        let model_path = model_dir.join(variant.as_filename());
398        let tokenizer_path = model_dir.join("tokenizer.json");
399
400        let session = Session::builder()
401            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
402            .with_optimization_level(GraphOptimizationLevel::Level3)
403            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
404            .commit_from_file(&model_path)
405            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;
406
407        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
408            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;
409
410        Ok(Self {
411            session: parking_lot::Mutex::new(session),
412            tokenizer,
413            variant,
414        })
415    }
416
417    fn predict(
418        &self,
419        body: &str,
420        entity_labels: &[(&str, EntityType)],
421        threshold: f32,
422    ) -> Result<Vec<ExtractedEntity>> {
423        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
424        let words: Vec<&str> = body.split_whitespace().collect();
425        if words.is_empty() {
426            return Ok(Vec::new());
427        }
428
429        // Cap words to fit within model sequence length (accounting for label tokens)
430        let label_token_count = label_names.len() * 2 + 1;
431        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
432        let words = if words.len() > max_words {
433            tracing::warn!(target: "extraction",
434                original_words = words.len(),
435                capped_words = max_words,
436                "GLiNER input truncated to fit model sequence length"
437            );
438            &words[..max_words]
439        } else {
440            &words[..]
441        };
442        let num_words = words.len();
443
444        // Build prompt: [<<ENT>>, label1, <<ENT>>, label2, ..., <<SEP>>, word1, word2, ...]
445        let prompt_cap = label_names.len() * 2 + 1 + num_words;
446        let mut prompt_tokens: Vec<String> = Vec::new();
447        prompt_tokens.try_reserve(prompt_cap).map_err(|_| {
448            anyhow::anyhow!(
449                "allocation of {prompt_cap} prompt tokens would exceed available memory"
450            )
451        })?;
452        for label in &label_names {
453            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
454            prompt_tokens.push((*label).to_string());
455        }
456        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
457        for word in words {
458            prompt_tokens.push((*word).to_string());
459        }
460
461        // Encode each token individually (word-by-word encoding per GLiNER protocol)
462        let seq_estimate = prompt_tokens.len() * 3;
463        let mut all_ids: Vec<i64> = Vec::new();
464        all_ids.try_reserve(seq_estimate).map_err(|_| {
465            anyhow::anyhow!("allocation of {seq_estimate} token IDs would exceed available memory")
466        })?;
467        let mut all_attention: Vec<i64> = Vec::new();
468        all_attention.try_reserve(seq_estimate).map_err(|_| {
469            anyhow::anyhow!(
470                "allocation of {seq_estimate} attention masks would exceed available memory"
471            )
472        })?;
473        let mut all_word_mask: Vec<i64> = Vec::new();
474        all_word_mask.try_reserve(seq_estimate).map_err(|_| {
475            anyhow::anyhow!("allocation of {seq_estimate} word masks would exceed available memory")
476        })?;
477
478        // BOS token
479        all_ids.push(1);
480        all_attention.push(1);
481        all_word_mask.push(0);
482
483        let text_offset = label_names.len() * 2 + 1;
484        let mut word_id: i64 = 0;
485
486        for (pos, token_str) in prompt_tokens.iter().enumerate() {
487            let encoding = self
488                .tokenizer
489                .encode(token_str.as_str(), false)
490                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
491            let ids = encoding.get_ids();
492            let is_text_token = pos >= text_offset;
493
494            for (sub_idx, &id) in ids.iter().enumerate() {
495                all_ids.push(id as i64);
496                all_attention.push(1);
497                if is_text_token && sub_idx == 0 {
498                    word_id += 1;
499                    all_word_mask.push(word_id);
500                } else {
501                    all_word_mask.push(0);
502                }
503            }
504        }
505
506        // EOS token
507        all_ids.push(2);
508        all_attention.push(1);
509        all_word_mask.push(0);
510
511        let seq_len = all_ids.len();
512
513        // Build ORT tensors using Tensor::from_array((shape, data)) API
514        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
515            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
516        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
517            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
518        let t_words_mask =
519            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
520                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
521        let t_text_lengths =
522            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
523                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;
524
525        // Build span tensors
526        let num_spans = num_words * GLINER_MAX_WIDTH;
527        let mut span_idx_data = vec![0i64; num_spans * 2];
528        let mut span_mask_data = vec![false; num_spans];
529
530        for start in 0..num_words {
531            let remaining = num_words - start;
532            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
533            for width in 0..actual_max_width {
534                let dim = start * GLINER_MAX_WIDTH + width;
535                span_idx_data[dim * 2] = start as i64;
536                span_idx_data[dim * 2 + 1] = (start + width) as i64;
537                span_mask_data[dim] = true;
538            }
539        }
540
541        let t_span_idx =
542            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
543                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
544        let t_span_mask =
545            ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
546                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;
547
548        // Run inference — Session::run requires &mut Session; bind guard first.
549        let mut session_guard = self.session.lock();
550        let outputs = session_guard
551            .run(ort::inputs![
552                "input_ids" => t_input_ids,
553                "attention_mask" => t_attention,
554                "words_mask" => t_words_mask,
555                "text_lengths" => t_text_lengths,
556                "span_idx" => t_span_idx,
557                "span_mask" => t_span_mask
558            ])
559            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;
560
561        // Extract logits: [1, num_words, max_width, num_classes]
562        // try_extract_tensor returns (&Shape, &[f32]); index manually.
563        let (logits_shape, logits_data) = outputs["logits"]
564            .try_extract_tensor::<f32>()
565            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;
566
567        let num_classes = label_names.len();
568        // Expected shape: [1, num_words, GLINER_MAX_WIDTH, num_classes]
569        // Shape derefs to &[i64] so we can index directly.
570        let max_width = logits_shape
571            .get(2)
572            .copied()
573            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
574        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;
575
576        let candidates_cap = num_words * max_width;
577        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();
578        candidates.try_reserve(candidates_cap).map_err(|_| {
579            anyhow::anyhow!(
580                "allocation of {candidates_cap} candidates would exceed available memory"
581            )
582        })?;
583
584        for start in 0..num_words {
585            for width in 0..max_width {
586                let end = start + width;
587                if end >= num_words {
588                    break;
589                }
590                for class_idx in 0..nc.min(num_classes) {
591                    // flat index: batch=0 * (num_words*max_width*nc) + start*(max_width*nc) + width*nc + class_idx
592                    let flat = start * (max_width * nc) + width * nc + class_idx;
593                    if flat >= logits_data.len() {
594                        break;
595                    }
596                    let raw = logits_data[flat];
597                    let score = 1.0 / (1.0 + (-raw).exp());
598                    if score >= threshold {
599                        candidates.push((start, end, class_idx, score));
600                    }
601                }
602            }
603        }
604
605        // Sort by score descending for greedy NMS
606        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));
607
608        // Greedy non-maximum suppression
609        let mut used = vec![false; num_words];
610        let mut entities: Vec<ExtractedEntity> = Vec::with_capacity(candidates.len().min(MAX_ENTS));
611
612        for (start, end, class_idx, _score) in &candidates {
613            let overlap = (*start..=*end).any(|i| used[i]);
614            if overlap {
615                continue;
616            }
617            for flag in used.iter_mut().take(*end + 1).skip(*start) {
618                *flag = true;
619            }
620            let text = words[*start..=*end].join(" ");
621            if text.len() < MIN_ENTITY_CHARS {
622                continue;
623            }
624            let entity_type = entity_labels[*class_idx].1;
625            entities.push(ExtractedEntity {
626                name: text,
627                entity_type,
628            });
629            if entities.len() >= MAX_ENTS {
630                break;
631            }
632        }
633
634        Ok(entities)
635    }
636}
637
638static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
639
640fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
641    paths.models.join(format!("gliner-multi-v2.1/{variant}"))
642}
643
644fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
645    let dir = gliner_model_dir(paths, variant);
646    std::fs::create_dir_all(&dir)
647        .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;
648
649    let model_file = dir.join(variant.as_filename());
650    let tokenizer_file = dir.join("tokenizer.json");
651
652    if model_file.exists() && tokenizer_file.exists() {
653        return Ok(dir);
654    }
655
656    let repo = crate::constants::gliner_model_repo();
657    tracing::info!(target: "extraction",
658        "Downloading GLiNER model ({variant}, ~{})...",
659        variant.display_size()
660    );
661    crate::output::emit_progress_i18n(
662        &format!(
663            "Downloading GLiNER model ({variant}, ~{})...",
664            variant.display_size()
665        ),
666        &format!(
667            "Baixando modelo GLiNER ({variant}, ~{})...",
668            variant.display_size()
669        ),
670    );
671
672    let api = huggingface_hub::api::sync::Api::new().with_context(|| "creating HF Hub client")?;
673    let hf_repo = api.model(repo);
674
675    let remote_model = format!("onnx/{}", variant.as_filename());
676    if !model_file.exists() {
677        let src = hf_repo
678            .get(&remote_model)
679            .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
680        std::fs::copy(&src, &model_file)
681            .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
682    }
683
684    if !tokenizer_file.exists() {
685        let src = hf_repo
686            .get("tokenizer.json")
687            .with_context(|| "downloading tokenizer.json from HF Hub")?;
688        std::fs::copy(&src, &tokenizer_file).with_context(|| "copying tokenizer.json to cache")?;
689    }
690
691    Ok(dir)
692}
693
694fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
695    let dir = ensure_gliner_model_files(paths, variant)?;
696    GlinerModel::load(&dir, variant)
697}
698
699fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
700    GLINER_MODEL
701        .get_or_init(|| match load_gliner_model(paths, variant) {
702            Ok(m) => Some(m),
703            Err(e) => {
704                tracing::warn!(target: "extraction", error = %e, "GLiNER model unavailable, graceful degradation");
705                None
706            }
707        })
708        .as_ref()
709}
710
711fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
712    let mut entities = Vec::with_capacity(16);
713    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(32);
714
715    let add = |entities: &mut Vec<ExtractedEntity>,
716               seen: &mut std::collections::HashSet<String>,
717               name: &str,
718               entity_type: EntityType| {
719        let name = name.trim().to_string();
720        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
721            entities.push(ExtractedEntity { name, entity_type });
722        }
723    };
724
725    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
726    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
727    let cleaned = regex_section_marker().replace_all(body, " ");
728    let cleaned = cleaned.as_ref();
729
730    for m in regex_email().find_iter(cleaned) {
731        // v1.0.20: email is "concept" (regex alone cannot distinguish person from mailing list/role).
732        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
733    }
734    for m in regex_uuid().find_iter(cleaned) {
735        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
736    }
737    for m in regex_all_caps().find_iter(cleaned) {
738        let candidate = m.as_str();
739        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
740        if !is_filtered_all_caps(candidate) {
741            add(&mut entities, &mut seen, candidate, EntityType::Concept);
742        }
743    }
744    // v1.0.25 P0-2: capture CamelCase brand names that NER model often misses.
745    // Maps to "organization" (V008 schema) because brand names are typically organisations.
746    for m in regex_brand_camel().find_iter(cleaned) {
747        let name = m.as_str();
748        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
749        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
750            add(&mut entities, &mut seen, name, EntityType::Organization);
751        }
752    }
753
754    entities
755}
756
757/// Extracts URLs from a memory body, deduplicated by text.
758/// URLs are stored in the `memory_urls` table separately from graph entities.
759/// v1.0.24: split of the URL block that polluted apply_regex_prefilter with entity_type='concept'.
760pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
761    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(8);
762    let mut result = Vec::with_capacity(4);
763    for m in regex_url().find_iter(body) {
764        let raw = m.as_str();
765        let cleaned = raw
766            .trim_end_matches('`')
767            .trim_end_matches(',')
768            .trim_end_matches('.')
769            .trim_end_matches(';')
770            .trim_end_matches(')')
771            .trim_end_matches(']')
772            .trim_end_matches('}');
773        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
774            result.push(ExtractedUrl {
775                url: cleaned.to_string(),
776                offset: m.start(),
777            });
778        }
779    }
780    result
781}
782
/// Legacy pairwise relationship builder, compiled for tests only.
///
/// Links each of the first `MAX_ENTS` entities to at most `TOP_K_RELATIONS`
/// of the entities that follow it, using `DEFAULT_RELATION` with strength 0.5,
/// and stops at `constants::max_relationships_per_memory()` edges.
///
/// Returns `(relationships, truncated)`. `truncated` is `true` whenever the
/// edge cap was reached while source entities were still being visited.
///
/// v1.0.31 A9: superseded by `build_relationships_by_sentence_cooccurrence` for
/// the auto-extraction pipeline, because pairing every entity with every other
/// yields a dense graph polluted with co-mentions across unrelated paragraphs.
/// Kept so unit tests can pin the cap behaviour.
#[cfg(test)]
fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
    if entities.len() < 2 {
        return (Vec::new(), false);
    }

    // v1.0.22: cap configurable via env var (constants::max_relationships_per_memory).
    // Allows users with dense corpora to increase beyond the default 50.
    let max_rels = crate::constants::max_relationships_per_memory();
    let considered = &entities[..entities.len().min(MAX_ENTS)];
    let n = considered.len();

    let mut rels: Vec<NewRelationship> = Vec::with_capacity(n.min(max_rels));
    let mut hit_cap = false;

    'sources: for (i, source) in considered.iter().enumerate() {
        if rels.len() >= max_rels {
            hit_cap = true;
            break;
        }

        // Targets always come after the source, so each unordered pair is
        // produced at most once and no duplicate tracking is required.
        for target in considered[i + 1..].iter().take(TOP_K_RELATIONS) {
            if rels.len() >= max_rels {
                hit_cap = true;
                break 'sources;
            }

            rels.push(NewRelationship {
                // clone needed: NewRelationship requires owned String for source/target
                source: source.name.clone(),
                target: target.name.clone(),
                relation: DEFAULT_RELATION.to_string(),
                strength: 0.5,
                description: None,
            });
        }
    }

    // v1.0.20: warn when relationships were truncated before covering all possible pairs.
    if hit_cap {
        tracing::warn!(target: "extraction",
            "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
            n.saturating_sub(1)
        );
    }

    (rels, hit_cap)
}
850
851/// v1.0.31 A9: build relationships only between entities that actually
852/// co-occur within the same sentence (split on `.`, `!`, `?`, newline).
853///
854/// The legacy `build_relationships` pairs every entity with every other,
855/// yielding a dense C(N,2) graph dominated by spurious "mentions" edges
856/// across unrelated sections. Restricting to sentence-level co-occurrence
857/// keeps the edges semantically meaningful while still respecting the
858/// configurable `max_relationships_per_memory` cap.
859///
860/// Returns `(relationships, truncated)` mirroring `build_relationships`.
861fn build_relationships_by_sentence_cooccurrence(
862    body: &str,
863    entities: &[NewEntity],
864) -> (Vec<NewRelationship>, bool) {
865    if entities.len() < 2 {
866        return (Vec::new(), false);
867    }
868
869    let max_rels = crate::constants::max_relationships_per_memory();
870    let lower_names: Vec<(usize, String)> = entities
871        .iter()
872        .take(MAX_ENTS)
873        .enumerate()
874        .map(|(i, e)| (i, e.name.to_lowercase()))
875        .collect();
876
877    let mut rels: Vec<NewRelationship> = Vec::with_capacity(max_rels);
878    let mut seen: std::collections::HashSet<(usize, usize)> =
879        std::collections::HashSet::with_capacity(max_rels);
880    let mut hit_cap = false;
881
882    for sentence in body.split(['.', '!', '?', '\n']) {
883        if sentence.trim().is_empty() {
884            continue;
885        }
886        let lower_sentence = sentence.to_lowercase();
887        let present: Vec<usize> = lower_names
888            .iter()
889            .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
890            .map(|(i, _)| *i)
891            .collect();
892
893        if present.len() < 2 {
894            continue;
895        }
896
897        let n = present.len();
898        for i in 0..n {
899            for j in (i + 1)..n {
900                if rels.len() >= max_rels {
901                    hit_cap = true;
902                    tracing::warn!(target: "extraction",
903                        "relationships truncated to {max_rels} during sentence-level pairing"
904                    );
905                    return (rels, hit_cap);
906                }
907                let ei = present[i];
908                let ej = present[j];
909                let key = (ei.min(ej), ei.max(ej));
910                if seen.insert(key) {
911                    rels.push(NewRelationship {
912                        source: entities[ei].name.clone(),
913                        target: entities[ej].name.clone(),
914                        relation: DEFAULT_RELATION.to_string(),
915                        strength: 0.5,
916                        description: None,
917                    });
918                }
919            }
920        }
921    }
922
923    (rels, hit_cap)
924}
925
926/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
927/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
928/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
929/// Conservative: only extends when the suffix is at most 7 characters.
930/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
931/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
932fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
933    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
934    // Matches: separator + digits + optional decimal + optional lowercase letter
935    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
936    let suffix_re = SUFFIX_RE.get_or_init(|| {
937        Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
938            .expect("compile-time validated numeric suffix regex literal")
939    });
940
941    entities
942        .into_iter()
943        .map(|ent| {
944            // Finds the first case-sensitive occurrence of the entity in the body
945            if let Some(pos) = body.find(&ent.name) {
946                let after_pos = pos + ent.name.len();
947                if after_pos < body.len() {
948                    let after = &body[after_pos..];
949                    if let Some(m) = suffix_re.find(after) {
950                        let suffix = m.as_str();
951                        // Conservative: cap suffix length to 7 chars to avoid grabbing
952                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
953                        if suffix.len() <= 7 {
954                            let mut extended = String::with_capacity(ent.name.len() + suffix.len());
955                            extended.push_str(&ent.name);
956                            extended.push_str(suffix);
957                            return ExtractedEntity {
958                                name: extended,
959                                entity_type: ent.entity_type,
960                            };
961                        }
962                    }
963                }
964            }
965            ent
966        })
967        .collect()
968}
969
970/// Captures versioned model names that NER model consistently misses.
971///
972/// NER model often classifies tokens like "Claude" or "Llama" as common nouns,
973/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
974/// never sees these candidates and the version suffix gets lost.
975///
976/// This function scans the body with a conservative regex, matching capitalised
977/// words followed by a space-or-hyphen and a small integer. Matches that are not
978/// already covered by an existing entity (case-insensitive) are appended with the
979/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
980/// items downstream.
981///
982/// v1.0.24 P2-D: regex extended to cover:
983/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
984/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
985/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
986///
987/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
988/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
989fn augment_versioned_model_names(
990    entities: Vec<ExtractedEntity>,
991    body: &str,
992) -> Vec<ExtractedEntity> {
993    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
994    // Pattern breakdown:
995    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
996    //   [\s\-]+               — separator: space(s) or hyphen(s)
997    //   \d+(?:\.\d+)?         — version number, optional decimal
998    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
999    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
1000    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
1001        Regex::new(
1002            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
1003        )
1004        .expect("compile-time validated versioned model regex literal")
1005    });
1006
1007    let mut existing_lc: std::collections::HashSet<String> =
1008        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
1009    let mut result = entities;
1010
1011    for caps in model_re.captures_iter(body) {
1012        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
1013        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
1014        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
1015        if full_match.is_empty() || full_match.len() > 24 {
1016            continue;
1017        }
1018        let normalized_lc = full_match.to_lowercase();
1019        if existing_lc.contains(&normalized_lc) {
1020            continue;
1021        }
1022        // Stop appending once the global entity cap is reached to keep parity with
1023        // `merge_and_deduplicate` truncation semantics.
1024        if result.len() >= MAX_ENTS {
1025            break;
1026        }
1027        existing_lc.insert(normalized_lc);
1028        result.push(ExtractedEntity {
1029            name: full_match.to_string(),
1030            entity_type: EntityType::Concept,
1031        });
1032    }
1033
1034    result
1035}
1036
1037fn merge_and_deduplicate(
1038    regex_ents: Vec<ExtractedEntity>,
1039    ner_ents: Vec<ExtractedEntity>,
1040) -> Vec<ExtractedEntity> {
1041    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
1042    // and is scoped per entity_type. This fixes two bugs from prior versions:
1043    //
1044    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
1045    //    start_with "sonnet", so the pair could survive dedup depending on insertion
1046    //    order. contains() catches both directions unconditionally.
1047    //
1048    // 2. The lookup key omitted entity_type, so "Apple/organization" and
1049    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
1050    //
1051    // Earlier invariants preserved:
1052    // - NFKC normalization before lowercasing (v1.0.24).
1053    // - Longest-wins: on collision keep the entity with the longer name.
1054    // - Truncation warning at MAX_ENTS.
1055    let mut by_lc: std::collections::HashMap<String, usize> =
1056        std::collections::HashMap::with_capacity(regex_ents.len() + ner_ents.len());
1057    let mut result: Vec<ExtractedEntity> = Vec::with_capacity(MAX_ENTS);
1058    let mut truncated = false;
1059
1060    let total_input = regex_ents.len() + ner_ents.len();
1061    for ent in regex_ents.into_iter().chain(ner_ents) {
1062        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
1063        // Composite key: entity_type + NUL + normalised lowercase name.
1064        // Collision search is scoped to the same type so that e.g.
1065        // "Apple/organization" and "Apple/concept" are kept separately.
1066        let key = {
1067            let et = ent.entity_type.as_str();
1068            let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
1069            k.push_str(et);
1070            k.push('\0');
1071            k.push_str(&name_lc);
1072            k
1073        };
1074
1075        // Scan stored entries for substring containment within the same type.
1076        // Two names collide when one is a case-insensitive substring of the other:
1077        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
1078        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
1079        let type_prefix = {
1080            let et = ent.entity_type.as_str();
1081            let mut p = String::with_capacity(et.len() + 1);
1082            p.push_str(et);
1083            p.push('\0');
1084            p
1085        };
1086        let mut collision_idx: Option<usize> = None;
1087        for (existing_key, idx) in &by_lc {
1088            // Fast-path: check type prefix matches before scanning the name.
1089            if !existing_key.starts_with(&type_prefix) {
1090                continue;
1091            }
1092            let existing_name_lc = &existing_key[type_prefix.len()..];
1093            if existing_name_lc == name_lc
1094                || existing_name_lc.contains(name_lc.as_str())
1095                || name_lc.contains(existing_name_lc)
1096            {
1097                collision_idx = Some(*idx);
1098                break;
1099            }
1100        }
1101        match collision_idx {
1102            Some(idx) => {
1103                // Replace stored entity only when the new candidate is strictly
1104                // longer; otherwise drop the new one.
1105                if ent.name.len() > result[idx].name.len() {
1106                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1107                    let old_key = {
1108                        let et = result[idx].entity_type.as_str();
1109                        let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
1110                        k.push_str(et);
1111                        k.push('\0');
1112                        k.push_str(&old_name_lc);
1113                        k
1114                    };
1115                    by_lc.remove(&old_key);
1116                    result[idx] = ent;
1117                    by_lc.insert(key, idx);
1118                }
1119            }
1120            None => {
1121                by_lc.insert(key, result.len());
1122                result.push(ent);
1123            }
1124        }
1125        if result.len() >= MAX_ENTS {
1126            truncated = true;
1127            break;
1128        }
1129    }
1130
1131    // v1.0.20: warn when silent truncation discards entities above MAX_ENTS.
1132    if truncated {
1133        tracing::warn!(target: "extraction",
1134            "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
1135        );
1136    }
1137
1138    result
1139}
1140
1141fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1142    extracted
1143        .into_iter()
1144        .map(|e| NewEntity {
1145            name: e.name,
1146            entity_type: e.entity_type,
1147            description: None,
1148        })
1149        .collect()
1150}
1151
1152pub fn extract_graph_auto(
1153    body: &str,
1154    paths: &AppPaths,
1155    variant: GlinerVariant,
1156) -> Result<ExtractionResult> {
1157    let regex_entities = apply_regex_prefilter(body);
1158    let threshold = crate::constants::gliner_confidence_threshold();
1159
1160    let mut gliner_used = false;
1161    let ner_entities = match get_or_init_gliner(paths, variant) {
1162        Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1163            Ok(ents) => {
1164                gliner_used = true;
1165                ents
1166            }
1167            Err(e) => {
1168                tracing::warn!(target: "extraction", error = %e, "GLiNER NER failed, falling back to regex-only");
1169                Vec::new()
1170            }
1171        },
1172        None => Vec::new(),
1173    };
1174
1175    let merged = merge_and_deduplicate(regex_entities, ner_entities);
1176    let extended = extend_with_numeric_suffix(merged, body);
1177    let with_models = augment_versioned_model_names(extended, body);
1178    let with_models: Vec<ExtractedEntity> = with_models
1179        .into_iter()
1180        .filter(|e| !regex_section_marker().is_match(&e.name))
1181        .collect();
1182    let entities = to_new_entities(with_models);
1183    let (relationships, relationships_truncated) =
1184        build_relationships_by_sentence_cooccurrence(body, &entities);
1185
1186    let extraction_method = if gliner_used {
1187        format!("gliner-{variant}+regex")
1188    } else {
1189        "regex-only".to_string()
1190    };
1191
1192    let urls = extract_urls(body);
1193
1194    Ok(ExtractionResult {
1195        entities,
1196        relationships,
1197        relationships_truncated,
1198        extraction_method,
1199        urls,
1200    })
1201}
1202
1203pub struct RegexExtractor;
1204
1205impl Extractor for RegexExtractor {
1206    fn extract(&self, body: &str) -> Result<ExtractionResult> {
1207        let regex_entities = apply_regex_prefilter(body);
1208        let entities = to_new_entities(regex_entities);
1209        let (relationships, relationships_truncated) =
1210            build_relationships_by_sentence_cooccurrence(body, &entities);
1211        let urls = extract_urls(body);
1212        Ok(ExtractionResult {
1213            entities,
1214            relationships,
1215            relationships_truncated,
1216            extraction_method: "regex-only".to_string(),
1217            urls,
1218        })
1219    }
1220}
1221
1222#[cfg(test)]
1223mod tests {
1224    use super::*;
1225    use crate::entity_type::EntityType;
1226
1227    fn make_paths() -> AppPaths {
1228        use std::path::PathBuf;
1229        AppPaths {
1230            db: PathBuf::from("/tmp/test.sqlite"),
1231            models: PathBuf::from("/tmp/test_models"),
1232        }
1233    }
1234
1235    #[test]
1236    fn regex_email_captures_address() {
1237        let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1238        // v1.0.20: emails are classified as "concept" (regex alone cannot distinguish person from role).
1239        assert!(ents
1240            .iter()
1241            .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1242    }
1243
1244    #[test]
1245    fn regex_all_caps_filters_pt_rule_word() {
1246        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE must not become "entities".
1247        let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1248        assert!(
1249            !ents.iter().any(|e| e.name == "NUNCA"),
1250            "NUNCA must be filtered as a stopword"
1251        );
1252        assert!(
1253            !ents.iter().any(|e| e.name == "PROIBIDO"),
1254            "PROIBIDO must be filtered"
1255        );
1256        assert!(
1257            !ents.iter().any(|e| e.name == "DEVE"),
1258            "DEVE must be filtered"
1259        );
1260    }
1261
1262    #[test]
1263    fn regex_all_caps_accepts_underscored_constant() {
1264        // Technical constants like MAX_RETRY, TIMEOUT_MS must always be accepted.
1265        let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1266        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1267        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1268    }
1269
1270    #[test]
1271    fn regex_all_caps_accepts_domain_acronym() {
1272        // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
1273        let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1274        assert!(ents.iter().any(|e| e.name == "OPENAI"));
1275        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1276    }
1277
1278    #[test]
1279    fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1280        // v1.0.24 P0-2: URLs were removed from apply_regex_prefilter and now go through extract_urls.
1281        let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1282        assert!(
1283            !ents.iter().any(|e| e.name.starts_with("https://")),
1284            "URLs must not appear as entities after the P0-2 split"
1285        );
1286    }
1287
1288    #[test]
1289    fn extract_urls_captures_https() {
1290        let urls = extract_urls("see https://docs.rs/crate for details");
1291        assert_eq!(urls.len(), 1);
1292        assert_eq!(urls[0].url, "https://docs.rs/crate");
1293        assert!(urls[0].offset > 0);
1294    }
1295
1296    #[test]
1297    fn extract_urls_trim_sufixo_pontuacao() {
1298        let urls = extract_urls("link: https://example.com/path. fim");
1299        assert!(!urls.is_empty());
1300        assert!(
1301            !urls[0].url.ends_with('.'),
1302            "sufixo ponto deve ser removido"
1303        );
1304    }
1305
1306    #[test]
1307    fn extract_urls_dedupes_repeated() {
1308        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1309        let urls = extract_urls(body);
1310        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1311    }
1312
1313    #[test]
1314    fn regex_uuid_captura_identificador() {
1315        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1316        assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
1317    }
1318
1319    #[test]
1320    fn regex_all_caps_captura_constante() {
1321        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1322        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1323        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1324    }
1325
1326    #[test]
1327    fn regex_all_caps_ignores_short_words() {
1328        let ents = apply_regex_prefilter("use AI em seu projeto");
1329        assert!(
1330            !ents.iter().any(|e| e.name == "AI"),
1331            "AI tem apenas 2 chars, deve ser ignorado"
1332        );
1333    }
1334
1335    #[test]
1336    fn build_relationships_respeitam_max_rels() {
1337        let entities: Vec<NewEntity> = (0..20)
1338            .map(|i| NewEntity {
1339                name: format!("entidade_{i}"),
1340                entity_type: EntityType::Concept,
1341                description: None,
1342            })
1343            .collect();
1344        let (rels, truncated) = build_relationships(&entities);
1345        let max_rels = crate::constants::max_relationships_per_memory();
1346        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1347        if rels.len() == max_rels {
1348            assert!(truncated, "truncated deve ser true quando atingiu o cap");
1349        }
1350    }
1351
1352    #[test]
1353    fn build_relationships_without_duplicates() {
1354        let entities: Vec<NewEntity> = (0..5)
1355            .map(|i| NewEntity {
1356                name: format!("ent_{i}"),
1357                entity_type: EntityType::Concept,
1358                description: None,
1359            })
1360            .collect();
1361        let (rels, _truncated) = build_relationships(&entities);
1362        let mut pares: std::collections::HashSet<(String, String)> =
1363            std::collections::HashSet::new();
1364        for r in &rels {
1365            let par = (r.source.clone(), r.target.clone());
1366            assert!(pares.insert(par), "par duplicado encontrado");
1367        }
1368    }
1369
1370    #[test]
1371    fn merge_dedupes_by_lowercase_name() {
1372        // v1.0.25: collision detection is scoped per entity_type; same name + same type
1373        // must deduplicate to one entry. Different types are kept separately.
1374        let a = vec![ExtractedEntity {
1375            name: "Rust".to_string(),
1376            entity_type: EntityType::Concept,
1377        }];
1378        let b = vec![ExtractedEntity {
1379            name: "rust".to_string(),
1380            entity_type: EntityType::Concept,
1381        }];
1382        let merged = merge_and_deduplicate(a, b);
1383        assert_eq!(
1384            merged.len(),
1385            1,
1386            "rust and Rust with the same type are the same entity"
1387        );
1388    }
1389
1390    #[test]
1391    fn regex_extractor_implements_trait() {
1392        let extractor = RegexExtractor;
1393        let result = extractor
1394            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1395            .unwrap();
1396        assert!(!result.entities.is_empty());
1397    }
1398
1399    #[test]
1400    fn extract_returns_ok_without_model() {
1401        // Without a downloaded model, must return Ok with regex-only entities.
1402        let paths = make_paths();
1403        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1404        let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
1405        assert!(result
1406            .entities
1407            .iter()
1408            .any(|e| e.name.contains("teste@exemplo.com")));
1409    }
1410
1411    #[test]
1412    fn stopwords_filter_v1024_terms() {
1413        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
1414        // by apply_regex_prefilter so they do not appear as entities.
1415        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1416                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1417        let ents = apply_regex_prefilter(body);
1418        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1419        for word in &[
1420            "ACEITE",
1421            "ACK",
1422            "ACL",
1423            "BORDA",
1424            "CHECKLIST",
1425            "COMPLETED",
1426            "CONFIRME",
1427            "DEVEMOS",
1428            "DONE",
1429            "FIXED",
1430            "NEGUE",
1431            "PENDING",
1432            "PLAN",
1433            "PODEMOS",
1434            "RECUSE",
1435            "TOKEN",
1436            "VAMOS",
1437        ] {
1438            assert!(
1439                !names.contains(word),
1440                "v1.0.24 stopword {word} should be filtered but was found in entities"
1441            );
1442        }
1443    }
1444
1445    #[test]
1446    fn dedup_normalizes_unicode_combining_marks() {
1447        // v1.0.24 P1-E: "Caf\u{e9}" (NFC precomposed) and "Cafe\u{301}" (NFD with
1448        // combining acute accent) must deduplicate to a single entity after NFKC
1449        // normalization.
1450        let nfc = vec![ExtractedEntity {
1451            name: "Caf\u{e9}".to_string(),
1452            entity_type: EntityType::Concept,
1453        }];
1454        // Build the NFD form: 'e' followed by combining acute accent U+0301
1455        let nfd_name = "Cafe\u{301}".to_string();
1456        let nfd = vec![ExtractedEntity {
1457            name: nfd_name,
1458            entity_type: EntityType::Concept,
1459        }];
1460        let merged = merge_and_deduplicate(nfc, nfd);
1461        assert_eq!(
1462            merged.len(),
1463            1,
1464            "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1465        );
1466    }
1467
1468    #[test]
1469    fn extraction_method_regex_only_unchanged() {
1470        // RegexExtractor always returns "regex-only" regardless of GLINER_MODEL state.
1471        // This guards against accidentally changing the regex-only fallback string.
1472        let result = RegexExtractor.extract("contact: dev@acme.io").unwrap();
1473        assert_eq!(
1474            result.extraction_method, "regex-only",
1475            "RegexExtractor must return regex-only"
1476        );
1477    }
1478
1479    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
1480
1481    #[test]
1482    fn extend_suffix_pure_numeric_unchanged() {
1483        // Existing behaviour: pure-numeric suffix must still work after P2-E.
1484        let ents = vec![ExtractedEntity {
1485            name: "GPT".to_string(),
1486            entity_type: EntityType::Concept,
1487        }];
1488        let result = extend_with_numeric_suffix(ents, "using GPT-5 in the project");
1489        assert_eq!(
1490            result[0].name, "GPT-5",
1491            "purely numeric suffix must be extended"
1492        );
1493    }
1494
1495    #[test]
1496    fn extend_suffix_alphanumeric_letter_after_digit() {
1497        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
1498        let ents = vec![ExtractedEntity {
1499            name: "GPT".to_string(),
1500            entity_type: EntityType::Concept,
1501        }];
1502        let result = extend_with_numeric_suffix(ents, "using GPT-4o for advanced tasks");
1503        assert_eq!(result[0].name, "GPT-4o", "suffix '4o' must be accepted");
1504    }
1505
1506    #[test]
1507    fn extend_suffix_alphanumeric_b_suffix() {
1508        // P2-E: "5b" suffix (digit + 'b') must be captured.
1509        let ents = vec![ExtractedEntity {
1510            name: "Llama".to_string(),
1511            entity_type: EntityType::Concept,
1512        }];
1513        let result = extend_with_numeric_suffix(ents, "Llama-5b open-weight model");
1514        assert_eq!(result[0].name, "Llama-5b", "suffix '5b' must be accepted");
1515    }
1516
1517    #[test]
1518    fn extend_suffix_alphanumeric_x_suffix() {
1519        // P2-E: "8x" suffix (digit + 'x') must be captured.
1520        let ents = vec![ExtractedEntity {
1521            name: "Mistral".to_string(),
1522            entity_type: EntityType::Concept,
1523        }];
1524        let result = extend_with_numeric_suffix(ents, "testing Mistral-8x in production");
1525        assert_eq!(result[0].name, "Mistral-8x", "suffix '8x' must be accepted");
1526    }
1527
1528    // --- P2-D: augment_versioned_model_names extended regex ---
1529
1530    #[test]
1531    fn augment_versioned_gpt4o() {
1532        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1533        let result = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
1534        assert!(
1535            result.iter().any(|e| e.name == "GPT-4o"),
1536            "GPT-4o must be captured by augment, found: {:?}",
1537            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1538        );
1539    }
1540
1541    #[test]
1542    fn augment_versioned_claude_4_sonnet() {
1543        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1544        let result =
1545            augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
1546        assert!(
1547            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1548            "Claude 4 Sonnet must be captured, found: {:?}",
1549            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1550        );
1551    }
1552
1553    #[test]
1554    fn augment_versioned_llama_3_pro() {
1555        // P2-D: "Llama 3 Pro" must be captured with release tier.
1556        let result =
1557            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1558        assert!(
1559            result.iter().any(|e| e.name == "Llama 3 Pro"),
1560            "Llama 3 Pro deve ser capturado, achados: {:?}",
1561            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1562        );
1563    }
1564
1565    #[test]
1566    fn augment_versioned_mixtral_8x7b() {
1567        // P2-D: "Mixtral 8x7B" composite version must be captured.
1568        let result =
1569            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1570        assert!(
1571            result.iter().any(|e| e.name == "Mixtral 8x7B"),
1572            "Mixtral 8x7B deve ser capturado, achados: {:?}",
1573            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1574        );
1575    }
1576
1577    #[test]
1578    fn augment_versioned_does_not_duplicate_existing() {
1579        // P2-D back-compat: entities already present must not be duplicated.
1580        let existing = vec![ExtractedEntity {
1581            name: "Claude 4".to_string(),
1582            entity_type: EntityType::Concept,
1583        }];
1584        let result = augment_versioned_model_names(existing, "using Claude 4 in the project");
1585        let count = result.iter().filter(|e| e.name == "Claude 4").count();
1586        assert_eq!(count, 1, "Claude 4 must not be duplicated");
1587    }
1588
1589    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1590
1591    #[test]
1592    fn stopwords_filter_url_jwt_api_v1025() {
1593        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1594        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1595        let ents = apply_regex_prefilter(body);
1596        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1597        for blocked in &[
1598            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1599        ] {
1600            assert!(
1601                !names.contains(blocked),
1602                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1603            );
1604        }
1605    }
1606
1607    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1608
1609    #[test]
1610    fn section_markers_etapa_fase_filtered_v1025() {
1611        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1612        // Body intentionally uses PT-BR section keywords (Etapa/Fase/Migra\u{e7}\u{e3}o) to
1613        // exercise the PT-BR section-marker filter. ASCII-escaped per the project policy.
1614        let body = "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.";
1615        let ents = apply_regex_prefilter(body);
1616        assert!(
1617            !ents
1618                .iter()
1619                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1620            "section markers must be stripped; entities: {:?}",
1621            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1622        );
1623    }
1624
1625    #[test]
1626    fn section_markers_passo_secao_filtered_v1025() {
1627        // PT-BR keywords Passo/Se\u{e7}\u{e3}o written with Unicode escapes per the
1628        // project language policy.
1629        let body = "Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.";
1630        let ents = apply_regex_prefilter(body);
1631        assert!(
1632            !ents
1633                .iter()
1634                .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o")),
1635            "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
1636            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1637        );
1638    }
1639
1640    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1641
1642    #[test]
1643    fn brand_camelcase_extracted_as_organization_v1025() {
1644        // "OpenAI" is a CamelCase brand that NER model often misses.
1645        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1646        let ents = apply_regex_prefilter(body);
1647        let openai = ents.iter().find(|e| e.name == "OpenAI");
1648        assert!(
1649            openai.is_some(),
1650            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1651            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1652        );
1653        assert_eq!(
1654            openai.unwrap().entity_type,
1655            EntityType::Organization,
1656            "brand CamelCase must map to organization (V008)"
1657        );
1658    }
1659
1660    #[test]
1661    fn brand_postgresql_extracted_as_organization_v1025() {
1662        let body = "migrating from MySQL to PostgreSQL for better performance.";
1663        let ents = apply_regex_prefilter(body);
1664        assert!(
1665            ents.iter()
1666                .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization),
1667            "PostgreSQL must be extracted as organization; entities: {:?}",
1668            ents.iter()
1669                .map(|e| (&e.name, &e.entity_type))
1670                .collect::<Vec<_>>()
1671        );
1672    }
1673
1674    // --- P0-3 longest-wins v1.0.25 ---
1675
1676    fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1677        ExtractedEntity {
1678            name: name.to_string(),
1679            entity_type,
1680        }
1681    }
1682
1683    #[test]
1684    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1685        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1686        let regex = vec![entity("Sonne", EntityType::Concept)];
1687        let ner = vec![entity("Sonnet", EntityType::Concept)];
1688        let result = merge_and_deduplicate(regex, ner);
1689        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1690        assert_eq!(result[0].name, "Sonnet");
1691    }
1692
1693    #[test]
1694    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1695        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1696        let regex = vec![
1697            entity("Open", EntityType::Organization),
1698            entity("OpenAI", EntityType::Organization),
1699        ];
1700        let result = merge_and_deduplicate(regex, vec![]);
1701        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1702        assert_eq!(result[0].name, "OpenAI");
1703    }
1704
1705    #[test]
1706    fn merge_keeps_both_when_no_containment_v1025() {
1707        // "Alice" and "Bob" share no containment — both must be preserved.
1708        let regex = vec![
1709            entity("Alice", EntityType::Person),
1710            entity("Bob", EntityType::Person),
1711        ];
1712        let result = merge_and_deduplicate(regex, vec![]);
1713        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1714    }
1715
1716    #[test]
1717    fn merge_respects_entity_type_boundary_v1025() {
1718        // Same name "Apple" but different types: both must survive independently.
1719        let regex = vec![
1720            entity("Apple", EntityType::Organization),
1721            entity("Apple", EntityType::Concept),
1722        ];
1723        let result = merge_and_deduplicate(regex, vec![]);
1724        assert_eq!(
1725            result.len(),
1726            2,
1727            "expected 2 entities (different types), got: {result:?}"
1728        );
1729    }
1730
1731    #[test]
1732    fn merge_case_insensitive_dedup_v1025() {
1733        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1734        let regex = vec![
1735            entity("OpenAI", EntityType::Organization),
1736            entity("openai", EntityType::Organization),
1737        ];
1738        let result = merge_and_deduplicate(regex, vec![]);
1739        assert_eq!(
1740            result.len(),
1741            1,
1742            "expected 1 entity after case-insensitive dedup, got: {result:?}"
1743        );
1744    }
1745
1746    // ── v1.0.31 A1: NER cap protects against pathological body sizes ──
1747
1748    #[test]
1749    fn extract_graph_auto_handles_large_body_under_30s() {
1750        // Regression guard for the v1.0.31 A1 fix. A 80 KB body without real
1751        // entities must complete in under 30 s; before the cap it took 5+ minutes.
1752        let body = "x ".repeat(40_000);
1753        let paths = make_paths();
1754        let start = std::time::Instant::now();
1755        let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
1756            .expect("extraction must not error");
1757        let elapsed = start.elapsed();
1758        assert!(
1759            elapsed.as_secs() < 30,
1760            "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
1761            elapsed.as_secs()
1762        );
1763        // No real entities expected in synthetic body, but the call must succeed.
1764        let _ = result.entities;
1765    }
1766
1767    // ── v1.0.31 A11: PT-BR uppercase noise must not leak as entities ──
1768
1769    #[test]
1770    fn pt_uppercase_stopwords_filtered_v1031() {
1771        let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
1772                    SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
1773        let ents = apply_regex_prefilter(body);
1774        let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
1775        for stop in &[
1776            "ADAPTER",
1777            "PROJETO",
1778            "PASSIVA",
1779            "SOMENTE",
1780            "LEITURA",
1781            "REGRA",
1782            "OBRIGATORIA",
1783            "EXEMPLO",
1784            "DEFAULT",
1785        ] {
1786            assert!(
1787                !names.contains(&stop.to_string()),
1788                "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
1789            );
1790        }
1791    }
1792
1793    #[test]
1794    fn pt_underscored_identifier_preserved_v1031() {
1795        // Identifiers with underscore must still pass through (FLOWAIPER_API_KEY,
1796        // MAX_RETRY etc. are intentional entities, not noise).
1797        let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
1798        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1799        assert!(names.contains(&"FLOWAIPER_API_KEY"));
1800        assert!(names.contains(&"MAX_TIMEOUT"));
1801    }
1802
1803    // ── v1.0.31 A9: relationships only between entities co-occurring in same sentence ──
1804
1805    #[test]
1806    fn build_relationships_by_sentence_only_links_co_occurring_entities() {
1807        let body = "Alice met Bob at the conference. Carol works alone in another room.";
1808        let entities = vec![
1809            NewEntity {
1810                name: "Alice".to_string(),
1811                entity_type: EntityType::Person,
1812                description: None,
1813            },
1814            NewEntity {
1815                name: "Bob".to_string(),
1816                entity_type: EntityType::Person,
1817                description: None,
1818            },
1819            NewEntity {
1820                name: "Carol".to_string(),
1821                entity_type: EntityType::Person,
1822                description: None,
1823            },
1824        ];
1825        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1826        assert!(!truncated);
1827        assert_eq!(
1828            rels.len(),
1829            1,
1830            "only Alice/Bob should pair (same sentence); Carol is isolated"
1831        );
1832        let pair = (rels[0].source.as_str(), rels[0].target.as_str());
1833        assert!(
1834            matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
1835            "unexpected pair {pair:?}"
1836        );
1837    }
1838
1839    #[test]
1840    fn build_relationships_by_sentence_returns_empty_for_single_entity() {
1841        let body = "Alice is here.";
1842        let entities = vec![NewEntity {
1843            name: "Alice".to_string(),
1844            entity_type: EntityType::Person,
1845            description: None,
1846        }];
1847        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1848        assert!(rels.is_empty());
1849        assert!(!truncated);
1850    }
1851
1852    #[test]
1853    fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
1854        let body = "Alice met Bob. Bob saw Alice again.";
1855        let entities = vec![
1856            NewEntity {
1857                name: "Alice".to_string(),
1858                entity_type: EntityType::Person,
1859                description: None,
1860            },
1861            NewEntity {
1862                name: "Bob".to_string(),
1863                entity_type: EntityType::Person,
1864                description: None,
1865            },
1866        ];
1867        let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);
1868        assert_eq!(
1869            rels.len(),
1870            1,
1871            "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
1872        );
1873    }
1874
1875    #[test]
1876    fn extraction_max_tokens_default_is_5000() {
1877        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1878        assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
1879    }
1880
1881    #[test]
1882    fn extraction_max_tokens_env_override_clamped() {
1883        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200");
1884        assert_eq!(
1885            crate::constants::extraction_max_tokens(),
1886            5_000,
1887            "value below 512 must fall back to default"
1888        );
1889
1890        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200000");
1891        assert_eq!(
1892            crate::constants::extraction_max_tokens(),
1893            5_000,
1894            "value above 100_000 must fall back to default"
1895        );
1896
1897        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "8000");
1898        assert_eq!(
1899            crate::constants::extraction_max_tokens(),
1900            8_000,
1901            "valid value must be honoured"
1902        );
1903
1904        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1905    }
1906
1907    #[test]
1908    fn gliner_variant_from_str_valid() {
1909        assert_eq!(
1910            "fp32".parse::<GlinerVariant>().unwrap(),
1911            GlinerVariant::Fp32
1912        );
1913        assert_eq!(
1914            "fp16".parse::<GlinerVariant>().unwrap(),
1915            GlinerVariant::Fp16
1916        );
1917        assert_eq!(
1918            "int8".parse::<GlinerVariant>().unwrap(),
1919            GlinerVariant::Int8
1920        );
1921        assert_eq!("q4".parse::<GlinerVariant>().unwrap(), GlinerVariant::Q4);
1922        assert_eq!(
1923            "q4f16".parse::<GlinerVariant>().unwrap(),
1924            GlinerVariant::Q4f16
1925        );
1926        // Case-insensitive
1927        assert_eq!(
1928            "FP32".parse::<GlinerVariant>().unwrap(),
1929            GlinerVariant::Fp32
1930        );
1931        assert_eq!(
1932            "INT8".parse::<GlinerVariant>().unwrap(),
1933            GlinerVariant::Int8
1934        );
1935    }
1936
1937    #[test]
1938    fn gliner_variant_from_str_invalid() {
1939        assert!("invalid".parse::<GlinerVariant>().is_err());
1940        assert!("fp64".parse::<GlinerVariant>().is_err());
1941        assert!("".parse::<GlinerVariant>().is_err());
1942    }
1943
1944    #[test]
1945    fn gliner_variant_filename_mapping() {
1946        assert_eq!(GlinerVariant::Fp32.as_filename(), "model.onnx");
1947        assert_eq!(GlinerVariant::Fp16.as_filename(), "model_fp16.onnx");
1948        assert_eq!(GlinerVariant::Int8.as_filename(), "model_quantized.onnx");
1949        assert_eq!(GlinerVariant::Q4.as_filename(), "model_q4.onnx");
1950        assert_eq!(GlinerVariant::Q4f16.as_filename(), "model_q4f16.onnx");
1951    }
1952
1953    #[test]
1954    fn gliner_variant_display() {
1955        assert_eq!(format!("{}", GlinerVariant::Fp32), "fp32");
1956        assert_eq!(format!("{}", GlinerVariant::Fp16), "fp16");
1957        assert_eq!(format!("{}", GlinerVariant::Int8), "int8");
1958        assert_eq!(format!("{}", GlinerVariant::Q4), "q4");
1959        assert_eq!(format!("{}", GlinerVariant::Q4f16), "q4f16");
1960    }
1961
1962    #[test]
1963    fn gliner_variant_display_size() {
1964        assert_eq!(GlinerVariant::Fp32.display_size(), "1.1 GB");
1965        assert_eq!(GlinerVariant::Int8.display_size(), "349 MB");
1966    }
1967
1968    #[test]
1969    fn gliner_entity_labels_covers_all_types() {
1970        let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
1971        assert!(label_types.contains(&EntityType::Person));
1972        assert!(label_types.contains(&EntityType::Organization));
1973        assert!(label_types.contains(&EntityType::Location));
1974        assert!(label_types.contains(&EntityType::Date));
1975        assert!(label_types.contains(&EntityType::Project));
1976        assert!(label_types.contains(&EntityType::Tool));
1977        assert!(label_types.contains(&EntityType::File));
1978        assert!(label_types.contains(&EntityType::Concept));
1979        assert!(label_types.contains(&EntityType::Decision));
1980        assert!(label_types.contains(&EntityType::Incident));
1981        assert!(label_types.contains(&EntityType::Dashboard));
1982        assert!(label_types.contains(&EntityType::IssueTracker));
1983        assert!(label_types.contains(&EntityType::Memory));
1984        assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
1985    }
1986
1987    #[test]
1988    fn gliner_entity_labels_no_duplicates() {
1989        let mut seen = std::collections::HashSet::new();
1990        for (label, _) in GLINER_ENTITY_LABELS {
1991            assert!(seen.insert(*label), "duplicate label: {label}");
1992        }
1993    }
1994
1995    #[test]
1996    fn extract_graph_auto_regex_only_fallback() {
1997        // extract_graph_auto must succeed and capture regex entities regardless of whether
1998        // GLiNER model files exist in the test environment (GLINER_MODEL is a global OnceLock
1999        // that may already be initialised by a sibling test, so we cannot assert on
2000        // extraction_method; use RegexExtractor for that invariant).
2001        let result = extract_graph_auto(
2002            "Contact someone@test.com about OPENAI project",
2003            &make_paths(),
2004            GlinerVariant::Fp32,
2005        );
2006        assert!(result.is_ok());
2007        let res = result.unwrap();
2008        // Regex prefilter must always capture the email entity
2009        assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
2010        // extraction_method must be one of the two valid values
2011        assert!(
2012            res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-"),
2013            "unexpected extraction_method: {}",
2014            res.extraction_method
2015        );
2016    }
2017
2018    #[test]
2019    fn gliner_variant_roundtrip() {
2020        for variant in &[
2021            GlinerVariant::Fp32,
2022            GlinerVariant::Fp16,
2023            GlinerVariant::Int8,
2024            GlinerVariant::Q4,
2025            GlinerVariant::Q4f16,
2026        ] {
2027            let s = format!("{variant}");
2028            let parsed: GlinerVariant = s.parse().unwrap();
2029            assert_eq!(*variant, parsed);
2030        }
2031    }
2032}
sqlite_graphrag/extraction.rs

sqlite_graphrag/
extraction.rs