//! sqlite_graphrag/extraction.rs — entity, relationship and URL extraction
//! (regex prefilter + BERT multilingual NER with graceful degradation).
1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11use unicode_normalization::UnicodeNormalization;
12
13use crate::paths::AppPaths;
14use crate::storage::entities::{NewEntity, NewRelationship};
15
/// HuggingFace repo id for the multilingual NER model.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Maximum tokens per inference window (BERT positional-embedding limit).
const MAX_SEQ_LEN: usize = 512;
/// Advance between consecutive sliding windows; < MAX_SEQ_LEN so windows overlap.
const STRIDE: usize = 256;
/// Upper bound on entities considered when building relationships.
const MAX_ENTS: usize = 30;
/// Maximum relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for all auto-generated co-occurrence relationships.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum length (in bytes — `str::len`) for an extracted entity name.
const MIN_ENTITY_CHARS: usize = 2;

// Lazily-compiled, process-wide shared regexes (built on first use).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
28
// v1.0.20: stopwords filtering common PT-BR/EN rule-words captured as ALL_CAPS.
// Without this filter, PT-BR technical corpora containing rules formatted in CAPS
// (NUNCA, PROIBIDO, DEVE) produced ~70% junk "entities". Identifiers such as
// MAX_RETRY (with underscore) are preserved.
// v1.0.22: list expanded with terms observed in a 495-file flowaiper stress test.
// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns
// (BANCO, CASO), HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
// v1.0.24: added 17 new terms observed in the v1.0.23 audit: generic status words
// (COMPLETED, DONE, FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE,
// RECUSE), PT-BR modal/common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns (BORDA,
// CHECKLIST, PLAN, TOKEN), and common abbreviations (ACK, ACL).
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BORDA",
    "BLOQUEAR",
    "BUG",
    "CASO",
    "CHECKLIST",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRME",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FIXED",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "MUST",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PENDING",
    "PLAN",
    "PODEMOS",
    "PROIBIDO",
    "RECUSE",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];

// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps) and iob_to_entities (single-token).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
128
129fn is_filtered_all_caps(token: &str) -> bool {
130    // Identificadores com underscore são preservados (ex: MAX_RETRY, FLOWAIPER_API_KEY)
131    let is_identifier = token.contains('_');
132    if is_identifier {
133        return false;
134    }
135    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
136}
137
138fn regex_email() -> &'static Regex {
139    REGEX_EMAIL
140        .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
141}
142
143fn regex_url() -> &'static Regex {
144    REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
145}
146
147fn regex_uuid() -> &'static Regex {
148    REGEX_UUID.get_or_init(|| {
149        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
150            .unwrap()
151    })
152}
153
154fn regex_all_caps() -> &'static Regex {
155    REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
156}
157
/// A named entity produced by the regex prefilter or BERT NER, prior to
/// conversion into a storage-layer `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    // Surface form as found in the body (possibly suffix-extended later).
    pub name: String,
    // Category such as "person", "project", "tool", "concept" — or a raw NER
    // tag passed through unchanged by iob_to_entities for unmapped types.
    pub entity_type: String,
}
163
/// A URL extracted from a memory body, together with its source offset.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    pub url: String,
    /// Byte position in the body where the URL was found.
    pub offset: usize,
}
171
/// Aggregate output of one extraction pass over a memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    /// True when build_relationships hit the cap before covering all entity pairs.
    /// Exposed in RememberResponse so callers can detect when relationships were cut.
    pub relationships_truncated: bool,
    /// Extraction method used: "bert+regex" or "regex-only".
    /// Useful for auditing, metrics and user-facing reports.
    pub extraction_method: String,
    /// URLs extracted from the body — stored separately from the entity graph.
    pub urls: Vec<ExtractedUrl>,
}
185
/// Abstraction over extraction backends; implementors must be thread-safe
/// (`Send + Sync`) so a single instance can be shared across requests.
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships and URLs from a memory body.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
189
/// Minimal subset of the HuggingFace `config.json` needed to size the classifier head.
#[derive(Deserialize)]
struct ModelConfig {
    // Label-id → IOB tag map; keys are stringified integers in the JSON.
    // Defaults to empty when the field is absent.
    #[serde(default)]
    id2label: HashMap<String, String>,
    // Transformer hidden dimension; sizes the classifier weight matrix.
    hidden_size: usize,
}
196
/// BERT encoder plus token-classification head for NER, pinned to one device.
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    device: Device,
    // Label index → IOB tag (e.g. "B-PER"), parsed from config.json.
    id2label: HashMap<usize, String>,
}
203
impl BertNerModel {
    /// Loads the BERT encoder and token-classification head from `model_dir`,
    /// which must contain `config.json` and `model.safetensors`.
    fn load(model_dir: &Path) -> Result<Self> {
        let config_path = model_dir.join("config.json");
        let weights_path = model_dir.join("model.safetensors");

        let config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
        let model_cfg: ModelConfig =
            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;

        // config.json serialises id2label keys as strings; keep only those that
        // parse as numeric indices.
        let id2label: HashMap<usize, String> = model_cfg
            .id2label
            .into_iter()
            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
            .collect();

        // Fallback of 9 when id2label is missing/empty — presumably the standard
        // IOB2 set (O + B/I × PER/ORG/LOC/MISC); TODO confirm against the repo config.
        let num_labels = id2label.len().max(9);
        let hidden_size = model_cfg.hidden_size;

        // Same file parsed twice: once for our minimal ModelConfig above, once
        // for candle's full BertConfig here.
        let bert_config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
        let bert_cfg: BertConfig =
            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;

        let device = Device::Cpu;

        // SAFETY: memory-maps the safetensors file; sound provided the file is
        // not truncated or rewritten while mapped.
        let vb = unsafe {
            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
                .with_context(|| format!("mapeando {weights_path:?}"))?
        };
        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;

        // v1.0.20 secondary P0 fix: load the classifier head from the safetensors
        // instead of zeros. In v1.0.19 we used Tensor::zeros, which produced a
        // constant argmax and degenerate inference.
        let cls_vb = vb.pp("classifier");
        let weight = cls_vb
            .get((num_labels, hidden_size), "weight")
            .context("carregando classifier.weight do safetensors")?;
        let bias = cls_vb
            .get(num_labels, "bias")
            .context("carregando classifier.bias do safetensors")?;
        let classifier = Linear::new(weight, Some(bias));

        Ok(Self {
            bert,
            classifier,
            device,
            id2label,
        })
    }

    /// Runs a single-window forward pass and returns one IOB label per token.
    /// Label indices absent from `id2label` degrade to "O".
    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
        let len = token_ids.len();
        // candle's BERT expects i64 tensors; widen from u32.
        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();

        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
            .context("criando tensor input_ids")?;
        // Single-segment input: token_type_ids are all zeros.
        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
            .context("criando tensor token_type_ids")?;
        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
            .context("criando tensor attention_mask")?;

        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass do BertModel")?;

        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass do classificador")?;

        // Drop the batch dimension: (1, len, num_labels) → (len, num_labels).
        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;

        let num_tokens = logits_2d.dim(0).context("dim(0)")?;

        // Greedy decode: per-token argmax over label logits.
        let mut labels = Vec::with_capacity(num_tokens);
        for i in 0..num_tokens {
            let token_logits = logits_2d.get(i).context("get token logits")?;
            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
            // NOTE(review): partial_cmp().unwrap() panics on NaN logits — assumed unreachable.
            let argmax = vec
                .iter()
                .enumerate()
                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                .map(|(idx, _)| idx)
                .unwrap_or(0);
            let label = self
                .id2label
                .get(&argmax)
                .cloned()
                .unwrap_or_else(|| "O".to_string());
            labels.push(label);
        }

        Ok(labels)
    }

    /// Run a batched forward pass over multiple tokenised windows at once.
    ///
    /// Windows are padded on the right with token_id=0 and attention_mask=0 to
    /// the length of the longest window in the batch.  The attention mask ensures
    /// BERT ignores padded positions (bert.rs:515-528 adds -3.4e38 before softmax).
    ///
    /// Returns one label vector per window, each of length equal to that window's
    /// original (pre-padding) token count.
    fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
        let batch_size = windows.len();
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
        if max_len == 0 {
            // Empty batch (or all windows empty): nothing to infer.
            return Ok(vec![vec![]; batch_size]);
        }

        let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
        let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);

        for (ids, _) in windows {
            let len = ids.len();
            let pad_right = max_len - len;

            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            // Build 1-D token tensor then pad to max_len
            let t = Tensor::from_vec(ids_i64, len, &self.device)
                .context("criando tensor de ids para batch")?;
            let t = t
                .pad_with_zeros(0, 0, pad_right)
                .context("padding tensor de ids")?;
            padded_ids.push(t);

            // Attention mask: 1 for real tokens, 0 for padding
            let mut mask_i64 = vec![1i64; len];
            mask_i64.extend(vec![0i64; pad_right]);
            let m = Tensor::from_vec(mask_i64, max_len, &self.device)
                .context("criando tensor de máscara para batch")?;
            padded_masks.push(m);
        }

        // Stack 1-D tensors into (batch_size, max_len)
        let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
        let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
        let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
            .context("criando token_type_ids batch")?;

        // Single forward pass for the entire batch
        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass batch BertModel")?;
        // sequence_output: (batch_size, max_len, hidden_size)

        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass batch classificador")?;
        // logits: (batch_size, max_len, num_labels)

        let mut results = Vec::with_capacity(batch_size);
        for (i, (window_ids, _)) in windows.iter().enumerate() {
            let example_logits = logits.get(i).context("get logits exemplo")?;
            // (max_len, num_labels) — slice only real tokens, discard padding
            let real_len = window_ids.len();
            let example_slice = example_logits
                .narrow(0, 0, real_len)
                .context("narrow para tokens reais")?;
            let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;

            // Greedy decode per token, mirroring `predict`.
            let labels: Vec<String> = logits_2d
                .iter()
                .map(|token_logits| {
                    let argmax = token_logits
                        .iter()
                        .enumerate()
                        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                        .map(|(idx, _)| idx)
                        .unwrap_or(0);
                    self.id2label
                        .get(&argmax)
                        .cloned()
                        .unwrap_or_else(|| "O".to_string())
                })
                .collect();

            results.push(labels);
        }

        Ok(results)
    }
}
392
393static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
394
395fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
396    NER_MODEL
397        .get_or_init(|| match load_model(paths) {
398            Ok(m) => Some(m),
399            Err(e) => {
400                tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
401                None
402            }
403        })
404        .as_ref()
405}
406
407fn model_dir(paths: &AppPaths) -> PathBuf {
408    paths.models.join("bert-multilingual-ner")
409}
410
411fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
412    let dir = model_dir(paths);
413    std::fs::create_dir_all(&dir)
414        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
415
416    let weights = dir.join("model.safetensors");
417    let config = dir.join("config.json");
418    let tokenizer = dir.join("tokenizer.json");
419
420    if weights.exists() && config.exists() && tokenizer.exists() {
421        return Ok(dir);
422    }
423
424    tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
425    crate::output::emit_progress_i18n(
426        "Downloading NER model (first run, ~676 MB)...",
427        "Baixando modelo NER (primeira execução, ~676 MB)...",
428    );
429
430    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
431    let repo = api.model(MODEL_ID.to_string());
432
433    // v1.0.20 fix P0 primário: tokenizer.json no repo Davlan está apenas em onnx/tokenizer.json.
434    // Em v1.0.19 buscávamos da raiz e recebíamos 404, caindo em graceful degradation 100% das vezes.
435    // Mapeamos (remote_path, local_filename) para baixar do subfolder mantendo nome plano local.
436    for (remote, local) in &[
437        ("model.safetensors", "model.safetensors"),
438        ("config.json", "config.json"),
439        ("onnx/tokenizer.json", "tokenizer.json"),
440        ("tokenizer_config.json", "tokenizer_config.json"),
441    ] {
442        let dest = dir.join(local);
443        if !dest.exists() {
444            let src = repo
445                .get(remote)
446                .with_context(|| format!("baixando {remote} do HF Hub"))?;
447            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
448        }
449    }
450
451    Ok(dir)
452}
453
454fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
455    let dir = ensure_model_files(paths)?;
456    BertNerModel::load(&dir)
457}
458
459fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
460    let mut entities = Vec::new();
461    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
462
463    let add = |entities: &mut Vec<ExtractedEntity>,
464               seen: &mut std::collections::HashSet<String>,
465               name: &str,
466               entity_type: &str| {
467        let name = name.trim().to_string();
468        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
469            entities.push(ExtractedEntity {
470                name,
471                entity_type: entity_type.to_string(),
472            });
473        }
474    };
475
476    for m in regex_email().find_iter(body) {
477        // v1.0.20: email é "concept" (regex sozinho não distingue pessoa de mailing list/role).
478        add(&mut entities, &mut seen, m.as_str(), "concept");
479    }
480    for m in regex_uuid().find_iter(body) {
481        add(&mut entities, &mut seen, m.as_str(), "concept");
482    }
483    for m in regex_all_caps().find_iter(body) {
484        let candidate = m.as_str();
485        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
486        if !is_filtered_all_caps(candidate) {
487            add(&mut entities, &mut seen, candidate, "concept");
488        }
489    }
490
491    entities
492}
493
494/// Extrai URLs do corpo de uma memória, desduplicadas por texto.
495/// URLs são armazenadas na tabela `memory_urls` separadamente do grafo de entidades.
496/// v1.0.24: split do bloco URL que poluía apply_regex_prefilter com entity_type='concept'.
497pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
498    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
499    let mut result = Vec::new();
500    for m in regex_url().find_iter(body) {
501        let raw = m.as_str();
502        let cleaned = raw
503            .trim_end_matches('`')
504            .trim_end_matches(',')
505            .trim_end_matches('.')
506            .trim_end_matches(';')
507            .trim_end_matches(')')
508            .trim_end_matches(']')
509            .trim_end_matches('}');
510        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
511            result.push(ExtractedUrl {
512                url: cleaned.to_string(),
513                offset: m.start(),
514            });
515        }
516    }
517    result
518}
519
/// Collapses per-token IOB labels into entities, merging WordPiece subwords
/// ("##"-prefixed tokens) back into their parent word.
///
/// Label handling: "O" and unknown prefixes flush the entity in progress;
/// DATE spans are dropped entirely; PER → "person", LOC → "concept",
/// ORG → "tool" or "project" depending on token hints; any other tag is
/// passed through unchanged as the entity type.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Finalises the in-progress entity (if any): joins parts with spaces and
    // applies length and single-token ALL-CAPS post-filters.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // v1.0.22: filter single-token entities that are ALL-CAPS stopwords
                // or HTTP methods. BERT NER tags some of these as B-MISC/B-ORG;
                // this post-filter keeps generic verbs/protocol words out of the graph.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Unrecognised label shape: treat it as an entity boundary.
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // Map NER tag names onto the graph's entity-type vocabulary.
        let entity_type = match bio_type {
            "DATE" => {
                // Dates are not useful graph nodes; drop the span entirely.
                flush(&mut current_parts, &mut current_type, &mut entities);
                continue;
            }
            "PER" => "person",
            "ORG" => {
                // Heuristic: ORG tokens that look like software artefacts become "tool".
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    "project"
                }
            }
            "LOC" => "concept",
            other => other,
        };

        if prefix == "B" {
            if token.starts_with("##") {
                // Confused BERT output: a subword carrying a B- prefix marks the
                // continuation of the previous entity. Append it to the current
                // entity's last part; if there is none, drop the subword.
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // Continuation token: subwords are glued onto the previous part,
            // whole tokens become a new space-separated part.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Flush whatever entity was still open at end of input.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
612
613/// Returns (relationships, truncated) where truncated is true when the cap was hit
614/// before all entity pairs were covered. Exposed in RememberResponse as
615/// `relationships_truncated` so callers can decide whether to increase the cap.
616fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
617    if entities.len() < 2 {
618        return (Vec::new(), false);
619    }
620
621    // v1.0.22: cap configurável via env var (constants::max_relationships_per_memory).
622    // Permite usuários com corpus denso aumentar além do default 50.
623    let max_rels = crate::constants::max_relationships_per_memory();
624    let n = entities.len().min(MAX_ENTS);
625    let mut rels: Vec<NewRelationship> = Vec::new();
626    let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
627
628    let mut hit_cap = false;
629    'outer: for i in 0..n {
630        if rels.len() >= max_rels {
631            hit_cap = true;
632            break;
633        }
634
635        let mut for_entity = 0usize;
636        for j in (i + 1)..n {
637            if for_entity >= TOP_K_RELATIONS {
638                break;
639            }
640            if rels.len() >= max_rels {
641                hit_cap = true;
642                break 'outer;
643            }
644
645            let src = &entities[i].name;
646            let tgt = &entities[j].name;
647            let key = (src.clone(), tgt.clone());
648
649            if seen.contains(&key) {
650                continue;
651            }
652            seen.insert(key);
653
654            rels.push(NewRelationship {
655                source: src.clone(),
656                target: tgt.clone(),
657                relation: DEFAULT_RELATION.to_string(),
658                strength: 0.5,
659                description: None,
660            });
661            for_entity += 1;
662        }
663    }
664
665    // v1.0.20: avisar quando relacionamentos foram truncados antes de cobrir todos os pares possíveis.
666    if hit_cap {
667        tracing::warn!(
668            "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
669            n.saturating_sub(1)
670        );
671    }
672
673    (rels, hit_cap)
674}
675
/// Tokenises `body` and runs NER over overlapping sliding windows
/// (MAX_SEQ_LEN tokens each, advancing by STRIDE), returning entities
/// deduplicated by exact name — first occurrence wins.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // encode(..., false): no special tokens added — we need raw per-token labels.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Phase 1: collect all sliding windows before any inference
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Phase 2: sort by window length ascending to minimise intra-batch padding waste
    windows.sort_by_key(|(ids, _)| ids.len());

    // Phase 3: batched inference with fallback to single-window predict on error
    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    // Windows overlap (STRIDE < MAX_SEQ_LEN), so the same entity can surface in
    // several windows; dedupe by exact name.
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                // Fallback: process each window individually to preserve entities
                for (ids, tokens) in chunk {
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            // Best effort: log and continue with remaining windows.
                            tracing::warn!("janela NER fallback também falhou: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
761
762/// v1.0.22 P1: estende entidades com sufixos numéricos hifenizados ou separados por espaço.
763/// Casos: GPT extraído mas body contém "GPT-5" → reescreve para "GPT-5".
764/// Casos: Claude extraído mas body contém "Claude 4" → reescreve para "Claude 4".
765/// Conservador: só estende se sufixo tiver até 7 caracteres.
766/// v1.0.24 P2-E: sufixo aceita letra ASCII minúscula opcional após dígitos para cobrir
767/// modelos como "GPT-4o", "Llama-5b", "Mistral-8x" (dígitos + [a-z]? + [x\d+]?).
768fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
769    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
770    // Matches: separator + digits + optional decimal + optional lowercase letter
771    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
772    let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
773
774    entities
775        .into_iter()
776        .map(|ent| {
777            // Encontra a primeira ocorrência case-sensitive da entidade no body
778            if let Some(pos) = body.find(&ent.name) {
779                let after_pos = pos + ent.name.len();
780                if after_pos < body.len() {
781                    let after = &body[after_pos..];
782                    if let Some(m) = suffix_re.find(after) {
783                        let suffix = m.as_str();
784                        // Conservative: cap suffix length to 7 chars to avoid grabbing
785                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
786                        if suffix.len() <= 7 {
787                            let extended = format!("{}{}", ent.name, suffix);
788                            return ExtractedEntity {
789                                name: extended,
790                                entity_type: ent.entity_type,
791                            };
792                        }
793                    }
794                }
795            }
796            ent
797        })
798        .collect()
799}
800
801/// Captures versioned model names that BERT NER consistently misses.
802///
803/// BERT NER often classifies tokens like "Claude" or "Llama" as common nouns,
804/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
805/// never sees these candidates and the version suffix gets lost.
806///
807/// This function scans the body with a conservative regex, matching capitalised
808/// words followed by a space-or-hyphen and a small integer. Matches that are not
809/// already covered by an existing entity (case-insensitive) are appended with the
810/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
811/// items downstream.
812///
813/// v1.0.24 P2-D: regex extended to cover:
814/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
815/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
816/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
817///
818/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
819/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
820fn augment_versioned_model_names(
821    entities: Vec<ExtractedEntity>,
822    body: &str,
823) -> Vec<ExtractedEntity> {
824    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
825    // Pattern breakdown:
826    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
827    //   [\s\-]+               — separator: space(s) or hyphen(s)
828    //   \d+(?:\.\d+)?         — version number, optional decimal
829    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
830    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
831    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
832        Regex::new(
833            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
834        )
835        .unwrap()
836    });
837
838    let mut existing_lc: std::collections::HashSet<String> =
839        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
840    let mut result = entities;
841
842    for caps in model_re.captures_iter(body) {
843        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
844        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
845        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
846        if full_match.is_empty() || full_match.len() > 24 {
847            continue;
848        }
849        let normalized_lc = full_match.to_lowercase();
850        if existing_lc.contains(&normalized_lc) {
851            continue;
852        }
853        // Stop appending once the global entity cap is reached to keep parity with
854        // `merge_and_deduplicate` truncation semantics.
855        if result.len() >= MAX_ENTS {
856            break;
857        }
858        existing_lc.insert(normalized_lc);
859        result.push(ExtractedEntity {
860            name: full_match.to_string(),
861            entity_type: "concept".to_string(),
862        });
863    }
864
865    result
866}
867
868fn merge_and_deduplicate(
869    regex_ents: Vec<ExtractedEntity>,
870    ner_ents: Vec<ExtractedEntity>,
871) -> Vec<ExtractedEntity> {
872    // v1.0.23: when multiple sources produce overlapping names ("Open" from BERT
873    // subword leak vs "OpenAI" from regex), prefer the longest candidate. The
874    // previous implementation used a HashSet and kept whichever name appeared
875    // first, occasionally yielding truncated brand names like "Open" instead of
876    // "OpenAI". The new logic resolves collisions using a (lowercase prefix) lookup
877    // that retains the longest match while preserving insertion order via `result`.
878    // v1.0.24: dedup key uses NFKC normalization before lowercasing so that
879    // visually identical names differing only in Unicode combining marks (e.g.
880    // "Café" NFC vs "Cafe\u{301}" NFD) collapse to the same bucket.
881    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
882    let mut result: Vec<ExtractedEntity> = Vec::new();
883    let mut truncated = false;
884
885    let total_input = regex_ents.len() + ner_ents.len();
886    for ent in regex_ents.into_iter().chain(ner_ents) {
887        let key = ent.name.nfkc().collect::<String>().to_lowercase();
888        // Detect prefix collisions in both directions: "open" vs "openai" should
889        // both map to the longest stored candidate. We scan stored keys to find
890        // the longest existing entry that contains or is contained by the new key.
891        let mut collision_idx: Option<usize> = None;
892        for (existing_key, idx) in &by_lc {
893            if existing_key == &key
894                || existing_key.starts_with(&key)
895                || key.starts_with(existing_key)
896            {
897                collision_idx = Some(*idx);
898                break;
899            }
900        }
901        match collision_idx {
902            Some(idx) => {
903                // Replace stored entity only when the new candidate is strictly
904                // longer; otherwise drop the new one. This biases toward the most
905                // specific brand name visible in the corpus.
906                if ent.name.len() > result[idx].name.len() {
907                    let old_key = result[idx].name.nfkc().collect::<String>().to_lowercase();
908                    by_lc.remove(&old_key);
909                    result[idx] = ent;
910                    by_lc.insert(key, idx);
911                }
912            }
913            None => {
914                by_lc.insert(key, result.len());
915                result.push(ent);
916            }
917        }
918        if result.len() >= MAX_ENTS {
919            truncated = true;
920            break;
921        }
922    }
923
924    // v1.0.20: avisar quando truncamento silencioso descarta entidades acima do MAX_ENTS.
925    if truncated {
926        tracing::warn!(
927            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
928        );
929    }
930
931    result
932}
933
934fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
935    extracted
936        .into_iter()
937        .map(|e| NewEntity {
938            name: e.name,
939            entity_type: e.entity_type,
940            description: None,
941        })
942        .collect()
943}
944
945pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
946    let regex_entities = apply_regex_prefilter(body);
947
948    let mut bert_used = false;
949    let ner_entities = match get_or_init_model(paths) {
950        Some(model) => match run_ner_sliding_window(model, body, paths) {
951            Ok(ents) => {
952                bert_used = true;
953                ents
954            }
955            Err(e) => {
956                tracing::warn!("NER falhou, usando apenas regex: {e:#}");
957                Vec::new()
958            }
959        },
960        None => Vec::new(),
961    };
962
963    let merged = merge_and_deduplicate(regex_entities, ner_entities);
964    // v1.0.22: estender entidades NER com sufixos numéricos do body (GPT-5, Claude 4, Python 3).
965    let extended = extend_with_numeric_suffix(merged, body);
966    // v1.0.23: capture versioned model names that BERT NER does not detect on its own
967    // (e.g. "Claude 4", "Llama 3"). Hyphenated variants like "GPT-5" are already covered
968    // by the NER+suffix pipeline above, but space-separated names need a dedicated pass.
969    let with_models = augment_versioned_model_names(extended, body);
970    let entities = to_new_entities(with_models);
971    let (relationships, relationships_truncated) = build_relationships(&entities);
972
973    let extraction_method = if bert_used {
974        "bert+regex-batch".to_string()
975    } else {
976        "regex-only".to_string()
977    };
978
979    let urls = extract_urls(body);
980
981    Ok(ExtractionResult {
982        entities,
983        relationships,
984        relationships_truncated,
985        extraction_method,
986        urls,
987    })
988}
989
/// Stateless marker type for the regex-only extraction backend. Its
/// `Extractor` impl runs only `apply_regex_prefilter` and never touches the
/// BERT NER model.
pub struct RegexExtractor;
991
992impl Extractor for RegexExtractor {
993    fn extract(&self, body: &str) -> Result<ExtractionResult> {
994        let regex_entities = apply_regex_prefilter(body);
995        let entities = to_new_entities(regex_entities);
996        let (relationships, relationships_truncated) = build_relationships(&entities);
997        let urls = extract_urls(body);
998        Ok(ExtractionResult {
999            entities,
1000            relationships,
1001            relationships_truncated,
1002            extraction_method: "regex-only".to_string(),
1003            urls,
1004        })
1005    }
1006}
1007
#[cfg(test)]
mod tests {
    use super::*;

    // Review fix: `cargo test` runs tests on parallel threads by default, so
    // the two `ner_batch_size_*` tests below raced on the process-global env
    // var GRAPHRAG_NER_BATCH_SIZE — one could observe the other's
    // set_var/remove_var mid-assertion and fail intermittently. This mutex
    // serializes every test that mutates env vars. A poisoned lock is
    // recovered (`into_inner`) so one failing test does not cascade.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    fn make_paths() -> AppPaths {
        use std::path::PathBuf;
        AppPaths {
            db: PathBuf::from("/tmp/test.sqlite"),
            models: PathBuf::from("/tmp/test_models"),
        }
    }

    #[test]
    fn regex_email_captura_endereco() {
        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
        // v1.0.20: emails são classificados como "concept" (regex sozinho não distingue pessoa de role).
        assert!(ents
            .iter()
            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_filtra_palavra_regra_pt() {
        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE não devem virar "entidades".
        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
        assert!(
            !ents.iter().any(|e| e.name == "NUNCA"),
            "NUNCA deveria ser filtrado como stopword"
        );
        assert!(
            !ents.iter().any(|e| e.name == "PROIBIDO"),
            "PROIBIDO deveria ser filtrado"
        );
        assert!(
            !ents.iter().any(|e| e.name == "DEVE"),
            "DEVE deveria ser filtrado"
        );
    }

    #[test]
    fn regex_all_caps_aceita_constante_com_underscore() {
        // Constantes técnicas tipo MAX_RETRY, TIMEOUT_MS sempre devem ser aceitas.
        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
    }

    #[test]
    fn regex_all_caps_aceita_acronimo_dominio() {
        // Acrônimos legítimos (não-stopword) devem passar: OPENAI, NVIDIA, GOOGLE.
        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
        assert!(ents.iter().any(|e| e.name == "OPENAI"));
        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
    }

    #[test]
    fn regex_url_nao_aparece_em_apply_regex_prefilter() {
        // v1.0.24 P0-2: URLs foram removidas de apply_regex_prefilter e agora vão para extract_urls.
        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
        assert!(
            !ents.iter().any(|e| e.name.starts_with("https://")),
            "URLs não devem aparecer como entidades após split P0-2"
        );
    }

    #[test]
    fn extract_urls_captura_https() {
        let urls = extract_urls("veja https://docs.rs/crate para detalhes");
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].url, "https://docs.rs/crate");
        assert!(urls[0].offset > 0);
    }

    #[test]
    fn extract_urls_trim_sufixo_pontuacao() {
        let urls = extract_urls("link: https://example.com/path. fim");
        assert!(!urls.is_empty());
        assert!(
            !urls[0].url.ends_with('.'),
            "sufixo ponto deve ser removido"
        );
    }

    #[test]
    fn extract_urls_deduplica_repetidas() {
        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
        let urls = extract_urls(body);
        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
    }

    #[test]
    fn regex_uuid_captura_identificador() {
        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
        assert!(ents.iter().any(|e| e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_captura_constante() {
        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
    }

    #[test]
    fn regex_all_caps_ignora_palavras_curtas() {
        let ents = apply_regex_prefilter("use AI em seu projeto");
        assert!(
            !ents.iter().any(|e| e.name == "AI"),
            "AI tem apenas 2 chars, deve ser ignorado"
        );
    }

    #[test]
    fn iob_decodifica_per_para_person() {
        let tokens = vec![
            "John".to_string(),
            "Doe".to_string(),
            "trabalhou".to_string(),
        ];
        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1);
        assert_eq!(ents[0].entity_type, "person");
        assert!(ents[0].name.contains("John"));
    }

    #[test]
    fn iob_strip_subword_b_prefix() {
        // v1.0.21 P0: BERT às vezes emite ##AI com B-prefix (subword confuso).
        // Deve mergear na entidade ativa em vez de criar entidade fantasma "##AI".
        let tokens = vec!["Open".to_string(), "##AI".to_string()];
        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
            "deveria mergear ##AI ou descartar"
        );
    }

    #[test]
    fn iob_subword_orphan_descarta() {
        // v1.0.21 P0: subword órfão sem entidade ativa não deve virar entidade.
        let tokens = vec!["##AI".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.is_empty(),
            "subword órfão sem entidade ativa deve ser descartado"
        );
    }

    #[test]
    fn iob_descarta_date() {
        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(ents.is_empty(), "DATE deve ser descartado");
    }

    #[test]
    fn iob_mapeia_org_para_project() {
        let tokens = vec!["Empresa".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "project");
    }

    #[test]
    fn iob_mapeia_org_sdk_para_tool() {
        let tokens = vec!["tokio-sdk".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "tool");
    }

    #[test]
    fn iob_mapeia_loc_para_concept() {
        let tokens = vec!["Brasil".to_string()];
        let labels = vec!["B-LOC".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "concept");
    }

    #[test]
    fn build_relationships_respeitam_max_rels() {
        let entities: Vec<NewEntity> = (0..20)
            .map(|i| NewEntity {
                name: format!("entidade_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let (rels, truncated) = build_relationships(&entities);
        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
        if rels.len() == max_rels {
            assert!(truncated, "truncated deve ser true quando atingiu o cap");
        }
    }

    #[test]
    fn build_relationships_sem_duplicatas() {
        let entities: Vec<NewEntity> = (0..5)
            .map(|i| NewEntity {
                name: format!("ent_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let (rels, _truncated) = build_relationships(&entities);
        let mut pares: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        for r in &rels {
            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "par duplicado encontrado");
        }
    }

    #[test]
    fn merge_deduplica_por_nome_lowercase() {
        let a = vec![ExtractedEntity {
            name: "Rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let b = vec![ExtractedEntity {
            name: "rust".to_string(),
            entity_type: "tool".to_string(),
        }];
        let merged = merge_and_deduplicate(a, b);
        assert_eq!(merged.len(), 1, "rust e Rust são a mesma entidade");
    }

    #[test]
    fn regex_extractor_implementa_trait() {
        let extractor = RegexExtractor;
        let result = extractor
            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
            .unwrap();
        assert!(!result.entities.is_empty());
    }

    #[test]
    fn extract_retorna_ok_sem_modelo() {
        // Sem modelo baixado, deve retornar Ok com apenas entidades regex
        let paths = make_paths();
        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
        let result = extract_graph_auto(body, &paths).unwrap();
        assert!(result
            .entities
            .iter()
            .any(|e| e.name.contains("teste@exemplo.com")));
    }

    #[test]
    fn stopwords_filter_v1024_terms() {
        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
        // by apply_regex_prefilter so they do not appear as entities.
        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
        let ents = apply_regex_prefilter(body);
        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
        for word in &[
            "ACEITE",
            "ACK",
            "ACL",
            "BORDA",
            "CHECKLIST",
            "COMPLETED",
            "CONFIRME",
            "DEVEMOS",
            "DONE",
            "FIXED",
            "NEGUE",
            "PENDING",
            "PLAN",
            "PODEMOS",
            "RECUSE",
            "TOKEN",
            "VAMOS",
        ] {
            assert!(
                !names.contains(word),
                "v1.0.24 stopword {word} should be filtered but was found in entities"
            );
        }
    }

    #[test]
    fn dedup_normalizes_unicode_combining_marks() {
        // v1.0.24 P1-E: "Café" (NFC precomposed) and "Cafe\u{301}" (NFD with
        // combining acute accent) must deduplicate to a single entity after NFKC
        // normalization.
        let nfc = vec![ExtractedEntity {
            name: "Café".to_string(),
            entity_type: "concept".to_string(),
        }];
        // Build the NFD form: 'e' followed by combining acute accent U+0301
        let nfd_name = "Cafe\u{301}".to_string();
        let nfd = vec![ExtractedEntity {
            name: nfd_name,
            entity_type: "concept".to_string(),
        }];
        let merged = merge_and_deduplicate(nfc, nfd);
        assert_eq!(
            merged.len(),
            1,
            "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
        );
    }

    // ── predict_batch regression tests ──────────────────────────────────────

    #[test]
    fn predict_batch_output_count_matches_input() {
        // Verify that predict_batch returns exactly one Vec<String> per window
        // without requiring a real model.  We test the shape contract by
        // constructing the padding logic manually and asserting counts.
        //
        // Two windows of different lengths: 3 tokens and 5 tokens.
        let w1_ids: Vec<u32> = vec![101, 100, 102];
        let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
        let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
        let w2_tok: Vec<String> = vec![
            "[CLS]".into(),
            "world".into(),
            "foo".into(),
            "bar".into(),
            "[SEP]".into(),
        ];
        let windows: Vec<(Vec<u32>, Vec<String>)> =
            vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

        // Verify padding logic and output length contracts using tensor operations
        // that do NOT require BertModel::forward.
        let device = Device::Cpu;
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
        assert_eq!(max_len, 5, "max_len deve ser 5");

        let mut padded_ids: Vec<Tensor> = Vec::new();
        for (ids, _) in &windows {
            let len = ids.len();
            let pad_right = max_len - len;
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
            let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
            assert_eq!(
                t.dims(),
                &[max_len],
                "cada janela deve ter shape (max_len,) após padding"
            );
            padded_ids.push(t);
        }

        let stacked = Tensor::stack(&padded_ids, 0).unwrap();
        assert_eq!(
            stacked.dims(),
            &[2, max_len],
            "stack deve produzir (batch_size=2, max_len=5)"
        );

        // Verify narrow preserves only real tokens for each window
        // (simulates what predict_batch does after classifier.forward)
        let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9]; // batch×seq×num_labels=9
        let fake_logits =
            Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
        for (i, (ids, _)) in windows.iter().enumerate() {
            let real_len = ids.len();
            let example = fake_logits.get(i).unwrap();
            let sliced = example.narrow(0, 0, real_len).unwrap();
            assert_eq!(
                sliced.dims(),
                &[real_len, 9],
                "narrow deve preservar apenas {real_len} tokens reais"
            );
        }
    }

    #[test]
    fn predict_batch_empty_windows_returns_empty() {
        // predict_batch with no windows must return an empty Vec, not panic.
        // We test the guard logic directly on the batch size/max_len path.
        let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
        assert_eq!(max_len, 0, "zero windows → max_len 0");
        // The real predict_batch returns Ok(vec![]) when max_len == 0.
        // We assert the expected output shape by reproducing the guard here.
        let result: Vec<Vec<String>> = if max_len == 0 {
            Vec::new()
        } else {
            unreachable!()
        };
        assert!(result.is_empty());
    }

    #[test]
    fn ner_batch_size_default_is_8() {
        // Verify that ner_batch_size() returns the documented default when the
        // env var is absent.  We clear the var to avoid cross-test contamination.
        // Serialized via ENV_LOCK: this test races with
        // ner_batch_size_env_override_clamped on the same process-global var.
        let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }

    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Override via env var; values outside [1, 32] must be clamped.
        // Serialized via ENV_LOCK: mutating the env var concurrently with
        // ner_batch_size_default_is_8 made both tests flaky.
        let _guard = ENV_LOCK.lock().unwrap_or_else(|p| p.into_inner());
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }

    #[test]
    fn extraction_method_regex_only_unchanged() {
        // RegexExtractor always returns "regex-only" regardless of NER_MODEL OnceLock state.
        // This guards against accidentally changing the regex-only fallback string.
        let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
        assert_eq!(
            result.extraction_method, "regex-only",
            "RegexExtractor deve retornar regex-only"
        );
    }

    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---

    #[test]
    fn extend_suffix_pure_numeric_unchanged() {
        // Existing behaviour: pure-numeric suffix must still work after P2-E.
        let ents = vec![ExtractedEntity {
            name: "GPT".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
        assert_eq!(
            result[0].name, "GPT-5",
            "sufixo puramente numérico deve ser estendido"
        );
    }

    #[test]
    fn extend_suffix_alphanumeric_letter_after_digit() {
        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
        let ents = vec![ExtractedEntity {
            name: "GPT".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
        assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
    }

    #[test]
    fn extend_suffix_alphanumeric_b_suffix() {
        // P2-E: "5b" suffix (digit + 'b') must be captured.
        let ents = vec![ExtractedEntity {
            name: "Llama".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
        assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
    }

    #[test]
    fn extend_suffix_alphanumeric_x_suffix() {
        // P2-E: "8x" suffix (digit + 'x') must be captured.
        let ents = vec![ExtractedEntity {
            name: "Mistral".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
        assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
    }

    // --- P2-D: augment_versioned_model_names extended regex ---

    #[test]
    fn augment_versioned_gpt4o() {
        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
        let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
        assert!(
            result.iter().any(|e| e.name == "GPT-4o"),
            "GPT-4o deve ser capturado pelo augment, achados: {:?}",
            result.iter().map(|e| &e.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn augment_versioned_claude_4_sonnet() {
        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
        let result =
            augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
        assert!(
            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
            "Claude 4 Sonnet deve ser capturado, achados: {:?}",
            result.iter().map(|e| &e.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn augment_versioned_llama_3_pro() {
        // P2-D: "Llama 3 Pro" must be captured with release tier.
        let result =
            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
        assert!(
            result.iter().any(|e| e.name == "Llama 3 Pro"),
            "Llama 3 Pro deve ser capturado, achados: {:?}",
            result.iter().map(|e| &e.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn augment_versioned_mixtral_8x7b() {
        // P2-D: "Mixtral 8x7B" composite version must be captured.
        let result =
            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
        assert!(
            result.iter().any(|e| e.name == "Mixtral 8x7B"),
            "Mixtral 8x7B deve ser capturado, achados: {:?}",
            result.iter().map(|e| &e.name).collect::<Vec<_>>()
        );
    }

    #[test]
    fn augment_versioned_does_not_duplicate_existing() {
        // P2-D back-compat: entities already present must not be duplicated.
        let existing = vec![ExtractedEntity {
            name: "Claude 4".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
        let count = result.iter().filter(|e| e.name == "Claude 4").count();
        assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
    }
}
1549}