sqlite_graphrag/extraction.rs

1//! Entity and URL extraction pipeline (NER + regex prefilter).
2//!
3//! Runs named-entity recognition and regex heuristics to extract structured
4//! entities and hyperlinks from raw memory bodies before embedding.
5
6use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use anyhow::{Context, Result};
10use ort::session::{builder::GraphOptimizationLevel, Session};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use unicode_normalization::UnicodeNormalization;
14
15use crate::entity_type::EntityType;
16use crate::paths::AppPaths;
17use crate::storage::entities::{NewEntity, NewRelationship};
18
19const MAX_ENTS: usize = 30;
20// v1.0.31 A9: only consumed by the legacy `build_relationships`, which is
21// kept for unit tests pinning the cap behaviour.
22#[cfg(test)]
23const TOP_K_RELATIONS: usize = 5;
24const DEFAULT_RELATION: &str = "mentions";
25const MIN_ENTITY_CHARS: usize = 2;
26
27static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
28static REGEX_URL: OnceLock<Regex> = OnceLock::new();
29static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
30static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
31// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
32static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-2: captures CamelCase brand names that the NER model often misses (e.g. "OpenAI", "PostgreSQL").
34static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35
36// v1.0.20: stopwords to filter common PT-BR/EN rule words captured as ALL_CAPS.
37// Without this filter, technical PT-BR corpora containing CAPS-formatted rules (NUNCA, PROIBIDO, DEVE)
38// generated ~70% of "garbage entities". We keep identifiers like MAX_RETRY (with underscore).
39// v1.0.22: expanded list with terms observed in 495-file flowaiper stress test.
40// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns (BANCO, CASO),
41// HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
42// v1.0.24: added 17 new terms observed in audit v1.0.23: generic status words (COMPLETED, DONE,
43// FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE, RECUSE), PT-BR modal/
44// common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns (BORDA, CHECKLIST, PLAN, TOKEN),
45// and common abbreviations (ACK, ACL).
46// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL)
47// and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE, PASSO, SEÇÃO) to prevent section markers
48// and generic tech terms from being extracted as entities.
49// v1.0.31 A11: added PT-BR uppercase noise observed during ingest of technical Portuguese
50// rule documents — common nouns/adjectives written in caps as visual emphasis (ADAPTER, PROJETO,
51// PASSIVA, ATIVA, SOMENTE, LEITURA, ESCRITA, OBRIGATORIA, EXEMPLO, REGRA, DEFAULT). Each one
52// kept leaking as a "concept" entity and inflating the graph with non-entities.
53const ALL_CAPS_STOPWORDS: &[&str] = &[
54    "ACEITE",
55    "ACID",
56    "ACK",
57    "ACL",
58    "ACRESCENTADO",
59    "ADAPTER",
60    "ADICIONADA",
61    "ADICIONADAS",
62    "ADICIONADO",
63    "ADICIONADOS",
64    "ADICIONAR",
65    "AGENTS",
66    "AINDA",
67    "ALL",
68    "ALTA",
69    "ALWAYS",
70    "APENAS",
71    "API",
72    "ARTEFATOS",
73    "ATIVA",
74    "ATIVO",
75    "BAIXA",
76    "BANCO",
77    "BLOQUEAR",
78    "BORDA",
79    "BUG",
80    "CAPÍTULO",
81    "CASO",
82    "CEO",
83    "CHECKLIST",
84    "CLARO",
85    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
86    "CLI",
87    "COMPLETED",
88    "CONFIRMADO",
89    "CONFIRMARAM",
90    "CONFIRME",
91    "CONFIRMEI",
92    "CONFIRMOU",
93    "CONTRATO",
94    "CRIE",
95    "CRÍTICO",
96    "CRITICAL",
97    "CSV",
98    "DDL",
99    "DEFAULT",
100    "DEFINIR",
101    "DEPARTMENT",
102    "DESC",
103    "DEVE",
104    "DEVEMOS",
105    "DISCO",
106    "DONE",
107    "DSL",
108    "DTO",
109    "EFEITO",
110    "ENTRADA",
111    "EOF",
112    "EPERM",
113    "ERROR",
114    "ESCREVA",
115    "ESCRITA",
116    "ESRCH",
117    "ESSA",
118    "ESSE",
119    "ESSENCIAL",
120    "ESTA",
121    "ESTADO",
122    "ESTE",
123    "ETAPA",
124    "EVITAR",
125    "EXEMPLO",
126    "EXPANDIR",
127    "EXPOR",
128    "FALHA",
129    "FASE",
130    "FATO",
131    "FIFO",
132    "FIXED",
133    "FIXME",
134    "FLUXO",
135    "FONTES",
136    "FORBIDDEN",
137    "FUNCIONA",
138    "GNU",
139    "HACK",
140    "HEARTBEAT",
141    "HTTP",
142    "HTTPS",
143    "INATIVO",
144    "JAMAIS",
145    "JSON",
146    "JWT",
147    "LEITURA",
148    "LLM",
149    "MCP",
150    "MESMO",
151    "METADADOS",
152    "MUST",
153    "NDJSON",
154    "NEGUE",
155    "NEVER",
156    "NOTE",
157    "NUNCA",
158    "OBRIGATORIA",
159    "OBRIGATÓRIO",
160    "OBSERVEI",
161    "PADRÃO",
162    "PASSIVA",
163    "PASSO",
164    "PENDING",
165    "PGID",
166    "PID",
167    "PLAN",
168    "PODEMOS",
169    "PONTEIROS",
170    "PREFERIR",
171    "PROIBIDO",
172    "PROJETO",
173    "RECUSE",
174    "REGRA",
175    "REGRAS",
176    "REMOVIDAS",
177    "REQUIRED",
178    "REQUISITO",
179    "REST",
180    "SEÇÃO",
181    "SEMPRE",
182    "SHALL",
183    "SHOULD",
184    "SIGTERM",
185    "SOMENTE",
186    "SOUL",
187    "TODAS",
188    "TODO",
189    "TODOS",
190    "TOKEN",
191    "TOOLS",
192    "TSV",
193    "TUI",
194    "UI",
195    "URL",
196    "USAR",
197    "VALIDAR",
198    "VAMOS",
199    "VOCÊ",
200    "WARNING",
201    "XML",
202    "YAML",
203];
204
205// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
206// Filtered in apply_regex_prefilter (regex_all_caps path).
207const HTTP_METHODS: &[&str] = &[
208    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
209];
210
211fn is_filtered_all_caps(token: &str) -> bool {
212    // Identifiers containing underscore are preserved (e.g. MAX_RETRY, FLOWAIPER_API_KEY)
213    let is_identifier = token.contains('_');
214    if is_identifier {
215        return false;
216    }
217    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
218}
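
// Illustrative behaviour of the filter above (a sketch, not an exhaustive list):
//   is_filtered_all_caps("NUNCA")     -> true   (PT-BR rule stopword)
//   is_filtered_all_caps("POST")      -> true   (HTTP method)
//   is_filtered_all_caps("MAX_RETRY") -> false  (underscore identifier, preserved)
//   is_filtered_all_caps("NVIDIA")    -> false  (legitimate acronym, not a stopword)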
219
220fn regex_email() -> &'static Regex {
221    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
222    REGEX_EMAIL.get_or_init(|| {
223        Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224            .expect("compile-time validated email regex literal")
225    })
226}
227
228fn regex_url() -> &'static Regex {
229    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
230    REGEX_URL.get_or_init(|| {
231        Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232            .expect("compile-time validated URL regex literal")
233    })
234}
235
236fn regex_uuid() -> &'static Regex {
237    // SAFETY: regex literal validated at compile-time via test::regex_literals_compile
238    REGEX_UUID.get_or_init(|| {
239        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240            .expect("compile-time validated UUID regex literal")
241    })
242}
243
244fn regex_all_caps() -> &'static Regex {
245    REGEX_ALL_CAPS.get_or_init(|| {
246        Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247            .expect("compile-time validated all-caps regex literal")
248    })
249}
250
251fn regex_section_marker() -> &'static Regex {
252    REGEX_SECTION_MARKER.get_or_init(|| {
253        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1",
254        // "Camada 5", "Passo 2", etc. v1.0.36 (H5): added "Camada" after audit found
255        // "Camada 1".."Camada 5" leaking through into entity extraction with degree>=3.
256        // Accented characters expressed as escapes to keep this source file ASCII-only
257        // per the project language policy. Pattern is equivalent to:
258        //   \b(?:Etapa|Fase|Passo|Camada|Se\xe7\xe3o|Cap\xedtulo)\s+\d+\b
259        Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260            .expect("compile-time validated section marker regex literal")
261    })
262}
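
// Illustrative matches for the section-marker pattern (a sketch): "Etapa 3", "Fase 1",
// "Passo 2", "Camada 5", "Seção 10", "Capítulo 2". A bare "Etapa" with no trailing
// number does not match, so it is left untouched for the other passes.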
263
264fn regex_brand_camel() -> &'static Regex {
265    REGEX_BRAND_CAMEL.get_or_init(|| {
266        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
267        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
268        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269            .expect("compile-time validated CamelCase brand regex literal")
270    })
271}
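
// Illustrative matches for the CamelCase pattern (a sketch): "OpenAI", "PostgreSQL",
// "ChatGPT". Single-capital names such as "Claude" or "Rust" do not match because the
// pattern requires a second uppercase letter after the lowercase run.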
272
273#[derive(Debug, Clone, PartialEq)]
274pub struct ExtractedEntity {
275    pub name: String,
276    pub entity_type: EntityType,
277}
278
279/// URL with source offset extracted from the memory body.
280#[derive(Debug, Clone)]
281pub struct ExtractedUrl {
282    pub url: String,
283    /// Byte position in the body where the URL was found.
284    pub offset: usize,
285}
286
287#[derive(Debug, Clone)]
288pub struct ExtractionResult {
289    pub entities: Vec<NewEntity>,
290    pub relationships: Vec<NewRelationship>,
    /// True when relationship building hit the configured cap before covering all
    /// co-occurring entity pairs. Exposed in RememberResponse so callers can detect
    /// when relationships were cut.
293    pub relationships_truncated: bool,
294    /// Extraction method used: `"gliner-<variant>+regex"` or `"regex-only"`.
295    /// Useful for auditing, metrics and user reports.
296    pub extraction_method: String,
297    /// URLs extracted from the body — stored separately from graph entities.
298    pub urls: Vec<ExtractedUrl>,
299}
300
301pub trait Extractor: Send + Sync {
302    fn extract(&self, body: &str) -> Result<ExtractionResult>;
303}
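
// Minimal usage sketch for the trait (illustrative only; assumes a caller that can
// propagate anyhow::Result). `RegexExtractor`, defined near the end of this file, is
// the regex-only implementation:
//
//     let extractor = RegexExtractor;
//     let result = extractor.extract("contact: dev@acme.io")?;
//     assert_eq!(result.extraction_method, "regex-only");
//
// GLiNER-backed extraction does not go through this trait; it is exposed via
// `extract_graph_auto` further down.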
304
305/// GLiNER ONNX model quantization variant.
306#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
307pub enum GlinerVariant {
308    Fp32,
309    Fp16,
310    Int8,
311    Q4,
312    Q4f16,
313}
314
315impl GlinerVariant {
316    /// ONNX filename for this variant in the HuggingFace repository.
317    pub fn as_filename(self) -> &'static str {
318        match self {
319            Self::Fp32 => "model.onnx",
320            Self::Fp16 => "model_fp16.onnx",
321            Self::Int8 => "model_quantized.onnx",
322            Self::Q4 => "model_q4.onnx",
323            Self::Q4f16 => "model_q4f16.onnx",
324        }
325    }
326
327    /// Approximate model size for user-facing messages.
328    pub fn display_size(self) -> &'static str {
329        match self {
330            Self::Fp32 => "1.1 GB",
331            Self::Fp16 => "580 MB",
332            Self::Int8 => "349 MB",
333            Self::Q4 => "894 MB",
334            Self::Q4f16 => "472 MB",
335        }
336    }
337}
338
339impl std::fmt::Display for GlinerVariant {
340    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341        match self {
342            Self::Fp32 => f.write_str("fp32"),
343            Self::Fp16 => f.write_str("fp16"),
344            Self::Int8 => f.write_str("int8"),
345            Self::Q4 => f.write_str("q4"),
346            Self::Q4f16 => f.write_str("q4f16"),
347        }
348    }
349}
350
351impl std::str::FromStr for GlinerVariant {
352    type Err = anyhow::Error;
353    fn from_str(s: &str) -> Result<Self> {
354        match s.to_lowercase().as_str() {
355            "fp32" => Ok(Self::Fp32),
356            "fp16" => Ok(Self::Fp16),
357            "int8" => Ok(Self::Int8),
358            "q4" => Ok(Self::Q4),
359            "q4f16" => Ok(Self::Q4f16),
360            other => {
361                anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362            }
363        }
364    }
365}
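
// Round-trip sketch: the Display and FromStr impls above agree on the string form,
// e.g. "q4f16".parse::<GlinerVariant>() yields GlinerVariant::Q4f16 and
// GlinerVariant::Q4f16.to_string() yields "q4f16"; any other string is an error.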
366
367const GLINER_MAX_WIDTH: usize = 12;
368const GLINER_MAX_SEQ_LEN: usize = 384;
369const GLINER_ENT_TOKEN: &str = "<<ENT>>";
370const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371
372const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
373    ("person", EntityType::Person),
374    ("organization", EntityType::Organization),
375    ("location", EntityType::Location),
376    ("date", EntityType::Date),
377    ("project", EntityType::Project),
378    ("tool", EntityType::Tool),
379    ("file", EntityType::File),
380    ("concept", EntityType::Concept),
381    ("decision", EntityType::Decision),
382    ("incident", EntityType::Incident),
383    ("dashboard", EntityType::Dashboard),
384    ("issue tracker", EntityType::IssueTracker),
385    ("memory", EntityType::Memory),
386];
387
388struct GlinerModel {
389    session: std::sync::Mutex<Session>,
390    tokenizer: tokenizers::Tokenizer,
391    #[allow(dead_code)]
392    variant: GlinerVariant,
393}
394
395impl GlinerModel {
396    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
397        let model_path = model_dir.join(variant.as_filename());
398        let tokenizer_path = model_dir.join("tokenizer.json");
399
400        let session = Session::builder()
401            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
402            .with_optimization_level(GraphOptimizationLevel::Level3)
403            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
404            .commit_from_file(&model_path)
405            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;
406
407        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
408            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;
409
410        Ok(Self {
411            session: std::sync::Mutex::new(session),
412            tokenizer,
413            variant,
414        })
415    }
416
417    fn predict(
418        &self,
419        body: &str,
420        entity_labels: &[(&str, EntityType)],
421        threshold: f32,
422    ) -> Result<Vec<ExtractedEntity>> {
423        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
424        let words: Vec<&str> = body.split_whitespace().collect();
425        if words.is_empty() {
426            return Ok(Vec::new());
427        }
428
429        // Cap words to fit within model sequence length (accounting for label tokens)
430        let label_token_count = label_names.len() * 2 + 1;
431        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
432        let words = if words.len() > max_words {
433            tracing::warn!(
434                original_words = words.len(),
435                capped_words = max_words,
436                "GLiNER input truncated to fit model sequence length"
437            );
438            &words[..max_words]
439        } else {
440            &words[..]
441        };
442        let num_words = words.len();
443
444        // Build prompt: [<<ENT>>, label1, <<ENT>>, label2, ..., <<SEP>>, word1, word2, ...]
445        let mut prompt_tokens: Vec<String> =
446            Vec::with_capacity(label_names.len() * 2 + 1 + num_words);
447        for label in &label_names {
448            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
449            prompt_tokens.push((*label).to_string());
450        }
451        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
452        for word in words {
453            prompt_tokens.push((*word).to_string());
454        }
455
456        // Encode each token individually (word-by-word encoding per GLiNER protocol)
457        let mut all_ids: Vec<i64> = Vec::new();
458        let mut all_attention: Vec<i64> = Vec::new();
459        let mut all_word_mask: Vec<i64> = Vec::new();
460
461        // BOS token
462        all_ids.push(1);
463        all_attention.push(1);
464        all_word_mask.push(0);
465
466        let text_offset = label_names.len() * 2 + 1;
467        let mut word_id: i64 = 0;
468
469        for (pos, token_str) in prompt_tokens.iter().enumerate() {
470            let encoding = self
471                .tokenizer
472                .encode(token_str.as_str(), false)
473                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
474            let ids = encoding.get_ids();
475            let is_text_token = pos >= text_offset;
476
477            for (sub_idx, &id) in ids.iter().enumerate() {
478                all_ids.push(id as i64);
479                all_attention.push(1);
480                if is_text_token && sub_idx == 0 {
481                    word_id += 1;
482                    all_word_mask.push(word_id);
483                } else {
484                    all_word_mask.push(0);
485                }
486            }
487        }
488
489        // EOS token
490        all_ids.push(2);
491        all_attention.push(1);
492        all_word_mask.push(0);
493
494        let seq_len = all_ids.len();
495
496        // Build ORT tensors using Tensor::from_array((shape, data)) API
497        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
498            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
499        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
500            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
501        let t_words_mask =
502            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
503                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
504        let t_text_lengths =
505            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
506                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;
507
508        // Build span tensors
509        let num_spans = num_words * GLINER_MAX_WIDTH;
510        let mut span_idx_data = vec![0i64; num_spans * 2];
511        let mut span_mask_data = vec![false; num_spans];
512
513        for start in 0..num_words {
514            let remaining = num_words - start;
515            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
516            for width in 0..actual_max_width {
517                let dim = start * GLINER_MAX_WIDTH + width;
518                span_idx_data[dim * 2] = start as i64;
519                span_idx_data[dim * 2 + 1] = (start + width) as i64;
520                span_mask_data[dim] = true;
521            }
522        }
523
524        let t_span_idx =
525            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
526                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
527        let t_span_mask =
528            ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
529                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;
530
531        // Run inference — Session::run requires &mut Session; bind guard first.
532        let mut session_guard = self
533            .session
534            .lock()
535            .map_err(|_| anyhow::anyhow!("GLiNER session mutex poisoned"))?;
536        let outputs = session_guard
537            .run(ort::inputs![
538                "input_ids" => t_input_ids,
539                "attention_mask" => t_attention,
540                "words_mask" => t_words_mask,
541                "text_lengths" => t_text_lengths,
542                "span_idx" => t_span_idx,
543                "span_mask" => t_span_mask
544            ])
545            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;
546
547        // Extract logits: [1, num_words, max_width, num_classes]
548        // try_extract_tensor returns (&Shape, &[f32]); index manually.
549        let (logits_shape, logits_data) = outputs["logits"]
550            .try_extract_tensor::<f32>()
551            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;
552
553        let num_classes = label_names.len();
554        // Expected shape: [1, num_words, GLINER_MAX_WIDTH, num_classes]
555        // Shape derefs to &[i64] so we can index directly.
556        let max_width = logits_shape
557            .get(2)
558            .copied()
559            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
560        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;
561
562        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();
563
564        for start in 0..num_words {
565            for width in 0..max_width {
566                let end = start + width;
567                if end >= num_words {
568                    break;
569                }
570                for class_idx in 0..nc.min(num_classes) {
571                    // flat index: batch=0 * (num_words*max_width*nc) + start*(max_width*nc) + width*nc + class_idx
572                    let flat = start * (max_width * nc) + width * nc + class_idx;
573                    if flat >= logits_data.len() {
574                        break;
575                    }
576                    let raw = logits_data[flat];
577                    let score = 1.0 / (1.0 + (-raw).exp());
578                    if score >= threshold {
579                        candidates.push((start, end, class_idx, score));
580                    }
581                }
582            }
583        }
584
585        // Sort by score descending for greedy NMS
586        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));
587
588        // Greedy non-maximum suppression
589        let mut used = vec![false; num_words];
590        let mut entities: Vec<ExtractedEntity> = Vec::new();
591
592        for (start, end, class_idx, _score) in &candidates {
593            let overlap = (*start..=*end).any(|i| used[i]);
594            if overlap {
595                continue;
596            }
597            for flag in used.iter_mut().take(*end + 1).skip(*start) {
598                *flag = true;
599            }
600            let text = words[*start..=*end].join(" ");
601            if text.len() < MIN_ENTITY_CHARS {
602                continue;
603            }
604            let entity_type = entity_labels[*class_idx].1;
605            entities.push(ExtractedEntity {
606                name: text,
607                entity_type,
608            });
609            if entities.len() >= MAX_ENTS {
610                break;
611            }
612        }
613
614        Ok(entities)
615    }
616}
617
618static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
619
620fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
621    paths.models.join(format!("gliner-multi-v2.1/{variant}"))
622}
623
624fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
625    let dir = gliner_model_dir(paths, variant);
626    std::fs::create_dir_all(&dir)
627        .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;
628
629    let model_file = dir.join(variant.as_filename());
630    let tokenizer_file = dir.join("tokenizer.json");
631
632    if model_file.exists() && tokenizer_file.exists() {
633        return Ok(dir);
634    }
635
636    let repo = crate::constants::gliner_model_repo();
637    tracing::info!(
638        "Downloading GLiNER model ({variant}, ~{})...",
639        variant.display_size()
640    );
641    crate::output::emit_progress_i18n(
642        &format!(
643            "Downloading GLiNER model ({variant}, ~{})...",
644            variant.display_size()
645        ),
646        &format!(
647            "Baixando modelo GLiNER ({variant}, ~{})...",
648            variant.display_size()
649        ),
650    );
651
652    let api = huggingface_hub::api::sync::Api::new().context("creating HF Hub client")?;
653    let hf_repo = api.model(repo);
654
655    let remote_model = format!("onnx/{}", variant.as_filename());
656    if !model_file.exists() {
657        let src = hf_repo
658            .get(&remote_model)
659            .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
660        std::fs::copy(&src, &model_file)
661            .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
662    }
663
664    if !tokenizer_file.exists() {
665        let src = hf_repo
666            .get("tokenizer.json")
667            .context("downloading tokenizer.json from HF Hub")?;
668        std::fs::copy(&src, &tokenizer_file).context("copying tokenizer.json to cache")?;
669    }
670
671    Ok(dir)
672}
673
674fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
675    let dir = ensure_gliner_model_files(paths, variant)?;
676    GlinerModel::load(&dir, variant)
677}
678
679fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
680    GLINER_MODEL
681        .get_or_init(|| match load_gliner_model(paths, variant) {
682            Ok(m) => Some(m),
683            Err(e) => {
684                tracing::warn!("GLiNER model unavailable (graceful degradation): {e:#}");
685                None
686            }
687        })
688        .as_ref()
689}
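
// Note on the OnceLock above: the first call decides the cached value for the whole
// process, so a later call with a different `variant` reuses whichever model (or None)
// was stored on the first attempt instead of loading the newly requested variant.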
690
691fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
692    let mut entities = Vec::with_capacity(16);
693    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
694
695    let add = |entities: &mut Vec<ExtractedEntity>,
696               seen: &mut std::collections::HashSet<String>,
697               name: &str,
698               entity_type: EntityType| {
699        let name = name.trim().to_string();
700        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
701            entities.push(ExtractedEntity { name, entity_type });
702        }
703    };
704
705    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
706    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
707    let cleaned = regex_section_marker().replace_all(body, " ");
708    let cleaned = cleaned.as_ref();
709
710    for m in regex_email().find_iter(cleaned) {
711        // v1.0.20: email is "concept" (regex alone cannot distinguish person from mailing list/role).
712        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
713    }
714    for m in regex_uuid().find_iter(cleaned) {
715        add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
716    }
717    for m in regex_all_caps().find_iter(cleaned) {
718        let candidate = m.as_str();
        // v1.0.22: consolidated filter (stopwords + HTTP methods); preserves identifiers with an underscore.
720        if !is_filtered_all_caps(candidate) {
721            add(&mut entities, &mut seen, candidate, EntityType::Concept);
722        }
723    }
    // v1.0.25 P0-2: capture CamelCase brand names that the NER model often misses.
725    // Maps to "organization" (V008 schema) because brand names are typically organisations.
726    for m in regex_brand_camel().find_iter(cleaned) {
727        let name = m.as_str();
728        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
729        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
730            add(&mut entities, &mut seen, name, EntityType::Organization);
731        }
732    }
733
734    entities
735}
736
737/// Extracts URLs from a memory body, deduplicated by text.
738/// URLs are stored in the `memory_urls` table separately from graph entities.
/// v1.0.24: split out of apply_regex_prefilter, whose URL block used to pollute entities with entity_type='concept'.
740pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
741    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
742    let mut result = Vec::with_capacity(4);
743    for m in regex_url().find_iter(body) {
744        let raw = m.as_str();
745        let cleaned = raw
746            .trim_end_matches('`')
747            .trim_end_matches(',')
748            .trim_end_matches('.')
749            .trim_end_matches(';')
750            .trim_end_matches(')')
751            .trim_end_matches(']')
752            .trim_end_matches('}');
753        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
754            result.push(ExtractedUrl {
755                url: cleaned.to_string(),
756                offset: m.start(),
757            });
758        }
759    }
760    result
761}
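
// Illustrative behaviour (a sketch): extract_urls("see https://docs.rs/crate.") yields
// one ExtractedUrl { url: "https://docs.rs/crate", offset: 4 }; the trailing '.' is
// trimmed and a repeated occurrence of the same URL is skipped.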
762
763/// Returns (relationships, truncated) where truncated is true when the cap was hit
764/// before all entity pairs were covered. Exposed in RememberResponse as
765/// `relationships_truncated` so callers can decide whether to increase the cap.
766///
767/// v1.0.31 A9: superseded by `build_relationships_by_sentence_cooccurrence` for
768/// the auto-extraction pipeline because the legacy pairwise scheme produces a
769/// dense C(N,2) graph polluted with co-mentions across unrelated paragraphs.
770/// Kept for unit tests that pin the cap behaviour and for callers that lack a
771/// body string.
772#[cfg(test)]
773fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
774    if entities.len() < 2 {
775        return (Vec::new(), false);
776    }
777
778    // v1.0.22: cap configurable via env var (constants::max_relationships_per_memory).
779    // Allows users with dense corpora to increase beyond the default 50.
780    let max_rels = crate::constants::max_relationships_per_memory();
781    let n = entities.len().min(MAX_ENTS);
782    let mut rels: Vec<NewRelationship> = Vec::new();
783    let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
784
785    let mut hit_cap = false;
786    'outer: for i in 0..n {
787        if rels.len() >= max_rels {
788            hit_cap = true;
789            break;
790        }
791
792        let mut for_entity = 0usize;
793        for j in (i + 1)..n {
794            if for_entity >= TOP_K_RELATIONS {
795                break;
796            }
797            if rels.len() >= max_rels {
798                hit_cap = true;
799                break 'outer;
800            }
801
802            let key = (i.min(j), i.max(j));
803            if !seen.insert(key) {
804                continue;
805            }
806
807            rels.push(NewRelationship {
808                // clone needed: NewRelationship requires owned String for source/target
809                source: entities[i].name.clone(),
810                target: entities[j].name.clone(),
811                relation: DEFAULT_RELATION.to_string(),
812                strength: 0.5,
813                description: None,
814            });
815            for_entity += 1;
816        }
817    }
818
819    // v1.0.20: warn when relationships were truncated before covering all possible pairs.
820    if hit_cap {
        tracing::warn!(
            "relationships truncated to {max_rels} ({n} entities, ~{} candidate pairs)",
            n * n.saturating_sub(1) / 2
        );
825    }
826
827    (rels, hit_cap)
828}
829
830/// v1.0.31 A9: build relationships only between entities that actually
831/// co-occur within the same sentence (split on `.`, `!`, `?`, newline).
832///
833/// The legacy `build_relationships` pairs every entity with every other,
834/// yielding a dense C(N,2) graph dominated by spurious "mentions" edges
835/// across unrelated sections. Restricting to sentence-level co-occurrence
836/// keeps the edges semantically meaningful while still respecting the
837/// configurable `max_relationships_per_memory` cap.
838///
839/// Returns `(relationships, truncated)` mirroring `build_relationships`.
840fn build_relationships_by_sentence_cooccurrence(
841    body: &str,
842    entities: &[NewEntity],
843) -> (Vec<NewRelationship>, bool) {
844    if entities.len() < 2 {
845        return (Vec::new(), false);
846    }
847
848    let max_rels = crate::constants::max_relationships_per_memory();
849    let lower_names: Vec<(usize, String)> = entities
850        .iter()
851        .take(MAX_ENTS)
852        .enumerate()
853        .map(|(i, e)| (i, e.name.to_lowercase()))
854        .collect();
855
856    let mut rels: Vec<NewRelationship> = Vec::new();
857    let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
858    let mut hit_cap = false;
859
860    for sentence in body.split(['.', '!', '?', '\n']) {
861        if sentence.trim().is_empty() {
862            continue;
863        }
864        let lower_sentence = sentence.to_lowercase();
865        let present: Vec<usize> = lower_names
866            .iter()
867            .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
868            .map(|(i, _)| *i)
869            .collect();
870
871        if present.len() < 2 {
872            continue;
873        }
874
875        for i in 0..present.len() {
876            for j in (i + 1)..present.len() {
877                if rels.len() >= max_rels {
878                    hit_cap = true;
879                    tracing::warn!(
880                        "relationships truncated to {max_rels} during sentence-level pairing"
881                    );
882                    return (rels, hit_cap);
883                }
884                let ei = present[i];
885                let ej = present[j];
886                let key = (ei.min(ej), ei.max(ej));
887                if seen.insert(key) {
888                    rels.push(NewRelationship {
889                        source: entities[ei].name.clone(),
890                        target: entities[ej].name.clone(),
891                        relation: DEFAULT_RELATION.to_string(),
892                        strength: 0.5,
893                        description: None,
894                    });
895                }
896            }
897        }
898    }
899
900    (rels, hit_cap)
901}
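
// Worked example (a sketch): for the body "Alice met Bob. Carol wrote the docs." with
// entities [Alice, Bob, Carol], only Alice and Bob share a sentence, so a single
// "mentions" edge (Alice, Bob) is emitted; the legacy pairwise builder would have
// produced all C(3,2) = 3 edges regardless of where the names occur.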
902
903/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
904/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
905/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
906/// Conservative: only extends when the suffix is at most 7 characters.
907/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
908/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
909fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
910    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
    // Matches: separator + digits + optional decimal + optional lowercase letter.
    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5". A longer tail such as "-turbo"
    // is never captured; only the leading numeric part matches, and its length is capped below.
913    let suffix_re = SUFFIX_RE.get_or_init(|| {
914        Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
915            .expect("compile-time validated numeric suffix regex literal")
916    });
917
918    entities
919        .into_iter()
920        .map(|ent| {
921            // Finds the first case-sensitive occurrence of the entity in the body
922            if let Some(pos) = body.find(&ent.name) {
923                let after_pos = pos + ent.name.len();
924                if after_pos < body.len() {
925                    let after = &body[after_pos..];
926                    if let Some(m) = suffix_re.find(after) {
927                        let suffix = m.as_str();
928                        // Conservative: cap suffix length to 7 chars to avoid grabbing
929                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
930                        if suffix.len() <= 7 {
931                            let mut extended = String::with_capacity(ent.name.len() + suffix.len());
932                            extended.push_str(&ent.name);
933                            extended.push_str(suffix);
934                            return ExtractedEntity {
935                                name: extended,
936                                entity_type: ent.entity_type,
937                            };
938                        }
939                    }
940                }
941            }
942            ent
943        })
944        .collect()
945}
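
// Illustrative behaviour (a sketch): with entity "GPT" and body "using GPT-4o", the
// suffix "-4o" (3 chars, within the 7-char cap) is appended, yielding "GPT-4o". With
// body "using GPT-3.5-turbo", only "-3.5" matches the suffix pattern, so the entity
// becomes "GPT-3.5"; the "-turbo" tail is never captured.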
946
/// Captures versioned model names that the NER model consistently misses.
///
/// The NER model often classifies tokens like "Claude" or "Llama" as common nouns,
950/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
951/// never sees these candidates and the version suffix gets lost.
952///
953/// This function scans the body with a conservative regex, matching capitalised
954/// words followed by a space-or-hyphen and a small integer. Matches that are not
955/// already covered by an existing entity (case-insensitive) are appended with the
956/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
957/// items downstream.
958///
959/// v1.0.24 P2-D: regex extended to cover:
960/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
961/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
962/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
963///
964/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
965/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
966fn augment_versioned_model_names(
967    entities: Vec<ExtractedEntity>,
968    body: &str,
969) -> Vec<ExtractedEntity> {
970    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
971    // Pattern breakdown:
972    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
973    //   [\s\-]+               — separator: space(s) or hyphen(s)
974    //   \d+(?:\.\d+)?         — version number, optional decimal
975    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
976    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
977    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
978        Regex::new(
979            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
980        )
981        .expect("compile-time validated versioned model regex literal")
982    });
983
984    let mut existing_lc: std::collections::HashSet<String> =
985        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
986    let mut result = entities;
987
988    for caps in model_re.captures_iter(body) {
989        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
990        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
991        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
992        if full_match.is_empty() || full_match.len() > 24 {
993            continue;
994        }
995        let normalized_lc = full_match.to_lowercase();
996        if existing_lc.contains(&normalized_lc) {
997            continue;
998        }
999        // Stop appending once the global entity cap is reached to keep parity with
1000        // `merge_and_deduplicate` truncation semantics.
1001        if result.len() >= MAX_ENTS {
1002            break;
1003        }
1004        existing_lc.insert(normalized_lc);
1005        result.push(ExtractedEntity {
1006            name: full_match.to_string(),
1007            entity_type: EntityType::Concept,
1008        });
1009    }
1010
1011    result
1012}
1013
1014fn merge_and_deduplicate(
1015    regex_ents: Vec<ExtractedEntity>,
1016    ner_ents: Vec<ExtractedEntity>,
1017) -> Vec<ExtractedEntity> {
1018    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
1019    // and is scoped per entity_type. This fixes two bugs from prior versions:
1020    //
1021    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
1022    //    start_with "sonnet", so the pair could survive dedup depending on insertion
1023    //    order. contains() catches both directions unconditionally.
1024    //
1025    // 2. The lookup key omitted entity_type, so "Apple/organization" and
1026    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
1027    //
1028    // Earlier invariants preserved:
1029    // - NFKC normalization before lowercasing (v1.0.24).
1030    // - Longest-wins: on collision keep the entity with the longer name.
1031    // - Truncation warning at MAX_ENTS.
1032    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
1033    let mut result: Vec<ExtractedEntity> = Vec::new();
1034    let mut truncated = false;
1035
1036    let total_input = regex_ents.len() + ner_ents.len();
1037    for ent in regex_ents.into_iter().chain(ner_ents) {
1038        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
1039        // Composite key: entity_type + NUL + normalised lowercase name.
1040        // Collision search is scoped to the same type so that e.g.
1041        // "Apple/organization" and "Apple/concept" are kept separately.
1042        let key = {
1043            let et = ent.entity_type.as_str();
1044            let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
1045            k.push_str(et);
1046            k.push('\0');
1047            k.push_str(&name_lc);
1048            k
1049        };
1050
1051        // Scan stored entries for substring containment within the same type.
1052        // Two names collide when one is a case-insensitive substring of the other:
1053        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
1054        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
1055        let type_prefix = {
1056            let et = ent.entity_type.as_str();
1057            let mut p = String::with_capacity(et.len() + 1);
1058            p.push_str(et);
1059            p.push('\0');
1060            p
1061        };
1062        let mut collision_idx: Option<usize> = None;
1063        for (existing_key, idx) in &by_lc {
1064            // Fast-path: check type prefix matches before scanning the name.
1065            if !existing_key.starts_with(&type_prefix) {
1066                continue;
1067            }
1068            let existing_name_lc = &existing_key[type_prefix.len()..];
1069            if existing_name_lc == name_lc
1070                || existing_name_lc.contains(name_lc.as_str())
1071                || name_lc.contains(existing_name_lc)
1072            {
1073                collision_idx = Some(*idx);
1074                break;
1075            }
1076        }
1077        match collision_idx {
1078            Some(idx) => {
1079                // Replace stored entity only when the new candidate is strictly
1080                // longer; otherwise drop the new one.
1081                if ent.name.len() > result[idx].name.len() {
1082                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1083                    let old_key = {
1084                        let et = result[idx].entity_type.as_str();
1085                        let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
1086                        k.push_str(et);
1087                        k.push('\0');
1088                        k.push_str(&old_name_lc);
1089                        k
1090                    };
1091                    by_lc.remove(&old_key);
1092                    result[idx] = ent;
1093                    by_lc.insert(key, idx);
1094                }
1095            }
1096            None => {
1097                by_lc.insert(key, result.len());
1098                result.push(ent);
1099            }
1100        }
1101        if result.len() >= MAX_ENTS {
1102            truncated = true;
1103            break;
1104        }
1105    }
1106
1107    // v1.0.20: warn when silent truncation discards entities above MAX_ENTS.
1108    if truncated {
1109        tracing::warn!(
1110            "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
1111        );
1112    }
1113
1114    result
1115}
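
// Worked example (a sketch): merging regex ["OpenAI"/organization] with NER
// ["OpenAI GPT"/organization] collides via substring containment and keeps the longer
// "OpenAI GPT" (longest-wins), while ["Apple"/organization] and ["Apple"/concept]
// survive side by side because collisions are scoped per entity type.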
1116
1117fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1118    extracted
1119        .into_iter()
1120        .map(|e| NewEntity {
1121            name: e.name,
1122            entity_type: e.entity_type,
1123            description: None,
1124        })
1125        .collect()
1126}
1127
1128pub fn extract_graph_auto(
1129    body: &str,
1130    paths: &AppPaths,
1131    variant: GlinerVariant,
1132) -> Result<ExtractionResult> {
1133    let regex_entities = apply_regex_prefilter(body);
1134    let threshold = crate::constants::gliner_confidence_threshold();
1135
1136    let mut gliner_used = false;
1137    let ner_entities = match get_or_init_gliner(paths, variant) {
1138        Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1139            Ok(ents) => {
1140                gliner_used = true;
1141                ents
1142            }
1143            Err(e) => {
1144                tracing::warn!("GLiNER NER failed, falling back to regex-only extraction: {e:#}");
1145                Vec::new()
1146            }
1147        },
1148        None => Vec::new(),
1149    };
1150
1151    let merged = merge_and_deduplicate(regex_entities, ner_entities);
1152    let extended = extend_with_numeric_suffix(merged, body);
1153    let with_models = augment_versioned_model_names(extended, body);
1154    let with_models: Vec<ExtractedEntity> = with_models
1155        .into_iter()
1156        .filter(|e| !regex_section_marker().is_match(&e.name))
1157        .collect();
1158    let entities = to_new_entities(with_models);
1159    let (relationships, relationships_truncated) =
1160        build_relationships_by_sentence_cooccurrence(body, &entities);
1161
1162    let extraction_method = if gliner_used {
1163        format!("gliner-{variant}+regex")
1164    } else {
1165        "regex-only".to_string()
1166    };
1167
1168    let urls = extract_urls(body);
1169
1170    Ok(ExtractionResult {
1171        entities,
1172        relationships,
1173        relationships_truncated,
1174        extraction_method,
1175        urls,
1176    })
1177}
1178
1179pub struct RegexExtractor;
1180
1181impl Extractor for RegexExtractor {
1182    fn extract(&self, body: &str) -> Result<ExtractionResult> {
1183        let regex_entities = apply_regex_prefilter(body);
1184        let entities = to_new_entities(regex_entities);
1185        let (relationships, relationships_truncated) =
1186            build_relationships_by_sentence_cooccurrence(body, &entities);
1187        let urls = extract_urls(body);
1188        Ok(ExtractionResult {
1189            entities,
1190            relationships,
1191            relationships_truncated,
1192            extraction_method: "regex-only".to_string(),
1193            urls,
1194        })
1195    }
1196}
1197
1198#[cfg(test)]
1199mod tests {
1200    use super::*;
1201    use crate::entity_type::EntityType;
1202
1203    fn make_paths() -> AppPaths {
1204        use std::path::PathBuf;
1205        AppPaths {
1206            db: PathBuf::from("/tmp/test.sqlite"),
1207            models: PathBuf::from("/tmp/test_models"),
1208        }
1209    }
1210
1211    #[test]
1212    fn regex_email_captures_address() {
1213        let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1214        // v1.0.20: emails are classified as "concept" (regex alone cannot distinguish person from role).
1215        assert!(ents
1216            .iter()
1217            .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1218    }
1219
1220    #[test]
1221    fn regex_all_caps_filters_pt_rule_word() {
1222        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE must not become "entities".
1223        let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1224        assert!(
1225            !ents.iter().any(|e| e.name == "NUNCA"),
1226            "NUNCA must be filtered as a stopword"
1227        );
1228        assert!(
1229            !ents.iter().any(|e| e.name == "PROIBIDO"),
1230            "PROIBIDO must be filtered"
1231        );
1232        assert!(
1233            !ents.iter().any(|e| e.name == "DEVE"),
1234            "DEVE must be filtered"
1235        );
1236    }
1237
1238    #[test]
1239    fn regex_all_caps_accepts_underscored_constant() {
1240        // Technical constants like MAX_RETRY, TIMEOUT_MS must always be accepted.
1241        let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1242        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1243        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1244    }
1245
1246    #[test]
1247    fn regex_all_caps_accepts_domain_acronym() {
1248        // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
1249        let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1250        assert!(ents.iter().any(|e| e.name == "OPENAI"));
1251        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1252    }
1253
1254    #[test]
1255    fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1256        // v1.0.24 P0-2: URLs were removed from apply_regex_prefilter and now go through extract_urls.
1257        let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1258        assert!(
1259            !ents.iter().any(|e| e.name.starts_with("https://")),
1260            "URLs must not appear as entities after the P0-2 split"
1261        );
1262    }
1263
1264    #[test]
1265    fn extract_urls_captures_https() {
1266        let urls = extract_urls("see https://docs.rs/crate for details");
1267        assert_eq!(urls.len(), 1);
1268        assert_eq!(urls[0].url, "https://docs.rs/crate");
1269        assert!(urls[0].offset > 0);
1270    }
1271
1272    #[test]
1273    fn extract_urls_trim_sufixo_pontuacao() {
1274        let urls = extract_urls("link: https://example.com/path. fim");
1275        assert!(!urls.is_empty());
1276        assert!(
1277            !urls[0].url.ends_with('.'),
            "trailing '.' must be trimmed"
1279        );
1280    }
1281
1282    #[test]
1283    fn extract_urls_dedupes_repeated() {
1284        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1285        let urls = extract_urls(body);
        assert_eq!(urls.len(), 1, "repeated URLs must be deduplicated");
1287    }
1288
1289    #[test]
1290    fn regex_uuid_captura_identificador() {
1291        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1292        assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
1293    }
1294
1295    #[test]
1296    fn regex_all_caps_captura_constante() {
1297        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1298        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1299        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1300    }
1301
1302    #[test]
1303    fn regex_all_caps_ignores_short_words() {
1304        let ents = apply_regex_prefilter("use AI em seu projeto");
1305        assert!(
1306            !ents.iter().any(|e| e.name == "AI"),
            "AI is only 2 chars long and must be ignored"
1308        );
1309    }
1310
1311    #[test]
1312    fn build_relationships_respeitam_max_rels() {
1313        let entities: Vec<NewEntity> = (0..20)
1314            .map(|i| NewEntity {
1315                name: format!("entidade_{i}"),
1316                entity_type: EntityType::Concept,
1317                description: None,
1318            })
1319            .collect();
1320        let (rels, truncated) = build_relationships(&entities);
1321        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "must respect max_rels={max_rels}");
1323        if rels.len() == max_rels {
            assert!(truncated, "truncated must be true when the cap was hit");
1325        }
1326    }
1327
1328    #[test]
1329    fn build_relationships_without_duplicates() {
1330        let entities: Vec<NewEntity> = (0..5)
1331            .map(|i| NewEntity {
1332                name: format!("ent_{i}"),
1333                entity_type: EntityType::Concept,
1334                description: None,
1335            })
1336            .collect();
1337        let (rels, _truncated) = build_relationships(&entities);
1338        let mut pares: std::collections::HashSet<(String, String)> =
1339            std::collections::HashSet::new();
1340        for r in &rels {
1341            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "duplicate pair found");
1343        }
1344    }
1345
1346    #[test]
1347    fn merge_dedupes_by_lowercase_name() {
1348        // v1.0.25: collision detection is scoped per entity_type; same name + same type
1349        // must deduplicate to one entry. Different types are kept separately.
1350        let a = vec![ExtractedEntity {
1351            name: "Rust".to_string(),
1352            entity_type: EntityType::Concept,
1353        }];
1354        let b = vec![ExtractedEntity {
1355            name: "rust".to_string(),
1356            entity_type: EntityType::Concept,
1357        }];
1358        let merged = merge_and_deduplicate(a, b);
1359        assert_eq!(
1360            merged.len(),
1361            1,
1362            "rust and Rust with the same type are the same entity"
1363        );
1364    }
1365
1366    #[test]
1367    fn regex_extractor_implements_trait() {
1368        let extractor = RegexExtractor;
1369        let result = extractor
1370            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1371            .unwrap();
1372        assert!(!result.entities.is_empty());
1373    }
1374
1375    #[test]
1376    fn extract_returns_ok_without_model() {
1377        // Without a downloaded model, must return Ok with regex-only entities.
1378        let paths = make_paths();
1379        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1380        let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
1381        assert!(result
1382            .entities
1383            .iter()
1384            .any(|e| e.name.contains("teste@exemplo.com")));
1385    }
1386
1387    #[test]
1388    fn stopwords_filter_v1024_terms() {
1389        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
1390        // by apply_regex_prefilter so they do not appear as entities.
1391        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1392                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1393        let ents = apply_regex_prefilter(body);
1394        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1395        for word in &[
1396            "ACEITE",
1397            "ACK",
1398            "ACL",
1399            "BORDA",
1400            "CHECKLIST",
1401            "COMPLETED",
1402            "CONFIRME",
1403            "DEVEMOS",
1404            "DONE",
1405            "FIXED",
1406            "NEGUE",
1407            "PENDING",
1408            "PLAN",
1409            "PODEMOS",
1410            "RECUSE",
1411            "TOKEN",
1412            "VAMOS",
1413        ] {
1414            assert!(
1415                !names.contains(word),
1416                "v1.0.24 stopword {word} should be filtered but was found in entities"
1417            );
1418        }
1419    }
1420
1421    #[test]
1422    fn dedup_normalizes_unicode_combining_marks() {
1423        // v1.0.24 P1-E: "Caf\u{e9}" (NFC precomposed) and "Cafe\u{301}" (NFD with
1424        // combining acute accent) must deduplicate to a single entity after NFKC
1425        // normalization.
1426        let nfc = vec![ExtractedEntity {
1427            name: "Caf\u{e9}".to_string(),
1428            entity_type: EntityType::Concept,
1429        }];
1430        // Build the NFD form: 'e' followed by combining acute accent U+0301
1431        let nfd_name = "Cafe\u{301}".to_string();
1432        let nfd = vec![ExtractedEntity {
1433            name: nfd_name,
1434            entity_type: EntityType::Concept,
1435        }];
1436        let merged = merge_and_deduplicate(nfc, nfd);
1437        assert_eq!(
1438            merged.len(),
1439            1,
1440            "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1441        );
1442    }
1443
1444    #[test]
1445    fn extraction_method_regex_only_unchanged() {
1446        // RegexExtractor always returns "regex-only" regardless of GLINER_MODEL state.
1447        // This guards against accidentally changing the regex-only fallback string.
1448        let result = RegexExtractor.extract("contact: dev@acme.io").unwrap();
1449        assert_eq!(
1450            result.extraction_method, "regex-only",
1451            "RegexExtractor must return regex-only"
1452        );
1453    }
1454
1455    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
1456
1457    #[test]
1458    fn extend_suffix_pure_numeric_unchanged() {
1459        // Existing behaviour: pure-numeric suffix must still work after P2-E.
1460        let ents = vec![ExtractedEntity {
1461            name: "GPT".to_string(),
1462            entity_type: EntityType::Concept,
1463        }];
1464        let result = extend_with_numeric_suffix(ents, "using GPT-5 in the project");
1465        assert_eq!(
1466            result[0].name, "GPT-5",
1467            "purely numeric suffix must be extended"
1468        );
1469    }
1470
1471    #[test]
1472    fn extend_suffix_alphanumeric_letter_after_digit() {
1473        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
1474        let ents = vec![ExtractedEntity {
1475            name: "GPT".to_string(),
1476            entity_type: EntityType::Concept,
1477        }];
1478        let result = extend_with_numeric_suffix(ents, "using GPT-4o for advanced tasks");
1479        assert_eq!(result[0].name, "GPT-4o", "suffix '4o' must be accepted");
1480    }
1481
1482    #[test]
1483    fn extend_suffix_alphanumeric_b_suffix() {
1484        // P2-E: "5b" suffix (digit + 'b') must be captured.
1485        let ents = vec![ExtractedEntity {
1486            name: "Llama".to_string(),
1487            entity_type: EntityType::Concept,
1488        }];
1489        let result = extend_with_numeric_suffix(ents, "Llama-5b open-weight model");
1490        assert_eq!(result[0].name, "Llama-5b", "suffix '5b' must be accepted");
1491    }
1492
1493    #[test]
1494    fn extend_suffix_alphanumeric_x_suffix() {
1495        // P2-E: "8x" suffix (digit + 'x') must be captured.
1496        let ents = vec![ExtractedEntity {
1497            name: "Mistral".to_string(),
1498            entity_type: EntityType::Concept,
1499        }];
1500        let result = extend_with_numeric_suffix(ents, "testing Mistral-8x in production");
1501        assert_eq!(result[0].name, "Mistral-8x", "suffix '8x' must be accepted");
1502    }
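
    // Hedged sketch of the suffix shape exercised by the four P2-E tests above: a dash,
    // digits, then at most one trailing letter. The pattern inside extend_with_numeric_suffix
    // may differ; this only documents the assumed accept set.
    #[test]
    fn numeric_suffix_pattern_sketch() {
        use regex::Regex;
        let re = Regex::new(r"^-\d+[A-Za-z]?$").unwrap();
        for accepted in ["-5", "-4o", "-5b", "-8x"] {
            assert!(re.is_match(accepted), "suffix {accepted} should match the sketch");
        }
        assert!(!re.is_match("-beta"), "letter-only suffix is outside the sketch");
    }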
1503
1504    // --- P2-D: augment_versioned_model_names extended regex ---
1505
1506    #[test]
1507    fn augment_versioned_gpt4o() {
1508        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1509        let result = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
1510        assert!(
1511            result.iter().any(|e| e.name == "GPT-4o"),
1512            "GPT-4o must be captured by augment, found: {:?}",
1513            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1514        );
1515    }
1516
1517    #[test]
1518    fn augment_versioned_claude_4_sonnet() {
1519        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1520        let result =
1521            augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
1522        assert!(
1523            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1524            "Claude 4 Sonnet must be captured, found: {:?}",
1525            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1526        );
1527    }
1528
1529    #[test]
1530    fn augment_versioned_llama_3_pro() {
1531        // P2-D: "Llama 3 Pro" must be captured with release tier.
1532        let result =
1533            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1534        assert!(
1535            result.iter().any(|e| e.name == "Llama 3 Pro"),
1536            "Llama 3 Pro must be captured, found: {:?}",
1537            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1538        );
1539    }
1540
1541    #[test]
1542    fn augment_versioned_mixtral_8x7b() {
1543        // P2-D: "Mixtral 8x7B" composite version must be captured.
1544        let result =
1545            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1546        assert!(
1547            result.iter().any(|e| e.name == "Mixtral 8x7B"),
1548            "Mixtral 8x7B must be captured, found: {:?}",
1549            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1550        );
1551    }
1552
1553    #[test]
1554    fn augment_versioned_does_not_duplicate_existing() {
1555        // P2-D back-compat: entities already present must not be duplicated.
1556        let existing = vec![ExtractedEntity {
1557            name: "Claude 4".to_string(),
1558            entity_type: EntityType::Concept,
1559        }];
1560        let result = augment_versioned_model_names(existing, "using Claude 4 in the project");
1561        let count = result.iter().filter(|e| e.name == "Claude 4").count();
1562        assert_eq!(count, 1, "Claude 4 must not be duplicated");
1563    }
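
    // Hedged sketch of a pattern consistent with the P2-D tests above: a capitalised model
    // name, a space or dash, a numeric or composite version, and an optional release tier.
    // augment_versioned_model_names may use a different regex and a longer tier list; the
    // tiers named here are only the ones the tests exercise.
    #[test]
    fn versioned_model_pattern_sketch() {
        use regex::Regex;
        let re =
            Regex::new(r"\b[A-Z][A-Za-z]*[ -]\d+(?:x\d+)?[A-Za-z]*(?: (?:Sonnet|Pro))?\b").unwrap();
        for (text, expected) in [
            ("using GPT-4o for analysis", "GPT-4o"),
            ("best model: Claude 4 Sonnet released today", "Claude 4 Sonnet"),
            ("running Mixtral 8x7B on the local server", "Mixtral 8x7B"),
            ("fine-tuning Llama 3 Pro locally", "Llama 3 Pro"),
        ] {
            let found = re.find(text).map(|m| m.as_str());
            assert_eq!(found, Some(expected), "sketch pattern mismatch for {text:?}");
        }
    }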
1564
1565    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1566
1567    #[test]
1568    fn stopwords_filter_url_jwt_api_v1025() {
1569        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1570        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1571        let ents = apply_regex_prefilter(body);
1572        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1573        for blocked in &[
1574            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1575        ] {
1576            assert!(
1577                !names.contains(blocked),
1578                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1579            );
1580        }
1581    }
1582
1583    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1584
1585    #[test]
1586    fn section_markers_etapa_fase_filtered_v1025() {
1587        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1588        // Body intentionally uses PT-BR section keywords (Etapa/Fase/Migra\u{e7}\u{e3}o) to
1589        // exercise the PT-BR section-marker filter. ASCII-escaped per the project policy.
1590        let body = "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.";
1591        let ents = apply_regex_prefilter(body);
1592        assert!(
1593            !ents
1594                .iter()
1595                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1596            "section markers must be stripped; entities: {:?}",
1597            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1598        );
1599    }
1600
1601    #[test]
1602    fn section_markers_passo_secao_filtered_v1025() {
1603        // PT-BR keywords Passo/Se\u{e7}\u{e3}o written with Unicode escapes per the
1604        // project language policy.
1605        let body = "Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.";
1606        let ents = apply_regex_prefilter(body);
1607        assert!(
1608            !ents
1609                .iter()
1610                .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o")),
1611            "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
1612            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1613        );
1614    }
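
    // Hedged sketch of the section-marker shape the two tests above expect to be stripped:
    // a PT-BR section label followed by a number. The production REGEX_SECTION_MARKER may be
    // broader; PT-BR letters are written as Unicode escapes per the project policy.
    #[test]
    fn section_marker_pattern_sketch() {
        use regex::Regex;
        let re = Regex::new(
            "(?i)\\b(?:Etapa|Fase|Passo|Se\u{e7}\u{e3}o|Cap\u{ed}tulo)\\s+\\d+\\b",
        )
        .unwrap();
        assert!(re.is_match("Etapa 3 do plano"));
        assert!(re.is_match("conforme Se\u{e7}\u{e3}o 3 do manual"));
        assert!(
            !re.is_match("Fase de testes"),
            "a label without a trailing number is not a section marker"
        );
    }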
1615
1616    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1617
1618    #[test]
1619    fn brand_camelcase_extracted_as_organization_v1025() {
1620        // "OpenAI" is a CamelCase brand that the NER model often misses.
1621        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1622        let ents = apply_regex_prefilter(body);
1623        let openai = ents.iter().find(|e| e.name == "OpenAI");
1624        assert!(
1625            openai.is_some(),
1626            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1627            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1628        );
1629        assert_eq!(
1630            openai.unwrap().entity_type,
1631            EntityType::Organization,
1632            "brand CamelCase must map to organization (V008)"
1633        );
1634    }
1635
1636    #[test]
1637    fn brand_postgresql_extracted_as_organization_v1025() {
1638        let body = "migrating from MySQL to PostgreSQL for better performance.";
1639        let ents = apply_regex_prefilter(body);
1640        assert!(
1641            ents.iter()
1642                .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization),
1643            "PostgreSQL must be extracted as organization; entities: {:?}",
1644            ents.iter()
1645                .map(|e| (&e.name, &e.entity_type))
1646                .collect::<Vec<_>>()
1647        );
1648    }
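
    // Hedged sketch of the CamelCase shape the two brand tests above rely on: an internal
    // uppercase letter after a lowercase run, as in "OpenAI" or "PostgreSQL". The production
    // REGEX_BRAND_CAMEL may add extra guards (length, stoplists); this is illustrative only.
    #[test]
    fn camelcase_brand_pattern_sketch() {
        use regex::Regex;
        let re = Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]*\b").unwrap();
        assert!(re.is_match("OpenAI"));
        assert!(re.is_match("PostgreSQL"));
        assert!(
            !re.is_match("Postgres"),
            "no internal uppercase, so the CamelCase sketch does not fire"
        );
    }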
1649
1650    // --- P0-3 longest-wins v1.0.25 ---
1651
1652    fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1653        ExtractedEntity {
1654            name: name.to_string(),
1655            entity_type,
1656        }
1657    }
1658
1659    #[test]
1660    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1661        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1662        let regex = vec![entity("Sonne", EntityType::Concept)];
1663        let ner = vec![entity("Sonnet", EntityType::Concept)];
1664        let result = merge_and_deduplicate(regex, ner);
1665        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1666        assert_eq!(result[0].name, "Sonnet");
1667    }
1668
1669    #[test]
1670    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1671        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1672        let regex = vec![
1673            entity("Open", EntityType::Organization),
1674            entity("OpenAI", EntityType::Organization),
1675        ];
1676        let result = merge_and_deduplicate(regex, vec![]);
1677        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1678        assert_eq!(result[0].name, "OpenAI");
1679    }
1680
1681    #[test]
1682    fn merge_keeps_both_when_no_containment_v1025() {
1683        // "Alice" and "Bob" share no containment — both must be preserved.
1684        let regex = vec![
1685            entity("Alice", EntityType::Person),
1686            entity("Bob", EntityType::Person),
1687        ];
1688        let result = merge_and_deduplicate(regex, vec![]);
1689        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1690    }
1691
1692    #[test]
1693    fn merge_respects_entity_type_boundary_v1025() {
1694        // Same name "Apple" but different types: both must survive independently.
1695        let regex = vec![
1696            entity("Apple", EntityType::Organization),
1697            entity("Apple", EntityType::Concept),
1698        ];
1699        let result = merge_and_deduplicate(regex, vec![]);
1700        assert_eq!(
1701            result.len(),
1702            2,
1703            "expected 2 entities (different types), got: {result:?}"
1704        );
1705    }
1706
1707    #[test]
1708    fn merge_case_insensitive_dedup_v1025() {
1709        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1710        let regex = vec![
1711            entity("OpenAI", EntityType::Organization),
1712            entity("openai", EntityType::Organization),
1713        ];
1714        let result = merge_and_deduplicate(regex, vec![]);
1715        assert_eq!(
1716            result.len(),
1717            1,
1718            "expected 1 entity after case-insensitive dedup, got: {result:?}"
1719        );
1720    }
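
    // Hedged sketch of the longest-wins rule the merge tests above pin down: within the same
    // entity type, a name that is a case-insensitive substring of a longer surviving name is
    // absorbed by it. merge_and_deduplicate may implement this differently; only the
    // observable contract is restated here.
    #[test]
    fn longest_wins_containment_sketch() {
        let absorbed = |short: &str, long: &str| {
            short.len() < long.len() && long.to_lowercase().contains(&short.to_lowercase())
        };
        assert!(absorbed("Sonne", "Sonnet"));
        assert!(absorbed("Open", "OpenAI"));
        assert!(!absorbed("Alice", "Bob"), "no containment, both names survive");
    }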
1721
1722    // ── v1.0.31 A1: NER cap protects against pathological body sizes ──
1723
1724    #[test]
1725    fn extract_graph_auto_handles_large_body_under_30s() {
1726        // Regression guard for the v1.0.31 A1 fix. An 80 KB body without real
1727        // entities must complete in under 30 s; before the cap it took 5+ minutes.
1728        let body = "x ".repeat(40_000);
1729        let paths = make_paths();
1730        let start = std::time::Instant::now();
1731        let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
1732            .expect("extraction must not error");
1733        let elapsed = start.elapsed();
1734        assert!(
1735            elapsed.as_secs() < 30,
1736            "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
1737            elapsed.as_secs()
1738        );
1739        // No real entities expected in synthetic body, but the call must succeed.
1740        let _ = result.entities;
1741    }
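
    // Hedged sketch of the A1 idea the timing guard above protects: bound the text handed to
    // NER by a token budget so pathological bodies stay cheap. The real cap is applied inside
    // extract_graph_auto (presumably tied to the extraction_max_tokens budget tested further
    // below); cap_tokens here is a hypothetical helper for illustration only.
    #[test]
    fn ner_input_cap_sketch() {
        fn cap_tokens(body: &str, max_tokens: usize) -> String {
            body.split_whitespace()
                .take(max_tokens)
                .collect::<Vec<_>>()
                .join(" ")
        }
        let body = "x ".repeat(40_000);
        let capped = cap_tokens(&body, 5_000);
        assert_eq!(capped.split_whitespace().count(), 5_000);
    }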
1742
1743    // ── v1.0.31 A11: PT-BR uppercase noise must not leak as entities ──
1744
1745    #[test]
1746    fn pt_uppercase_stopwords_filtered_v1031() {
1747        let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
1748                    SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
1749        let ents = apply_regex_prefilter(body);
1750        let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
1751        for stop in &[
1752            "ADAPTER",
1753            "PROJETO",
1754            "PASSIVA",
1755            "SOMENTE",
1756            "LEITURA",
1757            "REGRA",
1758            "OBRIGATORIA",
1759            "EXEMPLO",
1760            "DEFAULT",
1761        ] {
1762            assert!(
1763                !names.contains(&stop.to_string()),
1764                "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
1765            );
1766        }
1767    }
1768
1769    #[test]
1770    fn pt_underscored_identifier_preserved_v1031() {
1771        // Identifiers with underscore must still pass through (FLOWAIPER_API_KEY,
1772        // MAX_RETRY etc. are intentional entities, not noise).
1773        let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
1774        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1775        assert!(names.contains(&"FLOWAIPER_API_KEY"));
1776        assert!(names.contains(&"MAX_TIMEOUT"));
1777    }
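
    // Hedged sketch of the stopword contract the two tests above check: an ALL-CAPS token is
    // dropped when it sits in the stoplist, while tokens containing '_' are kept as
    // intentional identifiers. The real filter lives in apply_regex_prefilter and uses the
    // full ALL_CAPS_STOPWORDS table; the three-word list here is illustrative only.
    #[test]
    fn all_caps_stopword_rule_sketch() {
        let stoplist = ["TOKEN", "PROJETO", "DEFAULT"];
        let keep = |token: &str| token.contains('_') || !stoplist.contains(&token);
        assert!(!keep("TOKEN"), "plain stopword is dropped");
        assert!(!keep("PROJETO"), "v1.0.31 PT-BR uppercase noise is dropped");
        assert!(keep("MAX_TIMEOUT"), "underscored identifier is kept");
        assert!(keep("FLOWAIPER_API_KEY"), "underscored identifier is kept");
    }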
1778
1779    // ── v1.0.31 A9: relationships only between entities co-occurring in same sentence ──
1780
1781    #[test]
1782    fn build_relationships_by_sentence_only_links_co_occurring_entities() {
1783        let body = "Alice met Bob at the conference. Carol works alone in another room.";
1784        let entities = vec![
1785            NewEntity {
1786                name: "Alice".to_string(),
1787                entity_type: EntityType::Person,
1788                description: None,
1789            },
1790            NewEntity {
1791                name: "Bob".to_string(),
1792                entity_type: EntityType::Person,
1793                description: None,
1794            },
1795            NewEntity {
1796                name: "Carol".to_string(),
1797                entity_type: EntityType::Person,
1798                description: None,
1799            },
1800        ];
1801        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1802        assert!(!truncated);
1803        assert_eq!(
1804            rels.len(),
1805            1,
1806            "only Alice/Bob should pair (same sentence); Carol is isolated"
1807        );
1808        let pair = (rels[0].source.as_str(), rels[0].target.as_str());
1809        assert!(
1810            matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
1811            "unexpected pair {pair:?}"
1812        );
1813    }
1814
1815    #[test]
1816    fn build_relationships_by_sentence_returns_empty_for_single_entity() {
1817        let body = "Alice is here.";
1818        let entities = vec![NewEntity {
1819            name: "Alice".to_string(),
1820            entity_type: EntityType::Person,
1821            description: None,
1822        }];
1823        let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1824        assert!(rels.is_empty());
1825        assert!(!truncated);
1826    }
1827
1828    #[test]
1829    fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
1830        let body = "Alice met Bob. Bob saw Alice again.";
1831        let entities = vec![
1832            NewEntity {
1833                name: "Alice".to_string(),
1834                entity_type: EntityType::Person,
1835                description: None,
1836            },
1837            NewEntity {
1838                name: "Bob".to_string(),
1839                entity_type: EntityType::Person,
1840                description: None,
1841            },
1842        ];
1843        let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);
1844        assert_eq!(
1845            rels.len(),
1846            1,
1847            "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
1848        );
1849    }
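
    // Hedged sketch of the pairing rule the three tests above describe: split the body on
    // sentence terminators and emit each unordered pair of entity names co-occurring in a
    // sentence, deduplicated across sentences. The production
    // build_relationships_by_sentence_cooccurrence additionally reports a truncation flag and
    // matches names more carefully; only the pairing idea is sketched here.
    #[test]
    fn sentence_cooccurrence_pairing_sketch() {
        use std::collections::BTreeSet;
        let body = "Alice met Bob. Carol works alone. Bob saw Alice again.";
        let names = ["Alice", "Bob", "Carol"];
        let mut pairs: BTreeSet<(&str, &str)> = BTreeSet::new();
        for sentence in body.split(&['.', '!', '?'][..]) {
            let present: Vec<&str> = names
                .iter()
                .copied()
                .filter(|n| sentence.contains(n))
                .collect();
            for i in 0..present.len() {
                for j in (i + 1)..present.len() {
                    // Sort the pair so (Alice, Bob) and (Bob, Alice) dedupe to one entry.
                    let (a, b) = (present[i].min(present[j]), present[i].max(present[j]));
                    pairs.insert((a, b));
                }
            }
        }
        assert_eq!(pairs.len(), 1, "Alice/Bob once, Carol isolated");
        assert!(pairs.contains(&("Alice", "Bob")));
    }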
1850
1851    #[test]
1852    fn extraction_max_tokens_default_is_5000() {
1853        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1854        assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
1855    }
1856
1857    #[test]
1858    fn extraction_max_tokens_env_override_clamped() {
1859        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200");
1860        assert_eq!(
1861            crate::constants::extraction_max_tokens(),
1862            5_000,
1863            "value below 512 must fall back to default"
1864        );
1865
1866        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200000");
1867        assert_eq!(
1868            crate::constants::extraction_max_tokens(),
1869            5_000,
1870            "value above 100_000 must fall back to default"
1871        );
1872
1873        std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "8000");
1874        assert_eq!(
1875            crate::constants::extraction_max_tokens(),
1876            8_000,
1877            "valid value must be honoured"
1878        );
1879
1880        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1881    }
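
    // Hedged sketch of the clamping contract the two tests above establish: an override
    // outside the [512, 100_000] window falls back to the 5_000 default, otherwise it is
    // honoured. crate::constants::extraction_max_tokens reads the environment itself; the
    // closure here only restates the expected mapping.
    #[test]
    fn extraction_max_tokens_clamp_sketch() {
        let resolve = |requested: Option<usize>| match requested {
            Some(v) if (512..=100_000).contains(&v) => v,
            _ => 5_000,
        };
        assert_eq!(resolve(Some(200)), 5_000);
        assert_eq!(resolve(Some(200_000)), 5_000);
        assert_eq!(resolve(Some(8_000)), 8_000);
        assert_eq!(resolve(None), 5_000);
    }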
1882
1883    #[test]
1884    fn gliner_variant_from_str_valid() {
1885        assert_eq!(
1886            "fp32".parse::<GlinerVariant>().unwrap(),
1887            GlinerVariant::Fp32
1888        );
1889        assert_eq!(
1890            "fp16".parse::<GlinerVariant>().unwrap(),
1891            GlinerVariant::Fp16
1892        );
1893        assert_eq!(
1894            "int8".parse::<GlinerVariant>().unwrap(),
1895            GlinerVariant::Int8
1896        );
1897        assert_eq!("q4".parse::<GlinerVariant>().unwrap(), GlinerVariant::Q4);
1898        assert_eq!(
1899            "q4f16".parse::<GlinerVariant>().unwrap(),
1900            GlinerVariant::Q4f16
1901        );
1902        // Case-insensitive
1903        assert_eq!(
1904            "FP32".parse::<GlinerVariant>().unwrap(),
1905            GlinerVariant::Fp32
1906        );
1907        assert_eq!(
1908            "INT8".parse::<GlinerVariant>().unwrap(),
1909            GlinerVariant::Int8
1910        );
1911    }
1912
1913    #[test]
1914    fn gliner_variant_from_str_invalid() {
1915        assert!("invalid".parse::<GlinerVariant>().is_err());
1916        assert!("fp64".parse::<GlinerVariant>().is_err());
1917        assert!("".parse::<GlinerVariant>().is_err());
1918    }
1919
1920    #[test]
1921    fn gliner_variant_filename_mapping() {
1922        assert_eq!(GlinerVariant::Fp32.as_filename(), "model.onnx");
1923        assert_eq!(GlinerVariant::Fp16.as_filename(), "model_fp16.onnx");
1924        assert_eq!(GlinerVariant::Int8.as_filename(), "model_quantized.onnx");
1925        assert_eq!(GlinerVariant::Q4.as_filename(), "model_q4.onnx");
1926        assert_eq!(GlinerVariant::Q4f16.as_filename(), "model_q4f16.onnx");
1927    }
1928
1929    #[test]
1930    fn gliner_variant_display() {
1931        assert_eq!(format!("{}", GlinerVariant::Fp32), "fp32");
1932        assert_eq!(format!("{}", GlinerVariant::Fp16), "fp16");
1933        assert_eq!(format!("{}", GlinerVariant::Int8), "int8");
1934        assert_eq!(format!("{}", GlinerVariant::Q4), "q4");
1935        assert_eq!(format!("{}", GlinerVariant::Q4f16), "q4f16");
1936    }
1937
1938    #[test]
1939    fn gliner_variant_display_size() {
1940        assert_eq!(GlinerVariant::Fp32.display_size(), "1.1 GB");
1941        assert_eq!(GlinerVariant::Int8.display_size(), "349 MB");
1942    }
1943
1944    #[test]
1945    fn gliner_entity_labels_covers_all_types() {
1946        let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
1947        assert!(label_types.contains(&EntityType::Person));
1948        assert!(label_types.contains(&EntityType::Organization));
1949        assert!(label_types.contains(&EntityType::Location));
1950        assert!(label_types.contains(&EntityType::Date));
1951        assert!(label_types.contains(&EntityType::Project));
1952        assert!(label_types.contains(&EntityType::Tool));
1953        assert!(label_types.contains(&EntityType::File));
1954        assert!(label_types.contains(&EntityType::Concept));
1955        assert!(label_types.contains(&EntityType::Decision));
1956        assert!(label_types.contains(&EntityType::Incident));
1957        assert!(label_types.contains(&EntityType::Dashboard));
1958        assert!(label_types.contains(&EntityType::IssueTracker));
1959        assert!(label_types.contains(&EntityType::Memory));
1960        assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
1961    }
1962
1963    #[test]
1964    fn gliner_entity_labels_no_duplicates() {
1965        let mut seen = std::collections::HashSet::new();
1966        for (label, _) in GLINER_ENTITY_LABELS {
1967            assert!(seen.insert(*label), "duplicate label: {label}");
1968        }
1969    }
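
    // Hedged sketch of how the label table is assumed to be consumed: the model emits a label
    // string and the pipeline maps it back to an EntityType via GLINER_ENTITY_LABELS. The real
    // lookup lives in the GLiNER post-processing code and may differ in detail.
    #[test]
    fn gliner_label_lookup_sketch() {
        let lookup = |label: &str| {
            GLINER_ENTITY_LABELS
                .iter()
                .find(|entry| entry.0 == label)
                .map(|entry| entry.1)
        };
        let (first_label, first_type) = GLINER_ENTITY_LABELS[0];
        assert_eq!(lookup(first_label), Some(first_type));
        assert_eq!(lookup("definitely-not-a-label"), None);
    }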
1970
1971    #[test]
1972    fn extract_graph_auto_regex_only_fallback() {
1973        // extract_graph_auto must succeed and capture regex entities regardless of whether
1974        // GLiNER model files exist in the test environment (GLINER_MODEL is a global OnceLock
1975        // that may already be initialised by a sibling test, so we cannot assert on
1976        // extraction_method; use RegexExtractor for that invariant).
1977        let result = extract_graph_auto(
1978            "Contact someone@test.com about OPENAI project",
1979            &make_paths(),
1980            GlinerVariant::Fp32,
1981        );
1982        assert!(result.is_ok());
1983        let res = result.unwrap();
1984        // Regex prefilter must always capture the email entity
1985        assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
1986        // extraction_method must be "regex-only" or a "gliner-*" variant
1987        assert!(
1988            res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-"),
1989            "unexpected extraction_method: {}",
1990            res.extraction_method
1991        );
1992    }
1993
1994    #[test]
1995    fn gliner_variant_roundtrip() {
1996        for variant in &[
1997            GlinerVariant::Fp32,
1998            GlinerVariant::Fp16,
1999            GlinerVariant::Int8,
2000            GlinerVariant::Q4,
2001            GlinerVariant::Q4f16,
2002        ] {
2003            let s = format!("{variant}");
2004            let parsed: GlinerVariant = s.parse().unwrap();
2005            assert_eq!(*variant, parsed);
2006        }
2007    }
2008}