Skip to main content

sqlite_graphrag/
extraction.rs

1//! Entity and URL extraction pipeline (NER + regex prefilter).
2//!
3//! Runs named-entity recognition and regex heuristics to extract structured
4//! entities and hyperlinks from raw memory bodies before embedding.
5
6use std::collections::HashMap;
7use std::path::{Path, PathBuf};
8use std::sync::OnceLock;
9
10use anyhow::{Context, Result};
11use candle_core::{DType, Device, Tensor};
12use candle_nn::{Linear, Module, VarBuilder};
13use candle_transformers::models::bert::{BertModel, Config as BertConfig};
14use regex::Regex;
15use serde::Deserialize;
16use unicode_normalization::UnicodeNormalization;
17
18use crate::paths::AppPaths;
19use crate::storage::entities::{NewEntity, NewRelationship};
20
/// HuggingFace Hub repo id for the multilingual NER model.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Tokens per sliding window fed to BERT in one forward pass.
const MAX_SEQ_LEN: usize = 512;
/// Sliding-window step; 256 on a 512 window gives 50% overlap so entities
/// spanning a window boundary are seen whole in the next window.
const STRIDE: usize = 256;
/// Maximum number of entities considered when building pairwise relationships.
const MAX_ENTS: usize = 30;
/// Maximum relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for co-occurrence edges.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum entity-name length (intended in characters) to keep an entity.
const MIN_ENTITY_CHARS: usize = 2;
28
// Lazily-compiled regexes shared process-wide (each compiled at most once).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-2: captures CamelCase brand names that BERT NER often misses (e.g. "OpenAI", "PostgreSQL").
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
37
// v1.0.20: stopwords filtering common PT-BR/EN rule-words captured as ALL_CAPS.
// Without this filter, PT-BR technical corpora containing rules formatted in CAPS
// (NUNCA, PROIBIDO, DEVE) produced ~70% junk "entities". Identifiers like
// MAX_RETRY (with underscore) are kept.
// v1.0.22: list expanded with terms observed in a 495-file flowaiper stress test.
// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns
// (BANCO, CASO), HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
// v1.0.24: added 17 new terms observed in the v1.0.23 audit: generic status words
// (COMPLETED, DONE, FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME,
// NEGUE, RECUSE), PT-BR modal/common verbs (DEVEMOS, PODEMOS, VAMOS), generic
// nouns (BORDA, CHECKLIST, PLAN, TOKEN), and common abbreviations (ACK, ACL).
// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT,
// LLM, REST, UI, URL) and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE,
// PASSO, SEÇÃO) to prevent section markers and generic tech terms from being
// extracted as entities.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "API",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BORDA",
    "BLOQUEAR",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CHECKLIST",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRME",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FIXED",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LLM",
    "MUST",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PASSO",
    "PENDING",
    "PLAN",
    "PODEMOS",
    "PROIBIDO",
    "RECUSE",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
148
// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps) and iob_to_entities (single-token).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
154
155fn is_filtered_all_caps(token: &str) -> bool {
156    // Identificadores com underscore são preservados (ex: MAX_RETRY, FLOWAIPER_API_KEY)
157    let is_identifier = token.contains('_');
158    if is_identifier {
159        return false;
160    }
161    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
162}
163
164fn regex_email() -> &'static Regex {
165    REGEX_EMAIL
166        .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
167}
168
169fn regex_url() -> &'static Regex {
170    REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
171}
172
173fn regex_uuid() -> &'static Regex {
174    REGEX_UUID.get_or_init(|| {
175        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
176            .unwrap()
177    })
178}
179
180fn regex_all_caps() -> &'static Regex {
181    REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
182}
183
184fn regex_section_marker() -> &'static Regex {
185    REGEX_SECTION_MARKER.get_or_init(|| {
186        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1", etc.
187        Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
188    })
189}
190
191fn regex_brand_camel() -> &'static Regex {
192    REGEX_BRAND_CAMEL.get_or_init(|| {
193        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
194        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
195        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
196    })
197}
198
/// A single entity candidate (name + type) produced by the NER or regex passes.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Trimmed surface form as found in the text.
    pub name: String,
    /// Entity type label — the code assigns "person", "organization",
    /// "location", "date", "tool" and "concept" (see iob_to_entities and
    /// apply_regex_prefilter); other raw BIO types may pass through.
    pub entity_type: String,
}
204
/// URL with source offset extracted from the memory body.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The matched URL text with trailing punctuation trimmed (see extract_urls).
    pub url: String,
    /// Byte position in the body where the URL was found.
    pub offset: usize,
}
212
/// Aggregate output of the extraction pipeline for one memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities extracted by NER and/or regex passes.
    pub entities: Vec<NewEntity>,
    /// Co-occurrence relationships between the extracted entities.
    pub relationships: Vec<NewRelationship>,
    /// True when build_relationships hit the cap before covering all entity pairs.
    /// Exposed in RememberResponse so callers can detect when relationships were cut.
    pub relationships_truncated: bool,
    /// Extraction method used: "bert+regex" or "regex-only".
    /// Useful for auditing, metrics and user reports.
    pub extraction_method: String,
    /// URLs extracted from the body — stored separately from graph entities.
    pub urls: Vec<ExtractedUrl>,
}
226
/// Extraction strategy interface. `Send + Sync` so a single extractor instance
/// can be shared across threads.
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships and URLs from a raw memory body.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
230
/// Minimal view of the model's HuggingFace `config.json` — only the fields
/// this module needs.
#[derive(Deserialize)]
struct ModelConfig {
    /// Label-index → tag-name map; keys are stringified integers in the JSON.
    #[serde(default)]
    id2label: HashMap<String, String>,
    /// Transformer hidden dimension; sizes the classifier-head weight tensor.
    hidden_size: usize,
}
237
/// BERT-based NER model: encoder plus a linear token-classification head.
struct BertNerModel {
    /// BERT encoder loaded from the mmapped safetensors file.
    bert: BertModel,
    /// Token-classification head mapping hidden states to per-label logits.
    classifier: Linear,
    /// Inference device (always Device::Cpu in this module).
    device: Device,
    /// Label index → IOB tag name (e.g. "B-PER", "I-ORG", "O").
    id2label: HashMap<usize, String>,
}
244
245impl BertNerModel {
246    fn load(model_dir: &Path) -> Result<Self> {
247        let config_path = model_dir.join("config.json");
248        let weights_path = model_dir.join("model.safetensors");
249
250        let config_str = std::fs::read_to_string(&config_path)
251            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
252        let model_cfg: ModelConfig =
253            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
254
255        let id2label: HashMap<usize, String> = model_cfg
256            .id2label
257            .into_iter()
258            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
259            .collect();
260
261        let num_labels = id2label.len().max(9);
262        let hidden_size = model_cfg.hidden_size;
263
264        let bert_config_str = std::fs::read_to_string(&config_path)
265            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
266        let bert_cfg: BertConfig =
267            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
268
269        let device = Device::Cpu;
270
271        // SAFETY: VarBuilder::from_mmaped_safetensors requires unsafe because it relies on
272        // memory-mapping the weights file. Soundness assumptions:
273        // 1. The file at `weights_path` is not concurrently modified during model lifetime
274        //    (we only read; the cache directory is owned by the current process via 0600 perms).
275        // 2. The mmaped region remains valid for the lifetime of the `VarBuilder` and any
276        //    derived tensors (enforced by candle's internal lifetime tracking).
277        // 3. The safetensors format is well-formed (verified by candle's parser before mmap).
278        let vb = unsafe {
279            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
280                .with_context(|| format!("mapeando {weights_path:?}"))?
281        };
282        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
283
284        // v1.0.20 fix P0 secundário: carregar classifier head do safetensors em vez de zeros.
285        // Em v1.0.19 usávamos Tensor::zeros, o que produzia argmax constante e inferência degenerada.
286        let cls_vb = vb.pp("classifier");
287        let weight = cls_vb
288            .get((num_labels, hidden_size), "weight")
289            .context("carregando classifier.weight do safetensors")?;
290        let bias = cls_vb
291            .get(num_labels, "bias")
292            .context("carregando classifier.bias do safetensors")?;
293        let classifier = Linear::new(weight, Some(bias));
294
295        Ok(Self {
296            bert,
297            classifier,
298            device,
299            id2label,
300        })
301    }
302
303    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
304        let len = token_ids.len();
305        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
306        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
307
308        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
309            .context("criando tensor input_ids")?;
310        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
311            .context("criando tensor token_type_ids")?;
312        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
313            .context("criando tensor attention_mask")?;
314
315        let sequence_output = self
316            .bert
317            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
318            .context("forward pass do BertModel")?;
319
320        let logits = self
321            .classifier
322            .forward(&sequence_output)
323            .context("forward pass do classificador")?;
324
325        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
326
327        let num_tokens = logits_2d.dim(0).context("dim(0)")?;
328
329        let mut labels = Vec::with_capacity(num_tokens);
330        for i in 0..num_tokens {
331            let token_logits = logits_2d.get(i).context("get token logits")?;
332            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
333            let argmax = vec
334                .iter()
335                .enumerate()
336                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
337                .map(|(idx, _)| idx)
338                .unwrap_or(0);
339            let label = self
340                .id2label
341                .get(&argmax)
342                .cloned()
343                .unwrap_or_else(|| "O".to_string());
344            labels.push(label);
345        }
346
347        Ok(labels)
348    }
349
350    /// Run a batched forward pass over multiple tokenised windows at once.
351    ///
352    /// Windows are padded on the right with token_id=0 and attention_mask=0 to
353    /// the length of the longest window in the batch.  The attention mask ensures
354    /// BERT ignores padded positions (bert.rs:515-528 adds -3.4e38 before softmax).
355    ///
356    /// Returns one label vector per window, each of length equal to that window's
357    /// original (pre-padding) token count.
358    fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
359        let batch_size = windows.len();
360        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
361        if max_len == 0 {
362            return Ok(vec![vec![]; batch_size]);
363        }
364
365        let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
366        let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);
367
368        for (ids, _) in windows {
369            let len = ids.len();
370            let pad_right = max_len - len;
371
372            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
373            // Build 1-D token tensor then pad to max_len
374            let t = Tensor::from_vec(ids_i64, len, &self.device)
375                .context("criando tensor de ids para batch")?;
376            let t = t
377                .pad_with_zeros(0, 0, pad_right)
378                .context("padding tensor de ids")?;
379            padded_ids.push(t);
380
381            // Attention mask: 1 for real tokens, 0 for padding
382            let mut mask_i64 = vec![1i64; len];
383            mask_i64.extend(vec![0i64; pad_right]);
384            let m = Tensor::from_vec(mask_i64, max_len, &self.device)
385                .context("criando tensor de máscara para batch")?;
386            padded_masks.push(m);
387        }
388
389        // Stack 1-D tensors into (batch_size, max_len)
390        let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
391        let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
392        let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
393            .context("criando token_type_ids batch")?;
394
395        // Single forward pass for the entire batch
396        let sequence_output = self
397            .bert
398            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
399            .context("forward pass batch BertModel")?;
400        // sequence_output: (batch_size, max_len, hidden_size)
401
402        let logits = self
403            .classifier
404            .forward(&sequence_output)
405            .context("forward pass batch classificador")?;
406        // logits: (batch_size, max_len, num_labels)
407
408        let mut results = Vec::with_capacity(batch_size);
409        for (i, (window_ids, _)) in windows.iter().enumerate() {
410            let example_logits = logits.get(i).context("get logits exemplo")?;
411            // (max_len, num_labels) — slice only real tokens, discard padding
412            let real_len = window_ids.len();
413            let example_slice = example_logits
414                .narrow(0, 0, real_len)
415                .context("narrow para tokens reais")?;
416            let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;
417
418            let labels: Vec<String> = logits_2d
419                .iter()
420                .map(|token_logits| {
421                    let argmax = token_logits
422                        .iter()
423                        .enumerate()
424                        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
425                        .map(|(idx, _)| idx)
426                        .unwrap_or(0);
427                    self.id2label
428                        .get(&argmax)
429                        .cloned()
430                        .unwrap_or_else(|| "O".to_string())
431                })
432                .collect();
433
434            results.push(labels);
435        }
436
437        Ok(results)
438    }
439}
440
// Process-wide cache of the loaded NER model. `None` records a load failure so
// the (expensive) load is never retried within the same process.
static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();

/// Returns the cached NER model, loading it on first call.
///
/// On load failure the error is logged once and `None` is cached; callers then
/// degrade gracefully to the regex-only extraction path.
fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
    NER_MODEL
        .get_or_init(|| match load_model(paths) {
            Ok(m) => Some(m),
            Err(e) => {
                tracing::warn!("NER model unavailable (graceful degradation): {e:#}");
                None
            }
        })
        .as_ref()
}
454
/// Local cache directory for the NER model files (under AppPaths::models).
fn model_dir(paths: &AppPaths) -> PathBuf {
    paths.models.join("bert-multilingual-ner")
}
458
/// Ensures the NER model files exist in the local cache, downloading any
/// missing files from the HuggingFace Hub on first run (~676 MB).
///
/// Returns the model directory. Idempotent: files already present on disk are
/// never re-downloaded (checked per file inside the download loop).
fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
    let dir = model_dir(paths);
    std::fs::create_dir_all(&dir)
        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;

    let weights = dir.join("model.safetensors");
    let config = dir.join("config.json");
    let tokenizer = dir.join("tokenizer.json");

    // Fast path: the three files required for inference are already cached.
    if weights.exists() && config.exists() && tokenizer.exists() {
        return Ok(dir);
    }

    tracing::info!("Downloading NER model (first run, ~676 MB)...");
    crate::output::emit_progress_i18n(
        "Downloading NER model (first run, ~676 MB)...",
        "Baixando modelo NER (primeira execução, ~676 MB)...",
    );

    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
    let repo = api.model(MODEL_ID.to_string());

    // v1.0.20 primary P0 fix: tokenizer.json in the Davlan repo lives only at
    // onnx/tokenizer.json. v1.0.19 fetched it from the repo root, got a 404, and
    // fell into graceful degradation 100% of the time. We map
    // (remote_path, local_filename) pairs so the file is downloaded from the
    // subfolder while keeping a flat local name.
    for (remote, local) in &[
        ("model.safetensors", "model.safetensors"),
        ("config.json", "config.json"),
        ("onnx/tokenizer.json", "tokenizer.json"),
        ("tokenizer_config.json", "tokenizer_config.json"),
    ] {
        let dest = dir.join(local);
        if !dest.exists() {
            let src = repo
                .get(remote)
                .with_context(|| format!("baixando {remote} do HF Hub"))?;
            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
        }
    }

    Ok(dir)
}
501
502fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
503    let dir = ensure_model_files(paths)?;
504    BertNerModel::load(&dir)
505}
506
507fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
508    let mut entities = Vec::new();
509    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
510
511    let add = |entities: &mut Vec<ExtractedEntity>,
512               seen: &mut std::collections::HashSet<String>,
513               name: &str,
514               entity_type: &str| {
515        let name = name.trim().to_string();
516        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
517            entities.push(ExtractedEntity {
518                name,
519                entity_type: entity_type.to_string(),
520            });
521        }
522    };
523
524    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
525    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
526    let cleaned = regex_section_marker().replace_all(body, " ");
527    let cleaned = cleaned.as_ref();
528
529    for m in regex_email().find_iter(cleaned) {
530        // v1.0.20: email é "concept" (regex sozinho não distingue pessoa de mailing list/role).
531        add(&mut entities, &mut seen, m.as_str(), "concept");
532    }
533    for m in regex_uuid().find_iter(cleaned) {
534        add(&mut entities, &mut seen, m.as_str(), "concept");
535    }
536    for m in regex_all_caps().find_iter(cleaned) {
537        let candidate = m.as_str();
538        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
539        if !is_filtered_all_caps(candidate) {
540            add(&mut entities, &mut seen, candidate, "concept");
541        }
542    }
543    // v1.0.25 P0-2: capture CamelCase brand names that BERT NER often misses.
544    // Maps to "organization" (V008 schema) because brand names are typically organisations.
545    for m in regex_brand_camel().find_iter(cleaned) {
546        let name = m.as_str();
547        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
548        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
549            add(&mut entities, &mut seen, name, "organization");
550        }
551    }
552
553    entities
554}
555
556/// Extracts URLs from a memory body, deduplicated by text.
557/// URLs are stored in the `memory_urls` table separately from graph entities.
558/// v1.0.24: split of the URL block that polluted apply_regex_prefilter with entity_type='concept'.
559pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
560    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
561    let mut result = Vec::new();
562    for m in regex_url().find_iter(body) {
563        let raw = m.as_str();
564        let cleaned = raw
565            .trim_end_matches('`')
566            .trim_end_matches(',')
567            .trim_end_matches('.')
568            .trim_end_matches(';')
569            .trim_end_matches(')')
570            .trim_end_matches(']')
571            .trim_end_matches('}');
572        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
573            result.push(ExtractedUrl {
574                url: cleaned.to_string(),
575                offset: m.start(),
576            });
577        }
578    }
579    result
580}
581
582fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
583    let mut entities: Vec<ExtractedEntity> = Vec::new();
584    let mut current_parts: Vec<String> = Vec::new();
585    let mut current_type: Option<String> = None;
586
587    let flush =
588        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
589            if let Some(t) = typ.take() {
590                let name = parts.join(" ").trim().to_string();
591                // v1.0.22: filtra single-token entities que sejam stopwords ALL CAPS ou HTTP methods.
592                // BERT NER classifica algumas dessas como B-MISC/B-ORG; pós-filtro aqui evita
593                // poluir o grafo com verbos/protocolos genéricos.
594                let is_single_caps = !name.contains(' ')
595                    && name == name.to_uppercase()
596                    && name.len() >= MIN_ENTITY_CHARS;
597                let should_skip = is_single_caps && is_filtered_all_caps(&name);
598                // v1.0.25 P0-4: BERT may independently label section-structure tokens (e.g.
599                // "Etapa 3", "Fase 1") even though apply_regex_prefilter strips them from the
600                // input text before regex extraction. Apply the same guard here to avoid the
601                // BERT path re-introducing these markers as graph entities.
602                let is_section_marker = regex_section_marker().is_match(&name);
603                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
604                    entities.push(ExtractedEntity {
605                        name,
606                        entity_type: t,
607                    });
608                }
609                parts.clear();
610            }
611        };
612
613    for (token, label) in tokens.iter().zip(labels.iter()) {
614        if label == "O" {
615            flush(&mut current_parts, &mut current_type, &mut entities);
616            continue;
617        }
618
619        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
620            ("B", rest)
621        } else if let Some(rest) = label.strip_prefix("I-") {
622            ("I", rest)
623        } else {
624            flush(&mut current_parts, &mut current_type, &mut entities);
625            continue;
626        };
627
628        // v1.0.25 P0-2: Portuguese monosyllabic verbs that BERT often misclassifies as person names.
629        // Only filtered when confidence is unavailable (no logit gate here); these tokens are
630        // structurally unlikely to be real proper names in a technical corpus.
631        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
632            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
633        ];
634
635        let entity_type = match bio_type {
636            // v1.0.25 V008: DATE is now a first-class entity type instead of being discarded.
637            "DATE" => "date",
638            "PER" => {
639                // Filter well-known PT monosyllabic verbs misclassified as persons.
640                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
641                    flush(&mut current_parts, &mut current_type, &mut entities);
642                    continue;
643                }
644                "person"
645            }
646            "ORG" => {
647                let t = token.to_lowercase();
648                if t.contains("lib")
649                    || t.contains("sdk")
650                    || t.contains("cli")
651                    || t.contains("crate")
652                    || t.contains("npm")
653                {
654                    "tool"
655                } else {
656                    // v1.0.25 V008: "organization" replaces the v1.0.24 default "project".
657                    "organization"
658                }
659            }
660            // v1.0.25 V008: "location" replaces "concept" for geographic tokens.
661            "LOC" => "location",
662            other => other,
663        };
664
665        if prefix == "B" {
666            if token.starts_with("##") {
667                // BERT confuso: subword com B-prefix indica continuação de entidade anterior.
668                // Anexar à última parte da entidade atual; senão descartar.
669                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
670                if let Some(last) = current_parts.last_mut() {
671                    last.push_str(clean);
672                }
673                continue;
674            }
675            flush(&mut current_parts, &mut current_type, &mut entities);
676            current_parts.push(token.clone());
677            current_type = Some(entity_type.to_string());
678        } else if prefix == "I" && current_type.is_some() {
679            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
680            if token.starts_with("##") {
681                if let Some(last) = current_parts.last_mut() {
682                    last.push_str(clean);
683                }
684            } else {
685                current_parts.push(clean.to_string());
686            }
687        }
688    }
689
690    flush(&mut current_parts, &mut current_type, &mut entities);
691    entities
692}
693
694/// Returns (relationships, truncated) where truncated is true when the cap was hit
695/// before all entity pairs were covered. Exposed in RememberResponse as
696/// `relationships_truncated` so callers can decide whether to increase the cap.
697fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
698    if entities.len() < 2 {
699        return (Vec::new(), false);
700    }
701
702    // v1.0.22: cap configurável via env var (constants::max_relationships_per_memory).
703    // Permite usuários com corpus denso aumentar além do default 50.
704    let max_rels = crate::constants::max_relationships_per_memory();
705    let n = entities.len().min(MAX_ENTS);
706    let mut rels: Vec<NewRelationship> = Vec::new();
707    let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
708
709    let mut hit_cap = false;
710    'outer: for i in 0..n {
711        if rels.len() >= max_rels {
712            hit_cap = true;
713            break;
714        }
715
716        let mut for_entity = 0usize;
717        for j in (i + 1)..n {
718            if for_entity >= TOP_K_RELATIONS {
719                break;
720            }
721            if rels.len() >= max_rels {
722                hit_cap = true;
723                break 'outer;
724            }
725
726            let src = &entities[i].name;
727            let tgt = &entities[j].name;
728            let key = (src.clone(), tgt.clone());
729
730            if seen.contains(&key) {
731                continue;
732            }
733            seen.insert(key);
734
735            rels.push(NewRelationship {
736                source: src.clone(),
737                target: tgt.clone(),
738                relation: DEFAULT_RELATION.to_string(),
739                strength: 0.5,
740                description: None,
741            });
742            for_entity += 1;
743        }
744    }
745
746    // v1.0.20: avisar quando relacionamentos foram truncados antes de cobrir todos os pares possíveis.
747    if hit_cap {
748        tracing::warn!(
749            "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
750            n.saturating_sub(1)
751        );
752    }
753
754    (rels, hit_cap)
755}
756
/// Runs BERT NER over `body` using overlapping sliding windows of MAX_SEQ_LEN
/// tokens (step STRIDE), batching windows into single forward passes.
///
/// Entities are deduplicated by name across windows. If a batched forward pass
/// fails, the chunk is retried window-by-window so one bad batch does not lose
/// every entity in it. Returns an empty list for an empty tokenisation.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // encode(..., false): no special tokens ([CLS]/[SEP]) added.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Phase 1: collect all sliding windows before any inference
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Phase 2: sort by window length ascending to minimise intra-batch padding waste
    windows.sort_by_key(|(ids, _)| ids.len());

    // Phase 3: batched inference with fallback to single-window predict on error
    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        // First occurrence of a name wins across all windows.
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                // Fallback: process each window individually to preserve entities
                for (ids, tokens) in chunk {
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            // Best-effort: log and keep going with remaining windows.
                            tracing::warn!("NER window fallback also failed: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
842
843/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
844/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
845/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
846/// Conservative: only extends when the suffix is at most 7 characters.
847/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
848/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
849fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
850    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
851    // Matches: separator + digits + optional decimal + optional lowercase letter
852    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
853    let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
854
855    entities
856        .into_iter()
857        .map(|ent| {
858            // Encontra a primeira ocorrência case-sensitive da entidade no body
859            if let Some(pos) = body.find(&ent.name) {
860                let after_pos = pos + ent.name.len();
861                if after_pos < body.len() {
862                    let after = &body[after_pos..];
863                    if let Some(m) = suffix_re.find(after) {
864                        let suffix = m.as_str();
865                        // Conservative: cap suffix length to 7 chars to avoid grabbing
866                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
867                        if suffix.len() <= 7 {
868                            let extended = format!("{}{}", ent.name, suffix);
869                            return ExtractedEntity {
870                                name: extended,
871                                entity_type: ent.entity_type,
872                            };
873                        }
874                    }
875                }
876            }
877            ent
878        })
879        .collect()
880}
881
882/// Captures versioned model names that BERT NER consistently misses.
883///
884/// BERT NER often classifies tokens like "Claude" or "Llama" as common nouns,
885/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
886/// never sees these candidates and the version suffix gets lost.
887///
888/// This function scans the body with a conservative regex, matching capitalised
889/// words followed by a space-or-hyphen and a small integer. Matches that are not
890/// already covered by an existing entity (case-insensitive) are appended with the
891/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
892/// items downstream.
893///
894/// v1.0.24 P2-D: regex extended to cover:
895/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
896/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
897/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
898///
899/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
900/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
901fn augment_versioned_model_names(
902    entities: Vec<ExtractedEntity>,
903    body: &str,
904) -> Vec<ExtractedEntity> {
905    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
906    // Pattern breakdown:
907    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
908    //   [\s\-]+               — separator: space(s) or hyphen(s)
909    //   \d+(?:\.\d+)?         — version number, optional decimal
910    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
911    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
912    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
913        Regex::new(
914            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
915        )
916        .unwrap()
917    });
918
919    let mut existing_lc: std::collections::HashSet<String> =
920        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
921    let mut result = entities;
922
923    for caps in model_re.captures_iter(body) {
924        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
925        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
926        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
927        if full_match.is_empty() || full_match.len() > 24 {
928            continue;
929        }
930        let normalized_lc = full_match.to_lowercase();
931        if existing_lc.contains(&normalized_lc) {
932            continue;
933        }
934        // Stop appending once the global entity cap is reached to keep parity with
935        // `merge_and_deduplicate` truncation semantics.
936        if result.len() >= MAX_ENTS {
937            break;
938        }
939        existing_lc.insert(normalized_lc);
940        result.push(ExtractedEntity {
941            name: full_match.to_string(),
942            entity_type: "concept".to_string(),
943        });
944    }
945
946    result
947}
948
949fn merge_and_deduplicate(
950    regex_ents: Vec<ExtractedEntity>,
951    ner_ents: Vec<ExtractedEntity>,
952) -> Vec<ExtractedEntity> {
953    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
954    // and is scoped per entity_type. This fixes two bugs from prior versions:
955    //
956    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
957    //    start_with "sonnet", so the pair could survive dedup depending on insertion
958    //    order. contains() catches both directions unconditionally.
959    //
960    // 2. The lookup key omitted entity_type, so "Apple/organization" and
961    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
962    //
963    // Earlier invariants preserved:
964    // - NFKC normalization before lowercasing (v1.0.24).
965    // - Longest-wins: on collision keep the entity with the longer name.
966    // - Truncation warning at MAX_ENTS.
967    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
968    let mut result: Vec<ExtractedEntity> = Vec::new();
969    let mut truncated = false;
970
971    let total_input = regex_ents.len() + ner_ents.len();
972    for ent in regex_ents.into_iter().chain(ner_ents) {
973        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
974        // Composite key: entity_type + NUL + normalised lowercase name.
975        // Collision search is scoped to the same type so that e.g.
976        // "Apple/organization" and "Apple/concept" are kept separately.
977        let key = format!("{}\0{}", ent.entity_type, name_lc);
978
979        // Scan stored entries for substring containment within the same type.
980        // Two names collide when one is a case-insensitive substring of the other:
981        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
982        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
983        let mut collision_idx: Option<usize> = None;
984        for (existing_key, idx) in &by_lc {
985            // Fast-path: check type prefix matches before scanning the name.
986            let type_prefix = format!("{}\0", ent.entity_type);
987            if !existing_key.starts_with(&type_prefix) {
988                continue;
989            }
990            let existing_name_lc = &existing_key[type_prefix.len()..];
991            if existing_name_lc == name_lc
992                || existing_name_lc.contains(name_lc.as_str())
993                || name_lc.contains(existing_name_lc)
994            {
995                collision_idx = Some(*idx);
996                break;
997            }
998        }
999        match collision_idx {
1000            Some(idx) => {
1001                // Replace stored entity only when the new candidate is strictly
1002                // longer; otherwise drop the new one.
1003                if ent.name.len() > result[idx].name.len() {
1004                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1005                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
1006                    by_lc.remove(&old_key);
1007                    result[idx] = ent;
1008                    by_lc.insert(key, idx);
1009                }
1010            }
1011            None => {
1012                by_lc.insert(key, result.len());
1013                result.push(ent);
1014            }
1015        }
1016        if result.len() >= MAX_ENTS {
1017            truncated = true;
1018            break;
1019        }
1020    }
1021
1022    // v1.0.20: avisar quando truncamento silencioso descarta entidades acima do MAX_ENTS.
1023    if truncated {
1024        tracing::warn!(
1025            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
1026        );
1027    }
1028
1029    result
1030}
1031
1032fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1033    extracted
1034        .into_iter()
1035        .map(|e| NewEntity {
1036            name: e.name,
1037            entity_type: e.entity_type,
1038            description: None,
1039        })
1040        .collect()
1041}
1042
1043pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1044    let regex_entities = apply_regex_prefilter(body);
1045
1046    let mut bert_used = false;
1047    let ner_entities = match get_or_init_model(paths) {
1048        Some(model) => match run_ner_sliding_window(model, body, paths) {
1049            Ok(ents) => {
1050                bert_used = true;
1051                ents
1052            }
1053            Err(e) => {
1054                tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1055                Vec::new()
1056            }
1057        },
1058        None => Vec::new(),
1059    };
1060
1061    let merged = merge_and_deduplicate(regex_entities, ner_entities);
1062    // v1.0.22: estender entidades NER com sufixos numéricos do body (GPT-5, Claude 4, Python 3).
1063    let extended = extend_with_numeric_suffix(merged, body);
1064    // v1.0.23: capture versioned model names that BERT NER does not detect on its own
1065    // (e.g. "Claude 4", "Llama 3"). Hyphenated variants like "GPT-5" are already covered
1066    // by the NER+suffix pipeline above, but space-separated names need a dedicated pass.
1067    let with_models = augment_versioned_model_names(extended, body);
1068    // v1.0.25 P0-4: augment_versioned_model_names matches any capitalised word followed by a
1069    // digit, which inadvertently captures PT-BR section markers ("Etapa 3", "Fase 1"). Strip
1070    // them here as a final guard after the full augmentation pipeline.
1071    let with_models: Vec<ExtractedEntity> = with_models
1072        .into_iter()
1073        .filter(|e| !regex_section_marker().is_match(&e.name))
1074        .collect();
1075    let entities = to_new_entities(with_models);
1076    let (relationships, relationships_truncated) = build_relationships(&entities);
1077
1078    let extraction_method = if bert_used {
1079        "bert+regex-batch".to_string()
1080    } else {
1081        "regex-only".to_string()
1082    };
1083
1084    let urls = extract_urls(body);
1085
1086    Ok(ExtractionResult {
1087        entities,
1088        relationships,
1089        relationships_truncated,
1090        extraction_method,
1091        urls,
1092    })
1093}
1094
1095pub struct RegexExtractor;
1096
1097impl Extractor for RegexExtractor {
1098    fn extract(&self, body: &str) -> Result<ExtractionResult> {
1099        let regex_entities = apply_regex_prefilter(body);
1100        let entities = to_new_entities(regex_entities);
1101        let (relationships, relationships_truncated) = build_relationships(&entities);
1102        let urls = extract_urls(body);
1103        Ok(ExtractionResult {
1104            entities,
1105            relationships,
1106            relationships_truncated,
1107            extraction_method: "regex-only".to_string(),
1108            urls,
1109        })
1110    }
1111}
1112
1113#[cfg(test)]
1114mod tests {
1115    use super::*;
1116
    /// Builds throwaway `AppPaths` pointing at /tmp locations; no NER model
    /// exists there, so model-dependent paths degrade to regex-only behaviour.
    fn make_paths() -> AppPaths {
        use std::path::PathBuf;
        AppPaths {
            db: PathBuf::from("/tmp/test.sqlite"),
            models: PathBuf::from("/tmp/test_models"),
        }
    }
1124
    #[test]
    fn regex_email_captura_endereco() {
        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
        // v1.0.20: emails are classified as "concept" (regex alone cannot tell a person from a role).
        assert!(ents
            .iter()
            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_filtra_palavra_regra_pt() {
        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE must not become "entities".
        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
        assert!(
            !ents.iter().any(|e| e.name == "NUNCA"),
            "NUNCA deveria ser filtrado como stopword"
        );
        assert!(
            !ents.iter().any(|e| e.name == "PROIBIDO"),
            "PROIBIDO deveria ser filtrado"
        );
        assert!(
            !ents.iter().any(|e| e.name == "DEVE"),
            "DEVE deveria ser filtrado"
        );
    }

    #[test]
    fn regex_all_caps_aceita_constante_com_underscore() {
        // Technical constants like MAX_RETRY, TIMEOUT_MS must always be accepted.
        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
    }

    #[test]
    fn regex_all_caps_aceita_acronimo_dominio() {
        // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
        assert!(ents.iter().any(|e| e.name == "OPENAI"));
        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
    }

    #[test]
    fn regex_url_nao_aparece_em_apply_regex_prefilter() {
        // v1.0.24 P0-2: URLs were removed from apply_regex_prefilter and now go through extract_urls.
        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
        assert!(
            !ents.iter().any(|e| e.name.starts_with("https://")),
            "URLs não devem aparecer como entidades após split P0-2"
        );
    }

    #[test]
    fn extract_urls_captura_https() {
        let urls = extract_urls("veja https://docs.rs/crate para detalhes");
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].url, "https://docs.rs/crate");
        // URL does not start the string, so its offset must be positive.
        assert!(urls[0].offset > 0);
    }

    #[test]
    fn extract_urls_trim_sufixo_pontuacao() {
        // Trailing sentence punctuation must be stripped from the captured URL.
        let urls = extract_urls("link: https://example.com/path. fim");
        assert!(!urls.is_empty());
        assert!(
            !urls[0].url.ends_with('.'),
            "sufixo ponto deve ser removido"
        );
    }

    #[test]
    fn extract_urls_deduplica_repetidas() {
        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
        let urls = extract_urls(body);
        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
    }

    #[test]
    fn regex_uuid_captura_identificador() {
        // UUIDs are surfaced as "concept" entities by the regex prefilter.
        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
        assert!(ents.iter().any(|e| e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_captura_constante() {
        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
    }

    #[test]
    fn regex_all_caps_ignora_palavras_curtas() {
        // Very short all-caps tokens ("AI") are dropped by the prefilter.
        let ents = apply_regex_prefilter("use AI em seu projeto");
        assert!(
            !ents.iter().any(|e| e.name == "AI"),
            "AI tem apenas 2 chars, deve ser ignorado"
        );
    }
1224
    #[test]
    fn iob_decodifica_per_para_person() {
        // B-PER + I-PER must merge into a single "person" entity; the trailing
        // O-tagged token is excluded.
        let tokens = vec![
            "John".to_string(),
            "Doe".to_string(),
            "trabalhou".to_string(),
        ];
        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1);
        assert_eq!(ents[0].entity_type, "person");
        assert!(ents[0].name.contains("John"));
    }

    #[test]
    fn iob_strip_subword_b_prefix() {
        // v1.0.21 P0: BERT sometimes emits ##AI with a B-prefix (confused subword).
        // It must merge into the active entity instead of creating a ghost "##AI" entity.
        let tokens = vec!["Open".to_string(), "##AI".to_string()];
        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
            "deveria mergear ##AI ou descartar"
        );
    }

    #[test]
    fn iob_subword_orphan_descarta() {
        // v1.0.21 P0: an orphan subword with no active entity must not become an entity.
        let tokens = vec!["##AI".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.is_empty(),
            "subword órfão sem entidade ativa deve ser descartado"
        );
    }

    #[test]
    fn iob_mapeia_date_para_date_v1025() {
        // v1.0.25 V008: DATE is now emitted instead of discarded.
        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
        assert_eq!(ents[0].entity_type, "date");
    }

    #[test]
    fn iob_mapeia_org_para_organization_v1025() {
        // v1.0.25 V008: B-ORG without tool keywords maps to "organization" not "project".
        let tokens = vec!["Empresa".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "organization");
    }

    #[test]
    fn iob_mapeia_org_sdk_para_tool() {
        // Counterpart to the test above: a B-ORG token containing a tool keyword
        // ("sdk") is reclassified as "tool".
        let tokens = vec!["tokio-sdk".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "tool");
    }

    #[test]
    fn iob_mapeia_loc_para_location_v1025() {
        // v1.0.25 V008: B-LOC maps to "location" not "concept".
        let tokens = vec!["Brasil".to_string()];
        let labels = vec!["B-LOC".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "location");
    }
1299
    #[test]
    fn build_relationships_respeitam_max_rels() {
        // 20 entities pairwise would exceed the configured cap; verify the cap
        // is honoured and the truncation flag is set when it is hit exactly.
        let entities: Vec<NewEntity> = (0..20)
            .map(|i| NewEntity {
                name: format!("entidade_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let (rels, truncated) = build_relationships(&entities);
        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
        if rels.len() == max_rels {
            assert!(truncated, "truncated deve ser true quando atingiu o cap");
        }
    }

    #[test]
    fn build_relationships_sem_duplicatas() {
        let entities: Vec<NewEntity> = (0..5)
            .map(|i| NewEntity {
                name: format!("ent_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let (rels, _truncated) = build_relationships(&entities);
        // Every (source, target) pair must be unique in the generated set.
        let mut pares: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        for r in &rels {
            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "par duplicado encontrado");
        }
    }

    #[test]
    fn merge_deduplica_por_nome_lowercase() {
        // v1.0.25: collision detection is scoped per entity_type; same name + same type
        // must deduplicate to one entry. Different types are kept separately.
        let a = vec![ExtractedEntity {
            name: "Rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let b = vec![ExtractedEntity {
            name: "rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let merged = merge_and_deduplicate(a, b);
        assert_eq!(
            merged.len(),
            1,
            "rust e Rust com mesmo tipo são a mesma entidade"
        );
    }
1354
    #[test]
    fn regex_extractor_implementa_trait() {
        // Smoke test: RegexExtractor satisfies the Extractor trait and yields
        // entities from a body containing an email and an ALL_CAPS constant.
        let extractor = RegexExtractor;
        let result = extractor
            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
            .unwrap();
        assert!(!result.entities.is_empty());
    }

    #[test]
    fn extract_retorna_ok_sem_modelo() {
        // Without a downloaded model, the pipeline must return Ok with regex
        // entities only (no hard failure).
        let paths = make_paths();
        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
        let result = extract_graph_auto(body, &paths).unwrap();
        assert!(result
            .entities
            .iter()
            .any(|e| e.name.contains("teste@exemplo.com")));
    }

    #[test]
    fn stopwords_filter_v1024_terms() {
        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
        // by apply_regex_prefilter so they do not appear as entities.
        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
        let ents = apply_regex_prefilter(body);
        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
        for word in &[
            "ACEITE",
            "ACK",
            "ACL",
            "BORDA",
            "CHECKLIST",
            "COMPLETED",
            "CONFIRME",
            "DEVEMOS",
            "DONE",
            "FIXED",
            "NEGUE",
            "PENDING",
            "PLAN",
            "PODEMOS",
            "RECUSE",
            "TOKEN",
            "VAMOS",
        ] {
            assert!(
                !names.contains(word),
                "v1.0.24 stopword {word} should be filtered but was found in entities"
            );
        }
    }

    #[test]
    fn dedup_normalizes_unicode_combining_marks() {
        // v1.0.24 P1-E: "Café" (NFC precomposed) and "Cafe\u{301}" (NFD with
        // combining acute accent) must deduplicate to a single entity after NFKC
        // normalization.
        let nfc = vec![ExtractedEntity {
            name: "Café".to_string(),
            entity_type: "concept".to_string(),
        }];
        // Build the NFD form: 'e' followed by combining acute accent U+0301
        let nfd_name = "Cafe\u{301}".to_string();
        let nfd = vec![ExtractedEntity {
            name: nfd_name,
            entity_type: "concept".to_string(),
        }];
        let merged = merge_and_deduplicate(nfc, nfd);
        assert_eq!(
            merged.len(),
            1,
            "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
        );
    }
1432
    // ── predict_batch regression tests ──────────────────────────────────────

    #[test]
    fn predict_batch_output_count_matches_input() {
        // Verify that predict_batch returns exactly one Vec<String> per window
        // without requiring a real model.  We test the shape contract by
        // constructing the padding logic manually and asserting counts.
        //
        // Two windows of different lengths: 3 tokens and 5 tokens.
        let w1_ids: Vec<u32> = vec![101, 100, 102];
        let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
        let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
        let w2_tok: Vec<String> = vec![
            "[CLS]".into(),
            "world".into(),
            "foo".into(),
            "bar".into(),
            "[SEP]".into(),
        ];
        let windows: Vec<(Vec<u32>, Vec<String>)> =
            vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

        // Verify padding logic and output length contracts using tensor operations
        // that do NOT require BertModel::forward.
        let device = Device::Cpu;
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
        assert_eq!(max_len, 5, "max_len deve ser 5");

        // Right-pad every window to max_len, mirroring predict_batch's input prep.
        let mut padded_ids: Vec<Tensor> = Vec::new();
        for (ids, _) in &windows {
            let len = ids.len();
            let pad_right = max_len - len;
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
            let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
            assert_eq!(
                t.dims(),
                &[max_len],
                "cada janela deve ter shape (max_len,) após padding"
            );
            padded_ids.push(t);
        }

        let stacked = Tensor::stack(&padded_ids, 0).unwrap();
        assert_eq!(
            stacked.dims(),
            &[2, max_len],
            "stack deve produzir (batch_size=2, max_len=5)"
        );

        // Verify narrow preserves only real tokens for each window
        // (simulates what predict_batch does after classifier.forward)
        let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9]; // batch×seq×num_labels=9
        let fake_logits =
            Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
        for (i, (ids, _)) in windows.iter().enumerate() {
            let real_len = ids.len();
            let example = fake_logits.get(i).unwrap();
            let sliced = example.narrow(0, 0, real_len).unwrap();
            assert_eq!(
                sliced.dims(),
                &[real_len, 9],
                "narrow deve preservar apenas {real_len} tokens reais"
            );
        }
    }

    #[test]
    fn predict_batch_empty_windows_returns_empty() {
        // predict_batch with no windows must return an empty Vec, not panic.
        // We test the guard logic directly on the batch size/max_len path.
        let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
        assert_eq!(max_len, 0, "zero windows → max_len 0");
        // The real predict_batch returns Ok(vec![]) when max_len == 0.
        // We assert the expected output shape by reproducing the guard here.
        let result: Vec<Vec<String>> = if max_len == 0 {
            Vec::new()
        } else {
            unreachable!()
        };
        assert!(result.is_empty());
    }
1516
    #[test]
    fn ner_batch_size_default_is_8() {
        // Verify that ner_batch_size() returns the documented default when the
        // env var is absent.  We clear the var to avoid cross-test contamination.
        // NOTE(review): Rust runs tests on parallel threads and the env var is
        // process-global, so this test can still race with
        // `ner_batch_size_env_override_clamped` — consider serializing them.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }

    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Override via env var; values outside [1, 32] must be clamped.
        // NOTE(review): same parallel-test env-var race as the default test above.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }
1543
    #[test]
    fn extraction_method_regex_only_unchanged() {
        // RegexExtractor always returns "regex-only" regardless of NER_MODEL OnceLock state.
        // This guards against accidentally changing the regex-only fallback string.
        let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
        assert_eq!(
            result.extraction_method, "regex-only",
            "RegexExtractor deve retornar regex-only"
        );
    }

    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---

    #[test]
    fn extend_suffix_pure_numeric_unchanged() {
        // Existing behaviour: pure-numeric suffix must still work after P2-E.
        let ents = vec![ExtractedEntity {
            name: "GPT".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
        assert_eq!(
            result[0].name, "GPT-5",
            "sufixo puramente numérico deve ser estendido"
        );
    }

    #[test]
    fn extend_suffix_alphanumeric_letter_after_digit() {
        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
        let ents = vec![ExtractedEntity {
            name: "GPT".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
        assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
    }

    #[test]
    fn extend_suffix_alphanumeric_b_suffix() {
        // P2-E: "5b" suffix (digit + 'b') must be captured.
        let ents = vec![ExtractedEntity {
            name: "Llama".to_string(),
            entity_type: "concept".to_string(),
        }];
        let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
        assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
    }
1592
1593    #[test]
1594    fn extend_suffix_alphanumeric_x_suffix() {
1595        // P2-E: "8x" suffix (digit + 'x') must be captured.
1596        let ents = vec![ExtractedEntity {
1597            name: "Mistral".to_string(),
1598            entity_type: "concept".to_string(),
1599        }];
1600        let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
1601        assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
1602    }
1603
1604    // --- P2-D: augment_versioned_model_names extended regex ---
1605
1606    #[test]
1607    fn augment_versioned_gpt4o() {
1608        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1609        let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
1610        assert!(
1611            result.iter().any(|e| e.name == "GPT-4o"),
1612            "GPT-4o deve ser capturado pelo augment, achados: {:?}",
1613            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1614        );
1615    }
1616
1617    #[test]
1618    fn augment_versioned_claude_4_sonnet() {
1619        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1620        let result =
1621            augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
1622        assert!(
1623            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1624            "Claude 4 Sonnet deve ser capturado, achados: {:?}",
1625            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1626        );
1627    }
1628
1629    #[test]
1630    fn augment_versioned_llama_3_pro() {
1631        // P2-D: "Llama 3 Pro" must be captured with release tier.
1632        let result =
1633            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1634        assert!(
1635            result.iter().any(|e| e.name == "Llama 3 Pro"),
1636            "Llama 3 Pro deve ser capturado, achados: {:?}",
1637            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1638        );
1639    }
1640
1641    #[test]
1642    fn augment_versioned_mixtral_8x7b() {
1643        // P2-D: "Mixtral 8x7B" composite version must be captured.
1644        let result =
1645            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1646        assert!(
1647            result.iter().any(|e| e.name == "Mixtral 8x7B"),
1648            "Mixtral 8x7B deve ser capturado, achados: {:?}",
1649            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1650        );
1651    }
1652
1653    #[test]
1654    fn augment_versioned_does_not_duplicate_existing() {
1655        // P2-D back-compat: entities already present must not be duplicated.
1656        let existing = vec![ExtractedEntity {
1657            name: "Claude 4".to_string(),
1658            entity_type: "concept".to_string(),
1659        }];
1660        let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
1661        let count = result.iter().filter(|e| e.name == "Claude 4").count();
1662        assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
1663    }
1664
1665    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1666
1667    #[test]
1668    fn stopwords_filter_url_jwt_api_v1025() {
1669        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1670        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1671        let ents = apply_regex_prefilter(body);
1672        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1673        for blocked in &[
1674            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1675        ] {
1676            assert!(
1677                !names.contains(blocked),
1678                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1679            );
1680        }
1681    }
1682
1683    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1684
1685    #[test]
1686    fn section_markers_etapa_fase_filtered_v1025() {
1687        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1688        let body = "Etapa 3 do plano: implementar Fase 1 da Migração.";
1689        let ents = apply_regex_prefilter(body);
1690        assert!(
1691            !ents
1692                .iter()
1693                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1694            "section markers must be stripped; entities: {:?}",
1695            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1696        );
1697    }
1698
1699    #[test]
1700    fn section_markers_passo_secao_filtered_v1025() {
1701        let body = "Siga Passo 2 conforme Seção 3 do manual.";
1702        let ents = apply_regex_prefilter(body);
1703        assert!(
1704            !ents
1705                .iter()
1706                .any(|e| e.name.contains("Passo") || e.name.contains("Seção")),
1707            "Passo/Seção section markers must be stripped; entities: {:?}",
1708            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1709        );
1710    }
1711
1712    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1713
1714    #[test]
1715    fn brand_camelcase_extracted_as_organization_v1025() {
1716        // "OpenAI" is a CamelCase brand that BERT NER often misses.
1717        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1718        let ents = apply_regex_prefilter(body);
1719        let openai = ents.iter().find(|e| e.name == "OpenAI");
1720        assert!(
1721            openai.is_some(),
1722            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1723            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1724        );
1725        assert_eq!(
1726            openai.unwrap().entity_type,
1727            "organization",
1728            "brand CamelCase must map to organization (V008)"
1729        );
1730    }
1731
1732    #[test]
1733    fn brand_postgresql_extracted_as_organization_v1025() {
1734        let body = "migrating from MySQL to PostgreSQL for better performance.";
1735        let ents = apply_regex_prefilter(body);
1736        assert!(
1737            ents.iter()
1738                .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization"),
1739            "PostgreSQL must be extracted as organization; entities: {:?}",
1740            ents.iter()
1741                .map(|e| (&e.name, &e.entity_type))
1742                .collect::<Vec<_>>()
1743        );
1744    }
1745
1746    // ── v1.0.25 V008 alignment ──
1747
1748    #[test]
1749    fn iob_org_maps_to_organization_not_project_v1025() {
1750        // B-ORG without tool keywords must emit "organization" (V008), not "project".
1751        let tokens = vec!["Microsoft".to_string()];
1752        let labels = vec!["B-ORG".to_string()];
1753        let ents = iob_to_entities(&tokens, &labels);
1754        assert_eq!(
1755            ents[0].entity_type, "organization",
1756            "B-ORG must map to organization (V008); got {}",
1757            ents[0].entity_type
1758        );
1759    }
1760
1761    #[test]
1762    fn iob_loc_maps_to_location_not_concept_v1025() {
1763        // B-LOC must emit "location" (V008), not "concept".
1764        let tokens = vec!["São".to_string(), "Paulo".to_string()];
1765        let labels = vec!["B-LOC".to_string(), "I-LOC".to_string()];
1766        let ents = iob_to_entities(&tokens, &labels);
1767        assert_eq!(
1768            ents[0].entity_type, "location",
1769            "B-LOC must map to location (V008); got {}",
1770            ents[0].entity_type
1771        );
1772    }
1773
1774    #[test]
1775    fn iob_date_maps_to_date_not_discarded_v1025() {
1776        // B-DATE must emit "date" (V008) instead of being discarded.
1777        let tokens = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
1778        let labels = vec![
1779            "B-DATE".to_string(),
1780            "I-DATE".to_string(),
1781            "I-DATE".to_string(),
1782        ];
1783        let ents = iob_to_entities(&tokens, &labels);
1784        assert_eq!(
1785            ents.len(),
1786            1,
1787            "DATE entity must be emitted (V008); entities: {ents:?}"
1788        );
1789        assert_eq!(ents[0].entity_type, "date");
1790    }
1791
1792    // ── v1.0.25 P0-2: PT verb false-positive filter ──
1793
1794    #[test]
1795    fn pt_verb_le_filtered_as_per_v1025() {
1796        // "Lê" is a PT monosyllabic verb; when tagged B-PER it must be dropped.
1797        let tokens = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
1798        let labels = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
1799        let ents = iob_to_entities(&tokens, &labels);
1800        assert!(
1801            !ents
1802                .iter()
1803                .any(|e| e.name == "Lê" && e.entity_type == "person"),
1804            "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
1805        );
1806    }
1807
1808    #[test]
1809    fn pt_verb_ver_filtered_as_per_v1025() {
1810        // "Ver" is a PT verb that BERT sometimes tags B-PER; must be filtered.
1811        let tokens = vec!["Ver".to_string()];
1812        let labels = vec!["B-PER".to_string()];
1813        let ents = iob_to_entities(&tokens, &labels);
1814        assert!(
1815            ents.is_empty(),
1816            "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
1817        );
1818    }
1819
1820    // --- P0-3 longest-wins v1.0.25 ---
1821
1822    fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1823        ExtractedEntity {
1824            name: name.to_string(),
1825            entity_type: entity_type.to_string(),
1826        }
1827    }
1828
1829    #[test]
1830    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1831        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1832        let regex = vec![entity("Sonne", "concept")];
1833        let ner = vec![entity("Sonnet", "concept")];
1834        let result = merge_and_deduplicate(regex, ner);
1835        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1836        assert_eq!(result[0].name, "Sonnet");
1837    }
1838
1839    #[test]
1840    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1841        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1842        let regex = vec![
1843            entity("Open", "organization"),
1844            entity("OpenAI", "organization"),
1845        ];
1846        let result = merge_and_deduplicate(regex, vec![]);
1847        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1848        assert_eq!(result[0].name, "OpenAI");
1849    }
1850
1851    #[test]
1852    fn merge_keeps_both_when_no_containment_v1025() {
1853        // "Alice" and "Bob" share no containment — both must be preserved.
1854        let regex = vec![entity("Alice", "person"), entity("Bob", "person")];
1855        let result = merge_and_deduplicate(regex, vec![]);
1856        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1857    }
1858
1859    #[test]
1860    fn merge_respects_entity_type_boundary_v1025() {
1861        // Same name "Apple" but different types: both must survive independently.
1862        let regex = vec![entity("Apple", "organization"), entity("Apple", "concept")];
1863        let result = merge_and_deduplicate(regex, vec![]);
1864        assert_eq!(
1865            result.len(),
1866            2,
1867            "expected 2 entities (different types), got: {result:?}"
1868        );
1869    }
1870
1871    #[test]
1872    fn merge_case_insensitive_dedup_v1025() {
1873        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1874        let regex = vec![
1875            entity("OpenAI", "organization"),
1876            entity("openai", "organization"),
1877        ];
1878        let result = merge_and_deduplicate(regex, vec![]);
1879        assert_eq!(
1880            result.len(),
1881            1,
1882            "expected 1 entity after case-insensitive dedup, got: {result:?}"
1883        );
1884    }
1885
1886    // ── v1.0.25 P0-4: section markers must be filtered in iob_to_entities too ──
1887
1888    #[test]
1889    fn iob_section_marker_etapa_filtered_v1025() {
1890        // BERT may tag "Etapa" (B-MISC) + "3" (I-MISC) as a span; flush must drop it.
1891        let tokens = vec!["Etapa".to_string(), "3".to_string()];
1892        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1893        let ents = iob_to_entities(&tokens, &labels);
1894        assert!(
1895            !ents.iter().any(|e| e.name.contains("Etapa")),
1896            "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
1897        );
1898    }
1899
1900    #[test]
1901    fn iob_section_marker_fase_filtered_v1025() {
1902        // BERT may tag "Fase" (B-MISC) + "1" (I-MISC) as a span; flush must drop it.
1903        let tokens = vec!["Fase".to_string(), "1".to_string()];
1904        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1905        let ents = iob_to_entities(&tokens, &labels);
1906        assert!(
1907            !ents.iter().any(|e| e.name.contains("Fase")),
1908            "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
1909        );
1910    }
1911}