// sqlite_graphrag/extraction.rs
1//! Entity and URL extraction pipeline (NER + regex prefilter).
2//!
3//! Runs named-entity recognition and regex heuristics to extract structured
4//! entities and hyperlinks from raw memory bodies before embedding.
5
6use std::collections::HashMap;
7use std::path::{Path, PathBuf};
8use std::sync::OnceLock;
9
10use anyhow::{Context, Result};
11use candle_core::{DType, Device, Tensor};
12use candle_nn::{Linear, Module, VarBuilder};
13use candle_transformers::models::bert::{BertModel, Config as BertConfig};
14use regex::Regex;
15use serde::Deserialize;
16use unicode_normalization::UnicodeNormalization;
17
18use crate::paths::AppPaths;
19use crate::storage::entities::{NewEntity, NewRelationship};
20
/// Hugging Face Hub repo of the multilingual NER model downloaded on first run.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Maximum tokens per inference window (BERT sequence limit).
const MAX_SEQ_LEN: usize = 512;
/// Step between consecutive sliding-window starts; windows overlap by
/// MAX_SEQ_LEN - STRIDE tokens so entities spanning a boundary are still seen.
const STRIDE: usize = 256;
/// Cap on the number of entities considered when building pairwise relationships.
const MAX_ENTS: usize = 30;
/// Max relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label assigned to every generated co-occurrence relationship.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum length (in bytes) for an extracted entity name to be kept.
const MIN_ENTITY_CHARS: usize = 2;
28
// Process-wide lazily-compiled regexes (compiled once on first use via OnceLock).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-2: captures CamelCase brand names that BERT NER often misses (e.g. "OpenAI", "PostgreSQL").
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
37
// v1.0.20: stopwords filtering common PT-BR/EN rule-words captured as ALL_CAPS.
// Without this filter, PT-BR technical corpora containing CAPS-formatted rules
// (NUNCA, PROIBIDO, DEVE) produced ~70% junk "entities". Identifiers such as
// MAX_RETRY (with underscore) are still preserved.
// v1.0.22: list expanded with terms observed in a 495-file flowaiper stress test.
// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns
// (BANCO, CASO), HTTP methods (GET, POST, DELETE) and generic data formats (JSON, XML).
// v1.0.24: added 17 new terms observed in audit v1.0.23: generic status words (COMPLETED, DONE,
// FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME, NEGUE, RECUSE), PT-BR modal/
// common verbs (DEVEMOS, PODEMOS, VAMOS), generic nouns (BORDA, CHECKLIST, PLAN, TOKEN),
// and common abbreviations (ACK, ACL).
// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL)
// and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE, PASSO, SEÇÃO) to prevent section markers
// and generic tech terms from being extracted as entities.
// NOTE: kept in alphabetical order for maintainability (lookup is a linear `contains`,
// so order does not affect behavior). Fixed BLOQUEAR/BORDA which were swapped.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "API",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CHECKLIST",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRME",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FIXED",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LLM",
    "MUST",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PASSO",
    "PENDING",
    "PLAN",
    "PODEMOS",
    "PROIBIDO",
    "RECUSE",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
148
// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps) and iob_to_entities (single-token).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
154
155fn is_filtered_all_caps(token: &str) -> bool {
156    // Identificadores com underscore são preservados (ex: MAX_RETRY, FLOWAIPER_API_KEY)
157    let is_identifier = token.contains('_');
158    if is_identifier {
159        return false;
160    }
161    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
162}
163
164fn regex_email() -> &'static Regex {
165    REGEX_EMAIL
166        .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
167}
168
169fn regex_url() -> &'static Regex {
170    REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
171}
172
173fn regex_uuid() -> &'static Regex {
174    REGEX_UUID.get_or_init(|| {
175        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
176            .unwrap()
177    })
178}
179
180fn regex_all_caps() -> &'static Regex {
181    REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
182}
183
184fn regex_section_marker() -> &'static Regex {
185    REGEX_SECTION_MARKER.get_or_init(|| {
186        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1", etc.
187        Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
188    })
189}
190
191fn regex_brand_camel() -> &'static Regex {
192    REGEX_BRAND_CAMEL.get_or_init(|| {
193        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
194        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
195        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
196    })
197}
198
/// A single named entity produced by the NER model or the regex prefilter.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Surface form of the entity as extracted from the body (trimmed).
    pub name: String,
    /// Entity type string: "person", "organization", "location", "date",
    /// "tool" or "concept" (see iob_to_entities / apply_regex_prefilter).
    pub entity_type: String,
}
204
/// URL with source offset extracted from the memory body.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// URL text with trailing punctuation stripped (see extract_urls).
    pub url: String,
    /// Byte position in the body where the URL was found.
    pub offset: usize,
}
212
/// Aggregate output of one extraction run over a memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities extracted from the body (NER and/or regex prefilter).
    pub entities: Vec<NewEntity>,
    /// Co-occurrence relationships between extracted entities
    /// (built by build_relationships).
    pub relationships: Vec<NewRelationship>,
    /// True when build_relationships hit the cap before covering all entity pairs.
    /// Exposed in RememberResponse so callers can detect when relationships were cut.
    pub relationships_truncated: bool,
    /// Extraction method used: "bert+regex" or "regex-only".
    /// Useful for auditing, metrics and user reports.
    pub extraction_method: String,
    /// URLs extracted from the body — stored separately from graph entities.
    pub urls: Vec<ExtractedUrl>,
}
226
/// Strategy interface for entity extraction; implementations must be usable
/// across threads (`Send + Sync`).
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships and URLs from a raw memory body.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
230
/// Minimal view of the Hugging Face `config.json`: only the fields needed to
/// size and label the classification head.
#[derive(Deserialize)]
struct ModelConfig {
    /// Label map keyed by stringified label index (HF convention);
    /// defaults to empty when absent from config.json.
    #[serde(default)]
    id2label: HashMap<String, String>,
    /// Transformer hidden dimension; used to shape classifier.weight.
    hidden_size: usize,
}
237
/// BERT encoder plus linear token-classification head for NER inference.
struct BertNerModel {
    bert: BertModel,
    /// Head mapping hidden states to per-token label logits.
    classifier: Linear,
    /// Device tensors are created on (always CPU — see load()).
    device: Device,
    /// Label index → IOB label name, parsed from config.json's id2label.
    id2label: HashMap<usize, String>,
}
244
impl BertNerModel {
    /// Loads the BERT encoder and classifier head from a local model directory
    /// containing `config.json` and `model.safetensors`.
    ///
    /// `config.json` is parsed twice: once into [`ModelConfig`] for the label
    /// map and hidden size, once into candle's `BertConfig` for the encoder.
    /// Inference always runs on CPU.
    fn load(model_dir: &Path) -> Result<Self> {
        let config_path = model_dir.join("config.json");
        let weights_path = model_dir.join("model.safetensors");

        let config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
        let model_cfg: ModelConfig =
            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;

        // HF configs key id2label by stringified index; keep only parseable keys.
        let id2label: HashMap<usize, String> = model_cfg
            .id2label
            .into_iter()
            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
            .collect();

        // Floor of 9 labels when id2label is empty — presumably the standard
        // O + B-/I- × {PER, ORG, LOC, MISC} scheme for this model family.
        let num_labels = id2label.len().max(9);
        let hidden_size = model_cfg.hidden_size;

        let bert_config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
        let bert_cfg: BertConfig =
            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;

        let device = Device::Cpu;

        // SAFETY: mmap of the safetensors file (standard candle loading path);
        // sound as long as the file is not modified while mapped.
        let vb = unsafe {
            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
                .with_context(|| format!("mapeando {weights_path:?}"))?
        };
        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;

        // v1.0.20 secondary P0 fix: load the classifier head from the safetensors
        // instead of zeros. v1.0.19 used Tensor::zeros, which produced a constant
        // argmax and degenerate inference.
        let cls_vb = vb.pp("classifier");
        let weight = cls_vb
            .get((num_labels, hidden_size), "weight")
            .context("carregando classifier.weight do safetensors")?;
        let bias = cls_vb
            .get(num_labels, "bias")
            .context("carregando classifier.bias do safetensors")?;
        let classifier = Linear::new(weight, Some(bias));

        Ok(Self {
            bert,
            classifier,
            device,
            id2label,
        })
    }

    /// Runs a single-window forward pass and returns one IOB label per token.
    ///
    /// `token_ids` and `attention_mask` are expected to have equal length.
    /// For each token, the argmax of the classifier logits is mapped through
    /// `id2label`, falling back to "O" for unknown indices.
    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
        let len = token_ids.len();
        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();

        // Shape every input as (batch=1, seq_len).
        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
            .context("criando tensor input_ids")?;
        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
            .context("criando tensor token_type_ids")?;
        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
            .context("criando tensor attention_mask")?;

        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass do BertModel")?;

        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass do classificador")?;

        // (1, seq_len, num_labels) → (seq_len, num_labels)
        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;

        let num_tokens = logits_2d.dim(0).context("dim(0)")?;

        // Greedy decode: per-token argmax over label logits.
        let mut labels = Vec::with_capacity(num_tokens);
        for i in 0..num_tokens {
            let token_logits = logits_2d.get(i).context("get token logits")?;
            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
            let argmax = vec
                .iter()
                .enumerate()
                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                .map(|(idx, _)| idx)
                .unwrap_or(0);
            let label = self
                .id2label
                .get(&argmax)
                .cloned()
                .unwrap_or_else(|| "O".to_string());
            labels.push(label);
        }

        Ok(labels)
    }

    /// Run a batched forward pass over multiple tokenised windows at once.
    ///
    /// Windows are padded on the right with token_id=0 and attention_mask=0 to
    /// the length of the longest window in the batch.  The attention mask ensures
    /// BERT ignores padded positions (bert.rs:515-528 adds -3.4e38 before softmax).
    ///
    /// Returns one label vector per window, each of length equal to that window's
    /// original (pre-padding) token count.
    fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
        let batch_size = windows.len();
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
        if max_len == 0 {
            return Ok(vec![vec![]; batch_size]);
        }

        let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
        let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);

        for (ids, _) in windows {
            let len = ids.len();
            let pad_right = max_len - len;

            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            // Build 1-D token tensor then pad to max_len
            let t = Tensor::from_vec(ids_i64, len, &self.device)
                .context("criando tensor de ids para batch")?;
            let t = t
                .pad_with_zeros(0, 0, pad_right)
                .context("padding tensor de ids")?;
            padded_ids.push(t);

            // Attention mask: 1 for real tokens, 0 for padding
            let mut mask_i64 = vec![1i64; len];
            mask_i64.extend(vec![0i64; pad_right]);
            let m = Tensor::from_vec(mask_i64, max_len, &self.device)
                .context("criando tensor de máscara para batch")?;
            padded_masks.push(m);
        }

        // Stack 1-D tensors into (batch_size, max_len)
        let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
        let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
        let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
            .context("criando token_type_ids batch")?;

        // Single forward pass for the entire batch
        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass batch BertModel")?;
        // sequence_output: (batch_size, max_len, hidden_size)

        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass batch classificador")?;
        // logits: (batch_size, max_len, num_labels)

        let mut results = Vec::with_capacity(batch_size);
        for (i, (window_ids, _)) in windows.iter().enumerate() {
            let example_logits = logits.get(i).context("get logits exemplo")?;
            // (max_len, num_labels) — slice only real tokens, discard padding
            let real_len = window_ids.len();
            let example_slice = example_logits
                .narrow(0, 0, real_len)
                .context("narrow para tokens reais")?;
            let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;

            // Greedy decode per token, same fallback-to-"O" rule as predict().
            let labels: Vec<String> = logits_2d
                .iter()
                .map(|token_logits| {
                    let argmax = token_logits
                        .iter()
                        .enumerate()
                        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                        .map(|(idx, _)| idx)
                        .unwrap_or(0);
                    self.id2label
                        .get(&argmax)
                        .cloned()
                        .unwrap_or_else(|| "O".to_string())
                })
                .collect();

            results.push(labels);
        }

        Ok(results)
    }
}
433
434static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
435
436fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
437    NER_MODEL
438        .get_or_init(|| match load_model(paths) {
439            Ok(m) => Some(m),
440            Err(e) => {
441                tracing::warn!("NER model unavailable (graceful degradation): {e:#}");
442                None
443            }
444        })
445        .as_ref()
446}
447
/// Local cache directory for the NER model files, under the app's models path.
fn model_dir(paths: &AppPaths) -> PathBuf {
    paths.models.join("bert-multilingual-ner")
}
451
452fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
453    let dir = model_dir(paths);
454    std::fs::create_dir_all(&dir)
455        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
456
457    let weights = dir.join("model.safetensors");
458    let config = dir.join("config.json");
459    let tokenizer = dir.join("tokenizer.json");
460
461    if weights.exists() && config.exists() && tokenizer.exists() {
462        return Ok(dir);
463    }
464
465    tracing::info!("Downloading NER model (first run, ~676 MB)...");
466    crate::output::emit_progress_i18n(
467        "Downloading NER model (first run, ~676 MB)...",
468        "Baixando modelo NER (primeira execução, ~676 MB)...",
469    );
470
471    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
472    let repo = api.model(MODEL_ID.to_string());
473
474    // v1.0.20 fix P0 primário: tokenizer.json no repo Davlan está apenas em onnx/tokenizer.json.
475    // Em v1.0.19 buscávamos da raiz e recebíamos 404, caindo em graceful degradation 100% das vezes.
476    // Mapeamos (remote_path, local_filename) para baixar do subfolder mantendo nome plano local.
477    for (remote, local) in &[
478        ("model.safetensors", "model.safetensors"),
479        ("config.json", "config.json"),
480        ("onnx/tokenizer.json", "tokenizer.json"),
481        ("tokenizer_config.json", "tokenizer_config.json"),
482    ] {
483        let dest = dir.join(local);
484        if !dest.exists() {
485            let src = repo
486                .get(remote)
487                .with_context(|| format!("baixando {remote} do HF Hub"))?;
488            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
489        }
490    }
491
492    Ok(dir)
493}
494
495fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
496    let dir = ensure_model_files(paths)?;
497    BertNerModel::load(&dir)
498}
499
500fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
501    let mut entities = Vec::new();
502    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
503
504    let add = |entities: &mut Vec<ExtractedEntity>,
505               seen: &mut std::collections::HashSet<String>,
506               name: &str,
507               entity_type: &str| {
508        let name = name.trim().to_string();
509        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
510            entities.push(ExtractedEntity {
511                name,
512                entity_type: entity_type.to_string(),
513            });
514        }
515    };
516
517    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
518    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
519    let cleaned = regex_section_marker().replace_all(body, " ");
520    let cleaned = cleaned.as_ref();
521
522    for m in regex_email().find_iter(cleaned) {
523        // v1.0.20: email é "concept" (regex sozinho não distingue pessoa de mailing list/role).
524        add(&mut entities, &mut seen, m.as_str(), "concept");
525    }
526    for m in regex_uuid().find_iter(cleaned) {
527        add(&mut entities, &mut seen, m.as_str(), "concept");
528    }
529    for m in regex_all_caps().find_iter(cleaned) {
530        let candidate = m.as_str();
531        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
532        if !is_filtered_all_caps(candidate) {
533            add(&mut entities, &mut seen, candidate, "concept");
534        }
535    }
536    // v1.0.25 P0-2: capture CamelCase brand names that BERT NER often misses.
537    // Maps to "organization" (V008 schema) because brand names are typically organisations.
538    for m in regex_brand_camel().find_iter(cleaned) {
539        let name = m.as_str();
540        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
541        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
542            add(&mut entities, &mut seen, name, "organization");
543        }
544    }
545
546    entities
547}
548
549/// Extracts URLs from a memory body, deduplicated by text.
550/// URLs are stored in the `memory_urls` table separately from graph entities.
551/// v1.0.24: split of the URL block that polluted apply_regex_prefilter with entity_type='concept'.
552pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
553    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
554    let mut result = Vec::new();
555    for m in regex_url().find_iter(body) {
556        let raw = m.as_str();
557        let cleaned = raw
558            .trim_end_matches('`')
559            .trim_end_matches(',')
560            .trim_end_matches('.')
561            .trim_end_matches(';')
562            .trim_end_matches(')')
563            .trim_end_matches(']')
564            .trim_end_matches('}');
565        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
566            result.push(ExtractedUrl {
567                url: cleaned.to_string(),
568                offset: m.start(),
569            });
570        }
571    }
572    result
573}
574
/// Converts parallel token/IOB-label sequences into merged entities.
///
/// Follows the BIO scheme: "B-X" starts an entity of type X, "I-X" continues
/// it, and "O" (or any unknown label) flushes the current entity. WordPiece
/// subwords ("##…") are glued back onto the previous part without a space.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Closes the currently-open entity (if any), applying post-filters before emit.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // v1.0.22: filter single-token entities that are ALL-CAPS stopwords or
                // HTTP methods. BERT NER labels some of these as B-MISC/B-ORG; this
                // post-filter keeps generic verbs/protocol words out of the graph.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                // v1.0.25 P0-4: BERT may independently label section-structure tokens (e.g.
                // "Etapa 3", "Fase 1") even though apply_regex_prefilter strips them from the
                // input text before regex extraction. Apply the same guard here to avoid the
                // BERT path re-introducing these markers as graph entities.
                let is_section_marker = regex_section_marker().is_match(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Unknown label scheme: treat like "O" and close any open entity.
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // v1.0.25 P0-2: Portuguese monosyllabic verbs that BERT often misclassifies as person names.
        // Only filtered when confidence is unavailable (no logit gate here); these tokens are
        // structurally unlikely to be real proper names in a technical corpus.
        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
        ];

        let entity_type = match bio_type {
            // v1.0.25 V008: DATE is now a first-class entity type instead of being discarded.
            "DATE" => "date",
            "PER" => {
                // Filter well-known PT monosyllabic verbs misclassified as persons.
                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
                    flush(&mut current_parts, &mut current_type, &mut entities);
                    continue;
                }
                "person"
            }
            "ORG" => {
                // Heuristic: ORG tokens that look like software artefacts become "tool".
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    // v1.0.25 V008: "organization" replaces the v1.0.24 default "project".
                    "organization"
                }
            }
            // v1.0.25 V008: "location" replaces "concept" for geographic tokens.
            "LOC" => "location",
            // Any other BIO type (e.g. "MISC") is passed through verbatim.
            other => other,
        };

        if prefix == "B" {
            if token.starts_with("##") {
                // Confused BERT output: a subword carrying a B- prefix actually
                // continues the previous entity. Append it to the last part if
                // one exists; otherwise drop the fragment.
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                // Subword continuation: concatenate without a separating space.
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Flush any entity still open at end of input.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
686
687/// Returns (relationships, truncated) where truncated is true when the cap was hit
688/// before all entity pairs were covered. Exposed in RememberResponse as
689/// `relationships_truncated` so callers can decide whether to increase the cap.
690fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
691    if entities.len() < 2 {
692        return (Vec::new(), false);
693    }
694
695    // v1.0.22: cap configurável via env var (constants::max_relationships_per_memory).
696    // Permite usuários com corpus denso aumentar além do default 50.
697    let max_rels = crate::constants::max_relationships_per_memory();
698    let n = entities.len().min(MAX_ENTS);
699    let mut rels: Vec<NewRelationship> = Vec::new();
700    let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
701
702    let mut hit_cap = false;
703    'outer: for i in 0..n {
704        if rels.len() >= max_rels {
705            hit_cap = true;
706            break;
707        }
708
709        let mut for_entity = 0usize;
710        for j in (i + 1)..n {
711            if for_entity >= TOP_K_RELATIONS {
712                break;
713            }
714            if rels.len() >= max_rels {
715                hit_cap = true;
716                break 'outer;
717            }
718
719            let src = &entities[i].name;
720            let tgt = &entities[j].name;
721            let key = (src.clone(), tgt.clone());
722
723            if seen.contains(&key) {
724                continue;
725            }
726            seen.insert(key);
727
728            rels.push(NewRelationship {
729                source: src.clone(),
730                target: tgt.clone(),
731                relation: DEFAULT_RELATION.to_string(),
732                strength: 0.5,
733                description: None,
734            });
735            for_entity += 1;
736        }
737    }
738
739    // v1.0.20: avisar quando relacionamentos foram truncados antes de cobrir todos os pares possíveis.
740    if hit_cap {
741        tracing::warn!(
742            "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
743            n.saturating_sub(1)
744        );
745    }
746
747    (rels, hit_cap)
748}
749
/// Tokenises `body` and runs NER over overlapping sliding windows
/// (MAX_SEQ_LEN tokens per window, advancing by STRIDE), deduplicating
/// extracted entities by name.
///
/// Batched inference is attempted first; on error, each window of the failing
/// chunk is retried individually so one bad batch does not lose all entities.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // Encode without special tokens (second argument `false`).
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Phase 1: collect all sliding windows before any inference
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Phase 2: sort by window length ascending to minimise intra-batch padding waste
    // (safe: output is a name-deduplicated set, so window order does not matter).
    windows.sort_by_key(|(ids, _)| ids.len());

    // Phase 3: batched inference with fallback to single-window predict on error
    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                // Fallback: process each window individually to preserve entities
                for (ids, tokens) in chunk {
                    // All-ones mask: every token in the window is real (no padding).
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            tracing::warn!("NER window fallback also failed: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
835
836/// v1.0.22 P1: extends entities with hyphenated or space-separated numeric suffixes.
837/// Cases: GPT extracted but body contains "GPT-5" → rewrites to "GPT-5".
838/// Cases: Claude extracted but body contains "Claude 4" → rewrites to "Claude 4".
839/// Conservative: only extends when the suffix is at most 7 characters.
840/// v1.0.24 P2-E: suffix accepts an optional lowercase ASCII letter after digits to cover
841/// models such as "GPT-4o", "Llama-5b", "Mistral-8x" (digits + [a-z]? + [x\d+]?).
842fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
843    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
844    // Matches: separator + digits + optional decimal + optional lowercase letter
845    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
846    let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
847
848    entities
849        .into_iter()
850        .map(|ent| {
851            // Encontra a primeira ocorrência case-sensitive da entidade no body
852            if let Some(pos) = body.find(&ent.name) {
853                let after_pos = pos + ent.name.len();
854                if after_pos < body.len() {
855                    let after = &body[after_pos..];
856                    if let Some(m) = suffix_re.find(after) {
857                        let suffix = m.as_str();
858                        // Conservative: cap suffix length to 7 chars to avoid grabbing
859                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
860                        if suffix.len() <= 7 {
861                            let extended = format!("{}{}", ent.name, suffix);
862                            return ExtractedEntity {
863                                name: extended,
864                                entity_type: ent.entity_type,
865                            };
866                        }
867                    }
868                }
869            }
870            ent
871        })
872        .collect()
873}
874
875/// Captures versioned model names that BERT NER consistently misses.
876///
877/// BERT NER often classifies tokens like "Claude" or "Llama" as common nouns,
878/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
879/// never sees these candidates and the version suffix gets lost.
880///
881/// This function scans the body with a conservative regex, matching capitalised
882/// words followed by a space-or-hyphen and a small integer. Matches that are not
883/// already covered by an existing entity (case-insensitive) are appended with the
884/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
885/// items downstream.
886///
887/// v1.0.24 P2-D: regex extended to cover:
888/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
889/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
890/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
891///
892/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
893/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
894fn augment_versioned_model_names(
895    entities: Vec<ExtractedEntity>,
896    body: &str,
897) -> Vec<ExtractedEntity> {
898    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
899    // Pattern breakdown:
900    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
901    //   [\s\-]+               — separator: space(s) or hyphen(s)
902    //   \d+(?:\.\d+)?         — version number, optional decimal
903    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
904    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
905    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
906        Regex::new(
907            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
908        )
909        .unwrap()
910    });
911
912    let mut existing_lc: std::collections::HashSet<String> =
913        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
914    let mut result = entities;
915
916    for caps in model_re.captures_iter(body) {
917        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
918        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
919        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
920        if full_match.is_empty() || full_match.len() > 24 {
921            continue;
922        }
923        let normalized_lc = full_match.to_lowercase();
924        if existing_lc.contains(&normalized_lc) {
925            continue;
926        }
927        // Stop appending once the global entity cap is reached to keep parity with
928        // `merge_and_deduplicate` truncation semantics.
929        if result.len() >= MAX_ENTS {
930            break;
931        }
932        existing_lc.insert(normalized_lc);
933        result.push(ExtractedEntity {
934            name: full_match.to_string(),
935            entity_type: "concept".to_string(),
936        });
937    }
938
939    result
940}
941
942fn merge_and_deduplicate(
943    regex_ents: Vec<ExtractedEntity>,
944    ner_ents: Vec<ExtractedEntity>,
945) -> Vec<ExtractedEntity> {
946    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
947    // and is scoped per entity_type. This fixes two bugs from prior versions:
948    //
949    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
950    //    start_with "sonnet", so the pair could survive dedup depending on insertion
951    //    order. contains() catches both directions unconditionally.
952    //
953    // 2. The lookup key omitted entity_type, so "Apple/organization" and
954    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
955    //
956    // Earlier invariants preserved:
957    // - NFKC normalization before lowercasing (v1.0.24).
958    // - Longest-wins: on collision keep the entity with the longer name.
959    // - Truncation warning at MAX_ENTS.
960    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
961    let mut result: Vec<ExtractedEntity> = Vec::new();
962    let mut truncated = false;
963
964    let total_input = regex_ents.len() + ner_ents.len();
965    for ent in regex_ents.into_iter().chain(ner_ents) {
966        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
967        // Composite key: entity_type + NUL + normalised lowercase name.
968        // Collision search is scoped to the same type so that e.g.
969        // "Apple/organization" and "Apple/concept" are kept separately.
970        let key = format!("{}\0{}", ent.entity_type, name_lc);
971
972        // Scan stored entries for substring containment within the same type.
973        // Two names collide when one is a case-insensitive substring of the other:
974        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
975        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
976        let mut collision_idx: Option<usize> = None;
977        for (existing_key, idx) in &by_lc {
978            // Fast-path: check type prefix matches before scanning the name.
979            let type_prefix = format!("{}\0", ent.entity_type);
980            if !existing_key.starts_with(&type_prefix) {
981                continue;
982            }
983            let existing_name_lc = &existing_key[type_prefix.len()..];
984            if existing_name_lc == name_lc
985                || existing_name_lc.contains(name_lc.as_str())
986                || name_lc.contains(existing_name_lc)
987            {
988                collision_idx = Some(*idx);
989                break;
990            }
991        }
992        match collision_idx {
993            Some(idx) => {
994                // Replace stored entity only when the new candidate is strictly
995                // longer; otherwise drop the new one.
996                if ent.name.len() > result[idx].name.len() {
997                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
998                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
999                    by_lc.remove(&old_key);
1000                    result[idx] = ent;
1001                    by_lc.insert(key, idx);
1002                }
1003            }
1004            None => {
1005                by_lc.insert(key, result.len());
1006                result.push(ent);
1007            }
1008        }
1009        if result.len() >= MAX_ENTS {
1010            truncated = true;
1011            break;
1012        }
1013    }
1014
1015    // v1.0.20: avisar quando truncamento silencioso descarta entidades acima do MAX_ENTS.
1016    if truncated {
1017        tracing::warn!(
1018            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
1019        );
1020    }
1021
1022    result
1023}
1024
1025fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1026    extracted
1027        .into_iter()
1028        .map(|e| NewEntity {
1029            name: e.name,
1030            entity_type: e.entity_type,
1031            description: None,
1032        })
1033        .collect()
1034}
1035
1036pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1037    let regex_entities = apply_regex_prefilter(body);
1038
1039    let mut bert_used = false;
1040    let ner_entities = match get_or_init_model(paths) {
1041        Some(model) => match run_ner_sliding_window(model, body, paths) {
1042            Ok(ents) => {
1043                bert_used = true;
1044                ents
1045            }
1046            Err(e) => {
1047                tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1048                Vec::new()
1049            }
1050        },
1051        None => Vec::new(),
1052    };
1053
1054    let merged = merge_and_deduplicate(regex_entities, ner_entities);
1055    // v1.0.22: estender entidades NER com sufixos numéricos do body (GPT-5, Claude 4, Python 3).
1056    let extended = extend_with_numeric_suffix(merged, body);
1057    // v1.0.23: capture versioned model names that BERT NER does not detect on its own
1058    // (e.g. "Claude 4", "Llama 3"). Hyphenated variants like "GPT-5" are already covered
1059    // by the NER+suffix pipeline above, but space-separated names need a dedicated pass.
1060    let with_models = augment_versioned_model_names(extended, body);
1061    // v1.0.25 P0-4: augment_versioned_model_names matches any capitalised word followed by a
1062    // digit, which inadvertently captures PT-BR section markers ("Etapa 3", "Fase 1"). Strip
1063    // them here as a final guard after the full augmentation pipeline.
1064    let with_models: Vec<ExtractedEntity> = with_models
1065        .into_iter()
1066        .filter(|e| !regex_section_marker().is_match(&e.name))
1067        .collect();
1068    let entities = to_new_entities(with_models);
1069    let (relationships, relationships_truncated) = build_relationships(&entities);
1070
1071    let extraction_method = if bert_used {
1072        "bert+regex-batch".to_string()
1073    } else {
1074        "regex-only".to_string()
1075    };
1076
1077    let urls = extract_urls(body);
1078
1079    Ok(ExtractionResult {
1080        entities,
1081        relationships,
1082        relationships_truncated,
1083        extraction_method,
1084        urls,
1085    })
1086}
1087
1088pub struct RegexExtractor;
1089
1090impl Extractor for RegexExtractor {
1091    fn extract(&self, body: &str) -> Result<ExtractionResult> {
1092        let regex_entities = apply_regex_prefilter(body);
1093        let entities = to_new_entities(regex_entities);
1094        let (relationships, relationships_truncated) = build_relationships(&entities);
1095        let urls = extract_urls(body);
1096        Ok(ExtractionResult {
1097            entities,
1098            relationships,
1099            relationships_truncated,
1100            extraction_method: "regex-only".to_string(),
1101            urls,
1102        })
1103    }
1104}
1105
1106#[cfg(test)]
1107mod tests {
1108    use super::*;
1109
1110    fn make_paths() -> AppPaths {
1111        use std::path::PathBuf;
1112        AppPaths {
1113            db: PathBuf::from("/tmp/test.sqlite"),
1114            models: PathBuf::from("/tmp/test_models"),
1115        }
1116    }
1117
1118    #[test]
1119    fn regex_email_captura_endereco() {
1120        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
1121        // v1.0.20: emails são classificados como "concept" (regex sozinho não distingue pessoa de role).
1122        assert!(ents
1123            .iter()
1124            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
1125    }
1126
1127    #[test]
1128    fn regex_all_caps_filtra_palavra_regra_pt() {
1129        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE não devem virar "entidades".
1130        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
1131        assert!(
1132            !ents.iter().any(|e| e.name == "NUNCA"),
1133            "NUNCA deveria ser filtrado como stopword"
1134        );
1135        assert!(
1136            !ents.iter().any(|e| e.name == "PROIBIDO"),
1137            "PROIBIDO deveria ser filtrado"
1138        );
1139        assert!(
1140            !ents.iter().any(|e| e.name == "DEVE"),
1141            "DEVE deveria ser filtrado"
1142        );
1143    }
1144
1145    #[test]
1146    fn regex_all_caps_aceita_constante_com_underscore() {
1147        // Constantes técnicas tipo MAX_RETRY, TIMEOUT_MS sempre devem ser aceitas.
1148        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
1149        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1150        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1151    }
1152
1153    #[test]
1154    fn regex_all_caps_aceita_acronimo_dominio() {
1155        // Acrônimos legítimos (não-stopword) devem passar: OPENAI, NVIDIA, GOOGLE.
1156        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
1157        assert!(ents.iter().any(|e| e.name == "OPENAI"));
1158        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1159    }
1160
1161    #[test]
1162    fn regex_url_nao_aparece_em_apply_regex_prefilter() {
1163        // v1.0.24 P0-2: URLs foram removidas de apply_regex_prefilter e agora vão para extract_urls.
1164        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
1165        assert!(
1166            !ents.iter().any(|e| e.name.starts_with("https://")),
1167            "URLs não devem aparecer como entidades após split P0-2"
1168        );
1169    }
1170
1171    #[test]
1172    fn extract_urls_captura_https() {
1173        let urls = extract_urls("veja https://docs.rs/crate para detalhes");
1174        assert_eq!(urls.len(), 1);
1175        assert_eq!(urls[0].url, "https://docs.rs/crate");
1176        assert!(urls[0].offset > 0);
1177    }
1178
1179    #[test]
1180    fn extract_urls_trim_sufixo_pontuacao() {
1181        let urls = extract_urls("link: https://example.com/path. fim");
1182        assert!(!urls.is_empty());
1183        assert!(
1184            !urls[0].url.ends_with('.'),
1185            "sufixo ponto deve ser removido"
1186        );
1187    }
1188
1189    #[test]
1190    fn extract_urls_deduplica_repetidas() {
1191        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1192        let urls = extract_urls(body);
1193        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1194    }
1195
1196    #[test]
1197    fn regex_uuid_captura_identificador() {
1198        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1199        assert!(ents.iter().any(|e| e.entity_type == "concept"));
1200    }
1201
1202    #[test]
1203    fn regex_all_caps_captura_constante() {
1204        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1205        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1206        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1207    }
1208
1209    #[test]
1210    fn regex_all_caps_ignora_palavras_curtas() {
1211        let ents = apply_regex_prefilter("use AI em seu projeto");
1212        assert!(
1213            !ents.iter().any(|e| e.name == "AI"),
1214            "AI tem apenas 2 chars, deve ser ignorado"
1215        );
1216    }
1217
1218    #[test]
1219    fn iob_decodifica_per_para_person() {
1220        let tokens = vec![
1221            "John".to_string(),
1222            "Doe".to_string(),
1223            "trabalhou".to_string(),
1224        ];
1225        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
1226        let ents = iob_to_entities(&tokens, &labels);
1227        assert_eq!(ents.len(), 1);
1228        assert_eq!(ents[0].entity_type, "person");
1229        assert!(ents[0].name.contains("John"));
1230    }
1231
1232    #[test]
1233    fn iob_strip_subword_b_prefix() {
1234        // v1.0.21 P0: BERT às vezes emite ##AI com B-prefix (subword confuso).
1235        // Deve mergear na entidade ativa em vez de criar entidade fantasma "##AI".
1236        let tokens = vec!["Open".to_string(), "##AI".to_string()];
1237        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
1238        let ents = iob_to_entities(&tokens, &labels);
1239        assert!(
1240            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
1241            "deveria mergear ##AI ou descartar"
1242        );
1243    }
1244
1245    #[test]
1246    fn iob_subword_orphan_descarta() {
1247        // v1.0.21 P0: subword órfão sem entidade ativa não deve virar entidade.
1248        let tokens = vec!["##AI".to_string()];
1249        let labels = vec!["B-ORG".to_string()];
1250        let ents = iob_to_entities(&tokens, &labels);
1251        assert!(
1252            ents.is_empty(),
1253            "subword órfão sem entidade ativa deve ser descartado"
1254        );
1255    }
1256
1257    #[test]
1258    fn iob_mapeia_date_para_date_v1025() {
1259        // v1.0.25 V008: DATE is now emitted instead of discarded.
1260        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
1261        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
1262        let ents = iob_to_entities(&tokens, &labels);
1263        assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
1264        assert_eq!(ents[0].entity_type, "date");
1265    }
1266
1267    #[test]
1268    fn iob_mapeia_org_para_organization_v1025() {
1269        // v1.0.25 V008: B-ORG without tool keywords maps to "organization" not "project".
1270        let tokens = vec!["Empresa".to_string()];
1271        let labels = vec!["B-ORG".to_string()];
1272        let ents = iob_to_entities(&tokens, &labels);
1273        assert_eq!(ents[0].entity_type, "organization");
1274    }
1275
1276    #[test]
1277    fn iob_mapeia_org_sdk_para_tool() {
1278        let tokens = vec!["tokio-sdk".to_string()];
1279        let labels = vec!["B-ORG".to_string()];
1280        let ents = iob_to_entities(&tokens, &labels);
1281        assert_eq!(ents[0].entity_type, "tool");
1282    }
1283
1284    #[test]
1285    fn iob_mapeia_loc_para_location_v1025() {
1286        // v1.0.25 V008: B-LOC maps to "location" not "concept".
1287        let tokens = vec!["Brasil".to_string()];
1288        let labels = vec!["B-LOC".to_string()];
1289        let ents = iob_to_entities(&tokens, &labels);
1290        assert_eq!(ents[0].entity_type, "location");
1291    }
1292
1293    #[test]
1294    fn build_relationships_respeitam_max_rels() {
1295        let entities: Vec<NewEntity> = (0..20)
1296            .map(|i| NewEntity {
1297                name: format!("entidade_{i}"),
1298                entity_type: "concept".to_string(),
1299                description: None,
1300            })
1301            .collect();
1302        let (rels, truncated) = build_relationships(&entities);
1303        let max_rels = crate::constants::max_relationships_per_memory();
1304        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1305        if rels.len() == max_rels {
1306            assert!(truncated, "truncated deve ser true quando atingiu o cap");
1307        }
1308    }
1309
1310    #[test]
1311    fn build_relationships_sem_duplicatas() {
1312        let entities: Vec<NewEntity> = (0..5)
1313            .map(|i| NewEntity {
1314                name: format!("ent_{i}"),
1315                entity_type: "concept".to_string(),
1316                description: None,
1317            })
1318            .collect();
1319        let (rels, _truncated) = build_relationships(&entities);
1320        let mut pares: std::collections::HashSet<(String, String)> =
1321            std::collections::HashSet::new();
1322        for r in &rels {
1323            let par = (r.source.clone(), r.target.clone());
1324            assert!(pares.insert(par), "par duplicado encontrado");
1325        }
1326    }
1327
1328    #[test]
1329    fn merge_deduplica_por_nome_lowercase() {
1330        // v1.0.25: collision detection is scoped per entity_type; same name + same type
1331        // must deduplicate to one entry. Different types are kept separately.
1332        let a = vec![ExtractedEntity {
1333            name: "Rust".to_string(),
1334            entity_type: "concept".to_string(),
1335        }];
1336        let b = vec![ExtractedEntity {
1337            name: "rust".to_string(),
1338            entity_type: "concept".to_string(),
1339        }];
1340        let merged = merge_and_deduplicate(a, b);
1341        assert_eq!(
1342            merged.len(),
1343            1,
1344            "rust e Rust com mesmo tipo são a mesma entidade"
1345        );
1346    }
1347
1348    #[test]
1349    fn regex_extractor_implementa_trait() {
1350        let extractor = RegexExtractor;
1351        let result = extractor
1352            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1353            .unwrap();
1354        assert!(!result.entities.is_empty());
1355    }
1356
1357    #[test]
1358    fn extract_retorna_ok_sem_modelo() {
1359        // Sem modelo baixado, deve retornar Ok com apenas entidades regex
1360        let paths = make_paths();
1361        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1362        let result = extract_graph_auto(body, &paths).unwrap();
1363        assert!(result
1364            .entities
1365            .iter()
1366            .any(|e| e.name.contains("teste@exemplo.com")));
1367    }
1368
1369    #[test]
1370    fn stopwords_filter_v1024_terms() {
1371        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
1372        // by apply_regex_prefilter so they do not appear as entities.
1373        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1374                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1375        let ents = apply_regex_prefilter(body);
1376        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1377        for word in &[
1378            "ACEITE",
1379            "ACK",
1380            "ACL",
1381            "BORDA",
1382            "CHECKLIST",
1383            "COMPLETED",
1384            "CONFIRME",
1385            "DEVEMOS",
1386            "DONE",
1387            "FIXED",
1388            "NEGUE",
1389            "PENDING",
1390            "PLAN",
1391            "PODEMOS",
1392            "RECUSE",
1393            "TOKEN",
1394            "VAMOS",
1395        ] {
1396            assert!(
1397                !names.contains(word),
1398                "v1.0.24 stopword {word} should be filtered but was found in entities"
1399            );
1400        }
1401    }
1402
1403    #[test]
1404    fn dedup_normalizes_unicode_combining_marks() {
1405        // v1.0.24 P1-E: "Café" (NFC precomposed) and "Cafe\u{301}" (NFD with
1406        // combining acute accent) must deduplicate to a single entity after NFKC
1407        // normalization.
1408        let nfc = vec![ExtractedEntity {
1409            name: "Café".to_string(),
1410            entity_type: "concept".to_string(),
1411        }];
1412        // Build the NFD form: 'e' followed by combining acute accent U+0301
1413        let nfd_name = "Cafe\u{301}".to_string();
1414        let nfd = vec![ExtractedEntity {
1415            name: nfd_name,
1416            entity_type: "concept".to_string(),
1417        }];
1418        let merged = merge_and_deduplicate(nfc, nfd);
1419        assert_eq!(
1420            merged.len(),
1421            1,
1422            "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1423        );
1424    }
1425
1426    // ── predict_batch regression tests ──────────────────────────────────────
1427
    #[test]
    fn predict_batch_output_count_matches_input() {
        // Verify that predict_batch returns exactly one Vec<String> per window
        // without requiring a real model.  We test the shape contract by
        // constructing the padding logic manually and asserting counts.
        //
        // Two windows of different lengths: 3 tokens and 5 tokens.
        let w1_ids: Vec<u32> = vec![101, 100, 102];
        let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
        let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
        let w2_tok: Vec<String> = vec![
            "[CLS]".into(),
            "world".into(),
            "foo".into(),
            "bar".into(),
            "[SEP]".into(),
        ];
        let windows: Vec<(Vec<u32>, Vec<String>)> =
            vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

        // Verify padding logic and output length contracts using tensor operations
        // that do NOT require BertModel::forward.
        let device = Device::Cpu;
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
        assert_eq!(max_len, 5, "max_len deve ser 5");

        // Right-pad every window with zeros up to max_len, mirroring the batching
        // step performed before Tensor::stack.
        let mut padded_ids: Vec<Tensor> = Vec::new();
        for (ids, _) in &windows {
            let len = ids.len();
            let pad_right = max_len - len;
            // NOTE(review): ids are widened to i64 here — presumably matching the
            // index dtype predict_batch feeds the model; confirm against that code.
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
            let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
            assert_eq!(
                t.dims(),
                &[max_len],
                "cada janela deve ter shape (max_len,) após padding"
            );
            padded_ids.push(t);
        }

        let stacked = Tensor::stack(&padded_ids, 0).unwrap();
        assert_eq!(
            stacked.dims(),
            &[2, max_len],
            "stack deve produzir (batch_size=2, max_len=5)"
        );

        // Verify narrow preserves only real tokens for each window
        // (simulates what predict_batch does after classifier.forward)
        let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9]; // batch×seq×num_labels=9
        let fake_logits =
            Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
        for (i, (ids, _)) in windows.iter().enumerate() {
            let real_len = ids.len();
            let example = fake_logits.get(i).unwrap();
            let sliced = example.narrow(0, 0, real_len).unwrap();
            assert_eq!(
                sliced.dims(),
                &[real_len, 9],
                "narrow deve preservar apenas {real_len} tokens reais"
            );
        }
    }
1492
1493    #[test]
1494    fn predict_batch_empty_windows_returns_empty() {
1495        // predict_batch with no windows must return an empty Vec, not panic.
1496        // We test the guard logic directly on the batch size/max_len path.
1497        let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
1498        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
1499        assert_eq!(max_len, 0, "zero windows → max_len 0");
1500        // The real predict_batch returns Ok(vec![]) when max_len == 0.
1501        // We assert the expected output shape by reproducing the guard here.
1502        let result: Vec<Vec<String>> = if max_len == 0 {
1503            Vec::new()
1504        } else {
1505            unreachable!()
1506        };
1507        assert!(result.is_empty());
1508    }
1509
    #[test]
    fn ner_batch_size_default_is_8() {
        // Verify that ner_batch_size() returns the documented default when the
        // env var is absent.  We clear the var to avoid cross-test contamination.
        // NOTE(review): Rust runs tests in parallel by default, so another test
        // mutating GRAPHRAG_NER_BATCH_SIZE concurrently (see the clamping test
        // below) can race with this one — consider serialising both behind a
        // shared lock or running with --test-threads=1.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }
1517
    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Override via env var; values outside [1, 32] must be clamped.
        // NOTE(review): this mutates a process-wide env var while tests run in
        // parallel, so it can race with `ner_batch_size_default_is_8`; consider a
        // shared lock or --test-threads=1 to serialise the two.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        // Restore the unset state so later tests see the default again.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }
1536
1537    #[test]
1538    fn extraction_method_regex_only_unchanged() {
1539        // RegexExtractor always returns "regex-only" regardless of NER_MODEL OnceLock state.
1540        // This guards against accidentally changing the regex-only fallback string.
1541        let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
1542        assert_eq!(
1543            result.extraction_method, "regex-only",
1544            "RegexExtractor deve retornar regex-only"
1545        );
1546    }
1547
1548    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
1549
1550    #[test]
1551    fn extend_suffix_pure_numeric_unchanged() {
1552        // Existing behaviour: pure-numeric suffix must still work after P2-E.
1553        let ents = vec![ExtractedEntity {
1554            name: "GPT".to_string(),
1555            entity_type: "concept".to_string(),
1556        }];
1557        let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
1558        assert_eq!(
1559            result[0].name, "GPT-5",
1560            "sufixo puramente numérico deve ser estendido"
1561        );
1562    }
1563
1564    #[test]
1565    fn extend_suffix_alphanumeric_letter_after_digit() {
1566        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
1567        let ents = vec![ExtractedEntity {
1568            name: "GPT".to_string(),
1569            entity_type: "concept".to_string(),
1570        }];
1571        let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
1572        assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
1573    }
1574
1575    #[test]
1576    fn extend_suffix_alphanumeric_b_suffix() {
1577        // P2-E: "5b" suffix (digit + 'b') must be captured.
1578        let ents = vec![ExtractedEntity {
1579            name: "Llama".to_string(),
1580            entity_type: "concept".to_string(),
1581        }];
1582        let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
1583        assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
1584    }
1585
1586    #[test]
1587    fn extend_suffix_alphanumeric_x_suffix() {
1588        // P2-E: "8x" suffix (digit + 'x') must be captured.
1589        let ents = vec![ExtractedEntity {
1590            name: "Mistral".to_string(),
1591            entity_type: "concept".to_string(),
1592        }];
1593        let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
1594        assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
1595    }
1596
1597    // --- P2-D: augment_versioned_model_names extended regex ---
1598
1599    #[test]
1600    fn augment_versioned_gpt4o() {
1601        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1602        let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
1603        assert!(
1604            result.iter().any(|e| e.name == "GPT-4o"),
1605            "GPT-4o deve ser capturado pelo augment, achados: {:?}",
1606            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1607        );
1608    }
1609
1610    #[test]
1611    fn augment_versioned_claude_4_sonnet() {
1612        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1613        let result =
1614            augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
1615        assert!(
1616            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1617            "Claude 4 Sonnet deve ser capturado, achados: {:?}",
1618            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1619        );
1620    }
1621
1622    #[test]
1623    fn augment_versioned_llama_3_pro() {
1624        // P2-D: "Llama 3 Pro" must be captured with release tier.
1625        let result =
1626            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1627        assert!(
1628            result.iter().any(|e| e.name == "Llama 3 Pro"),
1629            "Llama 3 Pro deve ser capturado, achados: {:?}",
1630            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1631        );
1632    }
1633
1634    #[test]
1635    fn augment_versioned_mixtral_8x7b() {
1636        // P2-D: "Mixtral 8x7B" composite version must be captured.
1637        let result =
1638            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1639        assert!(
1640            result.iter().any(|e| e.name == "Mixtral 8x7B"),
1641            "Mixtral 8x7B deve ser capturado, achados: {:?}",
1642            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1643        );
1644    }
1645
1646    #[test]
1647    fn augment_versioned_does_not_duplicate_existing() {
1648        // P2-D back-compat: entities already present must not be duplicated.
1649        let existing = vec![ExtractedEntity {
1650            name: "Claude 4".to_string(),
1651            entity_type: "concept".to_string(),
1652        }];
1653        let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
1654        let count = result.iter().filter(|e| e.name == "Claude 4").count();
1655        assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
1656    }
1657
1658    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1659
1660    #[test]
1661    fn stopwords_filter_url_jwt_api_v1025() {
1662        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1663        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1664        let ents = apply_regex_prefilter(body);
1665        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1666        for blocked in &[
1667            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1668        ] {
1669            assert!(
1670                !names.contains(blocked),
1671                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1672            );
1673        }
1674    }
1675
1676    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1677
1678    #[test]
1679    fn section_markers_etapa_fase_filtered_v1025() {
1680        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1681        let body = "Etapa 3 do plano: implementar Fase 1 da Migração.";
1682        let ents = apply_regex_prefilter(body);
1683        assert!(
1684            !ents
1685                .iter()
1686                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1687            "section markers must be stripped; entities: {:?}",
1688            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1689        );
1690    }
1691
1692    #[test]
1693    fn section_markers_passo_secao_filtered_v1025() {
1694        let body = "Siga Passo 2 conforme Seção 3 do manual.";
1695        let ents = apply_regex_prefilter(body);
1696        assert!(
1697            !ents
1698                .iter()
1699                .any(|e| e.name.contains("Passo") || e.name.contains("Seção")),
1700            "Passo/Seção section markers must be stripped; entities: {:?}",
1701            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1702        );
1703    }
1704
1705    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1706
1707    #[test]
1708    fn brand_camelcase_extracted_as_organization_v1025() {
1709        // "OpenAI" is a CamelCase brand that BERT NER often misses.
1710        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1711        let ents = apply_regex_prefilter(body);
1712        let openai = ents.iter().find(|e| e.name == "OpenAI");
1713        assert!(
1714            openai.is_some(),
1715            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1716            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1717        );
1718        assert_eq!(
1719            openai.unwrap().entity_type,
1720            "organization",
1721            "brand CamelCase must map to organization (V008)"
1722        );
1723    }
1724
1725    #[test]
1726    fn brand_postgresql_extracted_as_organization_v1025() {
1727        let body = "migrating from MySQL to PostgreSQL for better performance.";
1728        let ents = apply_regex_prefilter(body);
1729        assert!(
1730            ents.iter()
1731                .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization"),
1732            "PostgreSQL must be extracted as organization; entities: {:?}",
1733            ents.iter()
1734                .map(|e| (&e.name, &e.entity_type))
1735                .collect::<Vec<_>>()
1736        );
1737    }
1738
1739    // ── v1.0.25 V008 alignment ──
1740
1741    #[test]
1742    fn iob_org_maps_to_organization_not_project_v1025() {
1743        // B-ORG without tool keywords must emit "organization" (V008), not "project".
1744        let tokens = vec!["Microsoft".to_string()];
1745        let labels = vec!["B-ORG".to_string()];
1746        let ents = iob_to_entities(&tokens, &labels);
1747        assert_eq!(
1748            ents[0].entity_type, "organization",
1749            "B-ORG must map to organization (V008); got {}",
1750            ents[0].entity_type
1751        );
1752    }
1753
1754    #[test]
1755    fn iob_loc_maps_to_location_not_concept_v1025() {
1756        // B-LOC must emit "location" (V008), not "concept".
1757        let tokens = vec!["São".to_string(), "Paulo".to_string()];
1758        let labels = vec!["B-LOC".to_string(), "I-LOC".to_string()];
1759        let ents = iob_to_entities(&tokens, &labels);
1760        assert_eq!(
1761            ents[0].entity_type, "location",
1762            "B-LOC must map to location (V008); got {}",
1763            ents[0].entity_type
1764        );
1765    }
1766
1767    #[test]
1768    fn iob_date_maps_to_date_not_discarded_v1025() {
1769        // B-DATE must emit "date" (V008) instead of being discarded.
1770        let tokens = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
1771        let labels = vec![
1772            "B-DATE".to_string(),
1773            "I-DATE".to_string(),
1774            "I-DATE".to_string(),
1775        ];
1776        let ents = iob_to_entities(&tokens, &labels);
1777        assert_eq!(
1778            ents.len(),
1779            1,
1780            "DATE entity must be emitted (V008); entities: {ents:?}"
1781        );
1782        assert_eq!(ents[0].entity_type, "date");
1783    }
1784
1785    // ── v1.0.25 P0-2: PT verb false-positive filter ──
1786
1787    #[test]
1788    fn pt_verb_le_filtered_as_per_v1025() {
1789        // "Lê" is a PT monosyllabic verb; when tagged B-PER it must be dropped.
1790        let tokens = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
1791        let labels = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
1792        let ents = iob_to_entities(&tokens, &labels);
1793        assert!(
1794            !ents
1795                .iter()
1796                .any(|e| e.name == "Lê" && e.entity_type == "person"),
1797            "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
1798        );
1799    }
1800
1801    #[test]
1802    fn pt_verb_ver_filtered_as_per_v1025() {
1803        // "Ver" is a PT verb that BERT sometimes tags B-PER; must be filtered.
1804        let tokens = vec!["Ver".to_string()];
1805        let labels = vec!["B-PER".to_string()];
1806        let ents = iob_to_entities(&tokens, &labels);
1807        assert!(
1808            ents.is_empty(),
1809            "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
1810        );
1811    }
1812
1813    // --- P0-3 longest-wins v1.0.25 ---
1814
1815    fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1816        ExtractedEntity {
1817            name: name.to_string(),
1818            entity_type: entity_type.to_string(),
1819        }
1820    }
1821
1822    #[test]
1823    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1824        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1825        let regex = vec![entity("Sonne", "concept")];
1826        let ner = vec![entity("Sonnet", "concept")];
1827        let result = merge_and_deduplicate(regex, ner);
1828        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1829        assert_eq!(result[0].name, "Sonnet");
1830    }
1831
1832    #[test]
1833    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1834        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1835        let regex = vec![
1836            entity("Open", "organization"),
1837            entity("OpenAI", "organization"),
1838        ];
1839        let result = merge_and_deduplicate(regex, vec![]);
1840        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1841        assert_eq!(result[0].name, "OpenAI");
1842    }
1843
1844    #[test]
1845    fn merge_keeps_both_when_no_containment_v1025() {
1846        // "Alice" and "Bob" share no containment — both must be preserved.
1847        let regex = vec![entity("Alice", "person"), entity("Bob", "person")];
1848        let result = merge_and_deduplicate(regex, vec![]);
1849        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1850    }
1851
1852    #[test]
1853    fn merge_respects_entity_type_boundary_v1025() {
1854        // Same name "Apple" but different types: both must survive independently.
1855        let regex = vec![entity("Apple", "organization"), entity("Apple", "concept")];
1856        let result = merge_and_deduplicate(regex, vec![]);
1857        assert_eq!(
1858            result.len(),
1859            2,
1860            "expected 2 entities (different types), got: {result:?}"
1861        );
1862    }
1863
1864    #[test]
1865    fn merge_case_insensitive_dedup_v1025() {
1866        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1867        let regex = vec![
1868            entity("OpenAI", "organization"),
1869            entity("openai", "organization"),
1870        ];
1871        let result = merge_and_deduplicate(regex, vec![]);
1872        assert_eq!(
1873            result.len(),
1874            1,
1875            "expected 1 entity after case-insensitive dedup, got: {result:?}"
1876        );
1877    }
1878
1879    // ── v1.0.25 P0-4: section markers must be filtered in iob_to_entities too ──
1880
1881    #[test]
1882    fn iob_section_marker_etapa_filtered_v1025() {
1883        // BERT may tag "Etapa" (B-MISC) + "3" (I-MISC) as a span; flush must drop it.
1884        let tokens = vec!["Etapa".to_string(), "3".to_string()];
1885        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1886        let ents = iob_to_entities(&tokens, &labels);
1887        assert!(
1888            !ents.iter().any(|e| e.name.contains("Etapa")),
1889            "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
1890        );
1891    }
1892
1893    #[test]
1894    fn iob_section_marker_fase_filtered_v1025() {
1895        // BERT may tag "Fase" (B-MISC) + "1" (I-MISC) as a span; flush must drop it.
1896        let tokens = vec!["Fase".to_string(), "1".to_string()];
1897        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1898        let ents = iob_to_entities(&tokens, &labels);
1899        assert!(
1900            !ents.iter().any(|e| e.name.contains("Fase")),
1901            "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
1902        );
1903    }
1904}