//! extraction.rs — entity, relationship and URL extraction for sqlite_graphrag.
1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11use unicode_normalization::UnicodeNormalization;
12
13use crate::paths::AppPaths;
14use crate::storage::entities::{NewEntity, NewRelationship};
15
/// HF Hub repo id of the multilingual BERT NER checkpoint used by the extractor.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Maximum tokens per inference window (BERT positional limit).
const MAX_SEQ_LEN: usize = 512;
/// Sliding-window step; half of MAX_SEQ_LEN, so consecutive windows overlap 50%
/// and entities spanning a window boundary are seen whole by at least one window.
const STRIDE: usize = 256;
/// Cap on the number of entities considered when building pairwise relationships.
const MAX_ENTS: usize = 30;
/// Maximum relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label for co-occurrence edges.
const DEFAULT_RELATION: &str = "mentions";
/// Entity names shorter than this (in bytes) are discarded.
const MIN_ENTITY_CHARS: usize = 2;
23
// Lazily-compiled regexes, built once per process via the accessor fns below.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-4: filters section-structure markers like "Etapa 3", "Fase 1", "Passo 2".
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
// v1.0.25 P0-2: captures CamelCase brand names that BERT NER often misses (e.g. "OpenAI", "PostgreSQL").
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
32
// v1.0.20: stopwords filtering common PT-BR/EN rule words captured as ALL_CAPS.
// Without this filter, PT-BR technical corpora containing CAPS-formatted rules
// (NUNCA, PROIBIDO, DEVE) produced ~70% junk "entities". Identifiers such as
// MAX_RETRY (with underscore) are intentionally preserved.
// v1.0.22: list expanded with terms observed in a 495-file flowaiper stress test.
// Includes verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns
// (BANCO, CASO) and generic data formats (JSON, XML); HTTP methods live in the
// dedicated HTTP_METHODS table below.
// v1.0.24: added 17 terms observed in the v1.0.23 audit: generic status words
// (COMPLETED, DONE, FIXED, PENDING), PT-BR imperative verbs (ACEITE, CONFIRME,
// NEGUE, RECUSE), PT-BR modal/common verbs (DEVEMOS, PODEMOS, VAMOS), generic
// nouns (BORDA, CHECKLIST, PLAN, TOKEN) and common abbreviations (ACK, ACL).
// v1.0.25 P0-4: added technology/protocol acronyms (API, CLI, HTTP, HTTPS, JWT,
// LLM, REST, UI, URL) and PT-BR section-label stems (CAPÍTULO, ETAPA, FASE,
// PASSO, SEÇÃO) so section markers and generic tech terms are not extracted as
// entities.
// Kept in alphabetical order for easy diffing (review fix: BLOQUEAR was filed
// after BORDA).
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE", "ACK", "ACL", "ACRESCENTADO", "ADICIONAR", "AGENTS", "ALL", "ALTA",
    "ALWAYS", "API", "ARTEFATOS", "ATIVO", "BAIXA", "BANCO", "BLOQUEAR", "BORDA",
    "BUG", "CAPÍTULO", "CASO", "CHECKLIST", "CLI", "COMPLETED", "CONFIRMADO",
    "CONFIRME", "CONTRATO", "CRÍTICO", "CRITICAL", "CSV", "DEVE", "DEVEMOS",
    "DISCO", "DONE", "EFEITO", "ENTRADA", "ERROR", "ESSA", "ESSE", "ESSENCIAL",
    "ESTA", "ESTE", "ETAPA", "EVITAR", "EXPANDIR", "EXPOR", "FALHA", "FASE",
    "FIXED", "FIXME", "FORBIDDEN", "HACK", "HEARTBEAT", "HTTP", "HTTPS",
    "INATIVO", "JAMAIS", "JSON", "JWT", "LLM", "MUST", "NEGUE", "NEVER", "NOTE",
    "NUNCA", "OBRIGATÓRIO", "PADRÃO", "PASSO", "PENDING", "PLAN", "PODEMOS",
    "PROIBIDO", "RECUSE", "REGRAS", "REQUIRED", "REQUISITO", "REST", "SEÇÃO",
    "SEMPRE", "SHALL", "SHOULD", "SOUL", "TODAS", "TODO", "TODOS", "TOKEN",
    "TOOLS", "TSV", "UI", "URL", "USAR", "VALIDAR", "VAMOS", "VOCÊ", "WARNING",
    "XML", "YAML",
];

// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// Filtered in apply_regex_prefilter (regex_all_caps) and iob_to_entities
// (single-token post-filter).
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];

/// Returns true when an ALL-CAPS token should be dropped from extraction results.
///
/// Underscored identifiers (e.g. MAX_RETRY, FLOWAIPER_API_KEY) are always kept;
/// everything else is checked against the stopword and HTTP-method tables.
fn is_filtered_all_caps(token: &str) -> bool {
    if token.contains('_') {
        return false;
    }
    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
}
158
159fn regex_email() -> &'static Regex {
160    REGEX_EMAIL
161        .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
162}
163
164fn regex_url() -> &'static Regex {
165    REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
166}
167
168fn regex_uuid() -> &'static Regex {
169    REGEX_UUID.get_or_init(|| {
170        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
171            .unwrap()
172    })
173}
174
175fn regex_all_caps() -> &'static Regex {
176    REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
177}
178
179fn regex_section_marker() -> &'static Regex {
180    REGEX_SECTION_MARKER.get_or_init(|| {
181        // Matches PT-BR document-structure labels followed by a number: "Etapa 3", "Fase 1", etc.
182        Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
183    })
184}
185
186fn regex_brand_camel() -> &'static Regex {
187    REGEX_BRAND_CAMEL.get_or_init(|| {
188        // Matches CamelCase brand names: one or more lowercase letters after an uppercase, then
189        // another uppercase followed by more letters. Covers "OpenAI", "PostgreSQL", "ChatGPT".
190        Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
191    })
192}
193
/// Candidate graph entity produced by the regex prefilter or the BERT NER path.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Surface form as found in the text (trimmed).
    pub name: String,
    /// Schema type, e.g. "concept", "person", "organization", "location",
    /// "date", "tool".
    pub entity_type: String,
}
199
/// URL extracted from the memory body, together with its source offset.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    pub url: String,
    /// Byte position in the body where the URL was found.
    pub offset: usize,
}
207
/// Aggregate output of a single extraction run over one memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    /// True when build_relationships hit the cap before covering all entity pairs.
    /// Exposed in RememberResponse so callers can detect when relationships were cut.
    pub relationships_truncated: bool,
    /// Extraction method used: "bert+regex" or "regex-only".
    /// Useful for auditing, metrics and user-facing reports.
    pub extraction_method: String,
    /// URLs extracted from the body — stored separately from the graph entities.
    pub urls: Vec<ExtractedUrl>,
}
221
/// Extraction strategy: turns a memory body into entities, relationships and URLs.
/// `Send + Sync` so implementations can be shared across threads.
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
225
/// Minimal slice of the HF `config.json` needed to wire the classification head.
#[derive(Deserialize)]
struct ModelConfig {
    // Label map keyed by stringified class index; defaults to empty when the
    // field is absent from config.json.
    #[serde(default)]
    id2label: HashMap<String, String>,
    hidden_size: usize,
}
232
/// BERT encoder plus token-classification head for NER inference (CPU device).
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    device: Device,
    // Maps classifier output index -> IOB label string (e.g. "B-PER").
    id2label: HashMap<usize, String>,
}
239
240impl BertNerModel {
241    fn load(model_dir: &Path) -> Result<Self> {
242        let config_path = model_dir.join("config.json");
243        let weights_path = model_dir.join("model.safetensors");
244
245        let config_str = std::fs::read_to_string(&config_path)
246            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
247        let model_cfg: ModelConfig =
248            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
249
250        let id2label: HashMap<usize, String> = model_cfg
251            .id2label
252            .into_iter()
253            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
254            .collect();
255
256        let num_labels = id2label.len().max(9);
257        let hidden_size = model_cfg.hidden_size;
258
259        let bert_config_str = std::fs::read_to_string(&config_path)
260            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
261        let bert_cfg: BertConfig =
262            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
263
264        let device = Device::Cpu;
265
266        let vb = unsafe {
267            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
268                .with_context(|| format!("mapeando {weights_path:?}"))?
269        };
270        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
271
272        // v1.0.20 fix P0 secundário: carregar classifier head do safetensors em vez de zeros.
273        // Em v1.0.19 usávamos Tensor::zeros, o que produzia argmax constante e inferência degenerada.
274        let cls_vb = vb.pp("classifier");
275        let weight = cls_vb
276            .get((num_labels, hidden_size), "weight")
277            .context("carregando classifier.weight do safetensors")?;
278        let bias = cls_vb
279            .get(num_labels, "bias")
280            .context("carregando classifier.bias do safetensors")?;
281        let classifier = Linear::new(weight, Some(bias));
282
283        Ok(Self {
284            bert,
285            classifier,
286            device,
287            id2label,
288        })
289    }
290
291    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
292        let len = token_ids.len();
293        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
294        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
295
296        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
297            .context("criando tensor input_ids")?;
298        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
299            .context("criando tensor token_type_ids")?;
300        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
301            .context("criando tensor attention_mask")?;
302
303        let sequence_output = self
304            .bert
305            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
306            .context("forward pass do BertModel")?;
307
308        let logits = self
309            .classifier
310            .forward(&sequence_output)
311            .context("forward pass do classificador")?;
312
313        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
314
315        let num_tokens = logits_2d.dim(0).context("dim(0)")?;
316
317        let mut labels = Vec::with_capacity(num_tokens);
318        for i in 0..num_tokens {
319            let token_logits = logits_2d.get(i).context("get token logits")?;
320            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
321            let argmax = vec
322                .iter()
323                .enumerate()
324                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
325                .map(|(idx, _)| idx)
326                .unwrap_or(0);
327            let label = self
328                .id2label
329                .get(&argmax)
330                .cloned()
331                .unwrap_or_else(|| "O".to_string());
332            labels.push(label);
333        }
334
335        Ok(labels)
336    }
337
338    /// Run a batched forward pass over multiple tokenised windows at once.
339    ///
340    /// Windows are padded on the right with token_id=0 and attention_mask=0 to
341    /// the length of the longest window in the batch.  The attention mask ensures
342    /// BERT ignores padded positions (bert.rs:515-528 adds -3.4e38 before softmax).
343    ///
344    /// Returns one label vector per window, each of length equal to that window's
345    /// original (pre-padding) token count.
346    fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
347        let batch_size = windows.len();
348        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
349        if max_len == 0 {
350            return Ok(vec![vec![]; batch_size]);
351        }
352
353        let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
354        let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);
355
356        for (ids, _) in windows {
357            let len = ids.len();
358            let pad_right = max_len - len;
359
360            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
361            // Build 1-D token tensor then pad to max_len
362            let t = Tensor::from_vec(ids_i64, len, &self.device)
363                .context("criando tensor de ids para batch")?;
364            let t = t
365                .pad_with_zeros(0, 0, pad_right)
366                .context("padding tensor de ids")?;
367            padded_ids.push(t);
368
369            // Attention mask: 1 for real tokens, 0 for padding
370            let mut mask_i64 = vec![1i64; len];
371            mask_i64.extend(vec![0i64; pad_right]);
372            let m = Tensor::from_vec(mask_i64, max_len, &self.device)
373                .context("criando tensor de máscara para batch")?;
374            padded_masks.push(m);
375        }
376
377        // Stack 1-D tensors into (batch_size, max_len)
378        let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
379        let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
380        let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
381            .context("criando token_type_ids batch")?;
382
383        // Single forward pass for the entire batch
384        let sequence_output = self
385            .bert
386            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
387            .context("forward pass batch BertModel")?;
388        // sequence_output: (batch_size, max_len, hidden_size)
389
390        let logits = self
391            .classifier
392            .forward(&sequence_output)
393            .context("forward pass batch classificador")?;
394        // logits: (batch_size, max_len, num_labels)
395
396        let mut results = Vec::with_capacity(batch_size);
397        for (i, (window_ids, _)) in windows.iter().enumerate() {
398            let example_logits = logits.get(i).context("get logits exemplo")?;
399            // (max_len, num_labels) — slice only real tokens, discard padding
400            let real_len = window_ids.len();
401            let example_slice = example_logits
402                .narrow(0, 0, real_len)
403                .context("narrow para tokens reais")?;
404            let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;
405
406            let labels: Vec<String> = logits_2d
407                .iter()
408                .map(|token_logits| {
409                    let argmax = token_logits
410                        .iter()
411                        .enumerate()
412                        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
413                        .map(|(idx, _)| idx)
414                        .unwrap_or(0);
415                    self.id2label
416                        .get(&argmax)
417                        .cloned()
418                        .unwrap_or_else(|| "O".to_string())
419                })
420                .collect();
421
422            results.push(labels);
423        }
424
425        Ok(results)
426    }
427}
428
// Process-wide NER model cache. A failed load is memoised as `None`, so the
// expensive load/download is attempted at most once per process.
static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
430
431fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
432    NER_MODEL
433        .get_or_init(|| match load_model(paths) {
434            Ok(m) => Some(m),
435            Err(e) => {
436                tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
437                None
438            }
439        })
440        .as_ref()
441}
442
443fn model_dir(paths: &AppPaths) -> PathBuf {
444    paths.models.join("bert-multilingual-ner")
445}
446
447fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
448    let dir = model_dir(paths);
449    std::fs::create_dir_all(&dir)
450        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
451
452    let weights = dir.join("model.safetensors");
453    let config = dir.join("config.json");
454    let tokenizer = dir.join("tokenizer.json");
455
456    if weights.exists() && config.exists() && tokenizer.exists() {
457        return Ok(dir);
458    }
459
460    tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
461    crate::output::emit_progress_i18n(
462        "Downloading NER model (first run, ~676 MB)...",
463        "Baixando modelo NER (primeira execução, ~676 MB)...",
464    );
465
466    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
467    let repo = api.model(MODEL_ID.to_string());
468
469    // v1.0.20 fix P0 primário: tokenizer.json no repo Davlan está apenas em onnx/tokenizer.json.
470    // Em v1.0.19 buscávamos da raiz e recebíamos 404, caindo em graceful degradation 100% das vezes.
471    // Mapeamos (remote_path, local_filename) para baixar do subfolder mantendo nome plano local.
472    for (remote, local) in &[
473        ("model.safetensors", "model.safetensors"),
474        ("config.json", "config.json"),
475        ("onnx/tokenizer.json", "tokenizer.json"),
476        ("tokenizer_config.json", "tokenizer_config.json"),
477    ] {
478        let dest = dir.join(local);
479        if !dest.exists() {
480            let src = repo
481                .get(remote)
482                .with_context(|| format!("baixando {remote} do HF Hub"))?;
483            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
484        }
485    }
486
487    Ok(dir)
488}
489
490fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
491    let dir = ensure_model_files(paths)?;
492    BertNerModel::load(&dir)
493}
494
495fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
496    let mut entities = Vec::new();
497    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
498
499    let add = |entities: &mut Vec<ExtractedEntity>,
500               seen: &mut std::collections::HashSet<String>,
501               name: &str,
502               entity_type: &str| {
503        let name = name.trim().to_string();
504        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
505            entities.push(ExtractedEntity {
506                name,
507                entity_type: entity_type.to_string(),
508            });
509        }
510    };
511
512    // v1.0.25 P0-4: strip section-structure markers before any other processing so that
513    // "Etapa 3", "Fase 1", "Passo 2" are not fed to downstream regex passes.
514    let cleaned = regex_section_marker().replace_all(body, " ");
515    let cleaned = cleaned.as_ref();
516
517    for m in regex_email().find_iter(cleaned) {
518        // v1.0.20: email é "concept" (regex sozinho não distingue pessoa de mailing list/role).
519        add(&mut entities, &mut seen, m.as_str(), "concept");
520    }
521    for m in regex_uuid().find_iter(cleaned) {
522        add(&mut entities, &mut seen, m.as_str(), "concept");
523    }
524    for m in regex_all_caps().find_iter(cleaned) {
525        let candidate = m.as_str();
526        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
527        if !is_filtered_all_caps(candidate) {
528            add(&mut entities, &mut seen, candidate, "concept");
529        }
530    }
531    // v1.0.25 P0-2: capture CamelCase brand names that BERT NER often misses.
532    // Maps to "organization" (V008 schema) because brand names are typically organisations.
533    for m in regex_brand_camel().find_iter(cleaned) {
534        let name = m.as_str();
535        // Skip if the uppercased form is a known stopword (e.g. "JsonSchema" → "JSONSCHEMA").
536        if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
537            add(&mut entities, &mut seen, name, "organization");
538        }
539    }
540
541    entities
542}
543
544/// Extrai URLs do corpo de uma memória, desduplicadas por texto.
545/// URLs são armazenadas na tabela `memory_urls` separadamente do grafo de entidades.
546/// v1.0.24: split do bloco URL que poluía apply_regex_prefilter com entity_type='concept'.
547pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
548    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
549    let mut result = Vec::new();
550    for m in regex_url().find_iter(body) {
551        let raw = m.as_str();
552        let cleaned = raw
553            .trim_end_matches('`')
554            .trim_end_matches(',')
555            .trim_end_matches('.')
556            .trim_end_matches(';')
557            .trim_end_matches(')')
558            .trim_end_matches(']')
559            .trim_end_matches('}');
560        if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
561            result.push(ExtractedUrl {
562                url: cleaned.to_string(),
563                offset: m.start(),
564            });
565        }
566    }
567    result
568}
569
/// Converts parallel token/IOB-label streams into entities.
///
/// Tokens are WordPiece pieces ("##"-prefixed pieces are glued onto the
/// previous part); labels use the B-/I-/O scheme. BERT label types map to
/// schema entity types: DATE -> "date", PER -> "person", LOC -> "location",
/// ORG -> "tool" when the token looks library-ish, else "organization";
/// any other type string is passed through unchanged.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Finish the in-progress entity (if any): join its parts, apply the
    // post-filters below, and reset the accumulator state.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // v1.0.22: drop single-token entities that are ALL-CAPS stopwords
                // or HTTP methods. BERT NER tags some of these as B-MISC/B-ORG;
                // this post-filter keeps generic verbs/protocols out of the graph.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                // v1.0.25 P0-4: BERT may independently label section-structure tokens (e.g.
                // "Etapa 3", "Fase 1") even though apply_regex_prefilter strips them from the
                // input text before regex extraction. Apply the same guard here to avoid the
                // BERT path re-introducing these markers as graph entities.
                let is_section_marker = regex_section_marker().is_match(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        // "O" (outside) ends any entity in progress.
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Malformed label (neither O, B-, nor I-): treat like O.
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // v1.0.25 P0-2: Portuguese monosyllabic verbs that BERT often misclassifies as person names.
        // Only filtered when confidence is unavailable (no logit gate here); these tokens are
        // structurally unlikely to be real proper names in a technical corpus.
        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
        ];

        let entity_type = match bio_type {
            // v1.0.25 V008: DATE is now a first-class entity type instead of being discarded.
            "DATE" => "date",
            "PER" => {
                // Filter well-known PT monosyllabic verbs misclassified as persons.
                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
                    flush(&mut current_parts, &mut current_type, &mut entities);
                    continue;
                }
                "person"
            }
            "ORG" => {
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    // v1.0.25 V008: "organization" replaces the v1.0.24 default "project".
                    "organization"
                }
            }
            // v1.0.25 V008: "location" replaces "concept" for geographic tokens.
            "LOC" => "location",
            other => other,
        };

        if prefix == "B" {
            if token.starts_with("##") {
                // Confused BERT: a subword piece with a B- prefix signals
                // continuation of the previous entity. Append it to the last
                // part of the current entity; otherwise discard it.
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // Continuation: "##" pieces glue onto the previous part; whole
            // words become a new space-separated part.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Flush the trailing entity when the stream ends mid-entity.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
681
682/// Returns (relationships, truncated) where truncated is true when the cap was hit
683/// before all entity pairs were covered. Exposed in RememberResponse as
684/// `relationships_truncated` so callers can decide whether to increase the cap.
685fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
686    if entities.len() < 2 {
687        return (Vec::new(), false);
688    }
689
690    // v1.0.22: cap configurável via env var (constants::max_relationships_per_memory).
691    // Permite usuários com corpus denso aumentar além do default 50.
692    let max_rels = crate::constants::max_relationships_per_memory();
693    let n = entities.len().min(MAX_ENTS);
694    let mut rels: Vec<NewRelationship> = Vec::new();
695    let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
696
697    let mut hit_cap = false;
698    'outer: for i in 0..n {
699        if rels.len() >= max_rels {
700            hit_cap = true;
701            break;
702        }
703
704        let mut for_entity = 0usize;
705        for j in (i + 1)..n {
706            if for_entity >= TOP_K_RELATIONS {
707                break;
708            }
709            if rels.len() >= max_rels {
710                hit_cap = true;
711                break 'outer;
712            }
713
714            let src = &entities[i].name;
715            let tgt = &entities[j].name;
716            let key = (src.clone(), tgt.clone());
717
718            if seen.contains(&key) {
719                continue;
720            }
721            seen.insert(key);
722
723            rels.push(NewRelationship {
724                source: src.clone(),
725                target: tgt.clone(),
726                relation: DEFAULT_RELATION.to_string(),
727                strength: 0.5,
728                description: None,
729            });
730            for_entity += 1;
731        }
732    }
733
734    // v1.0.20: avisar quando relacionamentos foram truncados antes de cobrir todos os pares possíveis.
735    if hit_cap {
736        tracing::warn!(
737            "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
738            n.saturating_sub(1)
739        );
740    }
741
742    (rels, hit_cap)
743}
744
/// Runs BERT NER over `body` using overlapping sliding windows.
///
/// Tokenises the whole body once, splits the token stream into windows of
/// MAX_SEQ_LEN advancing by STRIDE, then runs batched inference (batch size
/// from constants::ner_batch_size). If a batch fails, each window in the
/// chunk is retried individually via `predict` so one bad window cannot lose
/// the whole chunk. Entities are deduplicated by name across windows (the
/// overlap makes duplicates common).
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // `false` = no special tokens ([CLS]/[SEP]) added by the tokenizer here.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Phase 1: collect all sliding windows before any inference
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Phase 2: sort by window length ascending to minimise intra-batch padding waste
    // (entity order does not matter: results are deduped by name below).
    windows.sort_by_key(|(ids, _)| ids.len());

    // Phase 3: batched inference with fallback to single-window predict on error
    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                // Fallback: process each window individually to preserve entities
                for (ids, tokens) in chunk {
                    // All-ones mask: every token in the window is real (no padding
                    // in the single-window path).
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            tracing::warn!("janela NER fallback também falhou: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
830
831/// v1.0.22 P1: estende entidades com sufixos numéricos hifenizados ou separados por espaço.
832/// Casos: GPT extraído mas body contém "GPT-5" → reescreve para "GPT-5".
833/// Casos: Claude extraído mas body contém "Claude 4" → reescreve para "Claude 4".
834/// Conservador: só estende se sufixo tiver até 7 caracteres.
835/// v1.0.24 P2-E: sufixo aceita letra ASCII minúscula opcional após dígitos para cobrir
836/// modelos como "GPT-4o", "Llama-5b", "Mistral-8x" (dígitos + [a-z]? + [x\d+]?).
837fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
838    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
839    // Matches: separator + digits + optional decimal + optional lowercase letter
840    // Examples: "-4", " 5", "-4o", " 5b", "-8x", " 3.5", "-3.5-turbo" (capped by len)
841    let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
842
843    entities
844        .into_iter()
845        .map(|ent| {
846            // Encontra a primeira ocorrência case-sensitive da entidade no body
847            if let Some(pos) = body.find(&ent.name) {
848                let after_pos = pos + ent.name.len();
849                if after_pos < body.len() {
850                    let after = &body[after_pos..];
851                    if let Some(m) = suffix_re.find(after) {
852                        let suffix = m.as_str();
853                        // Conservative: cap suffix length to 7 chars to avoid grabbing
854                        // long hyphenated phrases while allowing "4o", "5b", "3.5b".
855                        if suffix.len() <= 7 {
856                            let extended = format!("{}{}", ent.name, suffix);
857                            return ExtractedEntity {
858                                name: extended,
859                                entity_type: ent.entity_type,
860                            };
861                        }
862                    }
863                }
864            }
865            ent
866        })
867        .collect()
868}
869
870/// Captures versioned model names that BERT NER consistently misses.
871///
872/// BERT NER often classifies tokens like "Claude" or "Llama" as common nouns,
873/// failing to emit a B-PER/B-ORG tag. As a result, `extend_with_numeric_suffix`
874/// never sees these candidates and the version suffix gets lost.
875///
876/// This function scans the body with a conservative regex, matching capitalised
877/// words followed by a space-or-hyphen and a small integer. Matches that are not
878/// already covered by an existing entity (case-insensitive) are appended with the
879/// `concept` type, mirroring how `extend_with_numeric_suffix` represents these
880/// items downstream.
881///
882/// v1.0.24 P2-D: regex extended to cover:
883/// - Alphanumeric version suffixes: "GPT-4o", "Llama-3b", "Mistral-8x"
884/// - Composite versions: "Mixtral 8x7B" (digit × digit + uppercase letter)
885/// - Named release tiers after version: "Claude 4 Sonnet", "Llama 3 Pro"
886///
887/// Examples covered: "Claude 4", "Llama 3", "GPT-4o", "Claude 4 Sonnet", "Mixtral 8x7B".
888/// Examples already handled upstream and skipped here: plain "Apple" without a suffix.
889fn augment_versioned_model_names(
890    entities: Vec<ExtractedEntity>,
891    body: &str,
892) -> Vec<ExtractedEntity> {
893    static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
894    // Pattern breakdown:
895    //   [A-Z][A-Za-z]{2,15}   — capitalised model name (3-16 chars)
896    //   [\s\-]+               — separator: space(s) or hyphen(s)
897    //   \d+(?:\.\d+)?         — version number, optional decimal
898    //   (?:[a-z]|x\d+[A-Za-z]?)? — optional alphanumeric suffix: "o", "b", "x7B"
899    //   (?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))? — optional release tier
900    let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
901        Regex::new(
902            r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
903        )
904        .unwrap()
905    });
906
907    let mut existing_lc: std::collections::HashSet<String> =
908        entities.iter().map(|ent| ent.name.to_lowercase()).collect();
909    let mut result = entities;
910
911    for caps in model_re.captures_iter(body) {
912        let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
913        // Conservative cap: avoid harvesting multi-word noise like "section 12" inside
914        // long passages. A model name plus a one or two digit suffix fits in 24 chars.
915        if full_match.is_empty() || full_match.len() > 24 {
916            continue;
917        }
918        let normalized_lc = full_match.to_lowercase();
919        if existing_lc.contains(&normalized_lc) {
920            continue;
921        }
922        // Stop appending once the global entity cap is reached to keep parity with
923        // `merge_and_deduplicate` truncation semantics.
924        if result.len() >= MAX_ENTS {
925            break;
926        }
927        existing_lc.insert(normalized_lc);
928        result.push(ExtractedEntity {
929            name: full_match.to_string(),
930            entity_type: "concept".to_string(),
931        });
932    }
933
934    result
935}
936
937fn merge_and_deduplicate(
938    regex_ents: Vec<ExtractedEntity>,
939    ner_ents: Vec<ExtractedEntity>,
940) -> Vec<ExtractedEntity> {
941    // v1.0.25 P0-3: Collision detection uses substring containment (not starts_with)
942    // and is scoped per entity_type. This fixes two bugs from prior versions:
943    //
944    // 1. starts_with was not symmetric for non-prefix substrings. "sonne" does not
945    //    start_with "sonnet", so the pair could survive dedup depending on insertion
946    //    order. contains() catches both directions unconditionally.
947    //
948    // 2. The lookup key omitted entity_type, so "Apple/organization" and
949    //    "Apple/concept" collapsed into one. Key is now "type\0name_lc".
950    //
951    // Earlier invariants preserved:
952    // - NFKC normalization before lowercasing (v1.0.24).
953    // - Longest-wins: on collision keep the entity with the longer name.
954    // - Truncation warning at MAX_ENTS.
955    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
956    let mut result: Vec<ExtractedEntity> = Vec::new();
957    let mut truncated = false;
958
959    let total_input = regex_ents.len() + ner_ents.len();
960    for ent in regex_ents.into_iter().chain(ner_ents) {
961        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
962        // Composite key: entity_type + NUL + normalised lowercase name.
963        // Collision search is scoped to the same type so that e.g.
964        // "Apple/organization" and "Apple/concept" are kept separately.
965        let key = format!("{}\0{}", ent.entity_type, name_lc);
966
967        // Scan stored entries for substring containment within the same type.
968        // Two names collide when one is a case-insensitive substring of the other:
969        //   "sonne" ⊂ "sonnet"  → collision, keep "sonnet" (longest-wins)
970        //   "open"  ⊂ "openai"  → collision, keep "openai" (longest-wins)
971        let mut collision_idx: Option<usize> = None;
972        for (existing_key, idx) in &by_lc {
973            // Fast-path: check type prefix matches before scanning the name.
974            let type_prefix = format!("{}\0", ent.entity_type);
975            if !existing_key.starts_with(&type_prefix) {
976                continue;
977            }
978            let existing_name_lc = &existing_key[type_prefix.len()..];
979            if existing_name_lc == name_lc
980                || existing_name_lc.contains(name_lc.as_str())
981                || name_lc.contains(existing_name_lc)
982            {
983                collision_idx = Some(*idx);
984                break;
985            }
986        }
987        match collision_idx {
988            Some(idx) => {
989                // Replace stored entity only when the new candidate is strictly
990                // longer; otherwise drop the new one.
991                if ent.name.len() > result[idx].name.len() {
992                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
993                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
994                    by_lc.remove(&old_key);
995                    result[idx] = ent;
996                    by_lc.insert(key, idx);
997                }
998            }
999            None => {
1000                by_lc.insert(key, result.len());
1001                result.push(ent);
1002            }
1003        }
1004        if result.len() >= MAX_ENTS {
1005            truncated = true;
1006            break;
1007        }
1008    }
1009
1010    // v1.0.20: avisar quando truncamento silencioso descarta entidades acima do MAX_ENTS.
1011    if truncated {
1012        tracing::warn!(
1013            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
1014        );
1015    }
1016
1017    result
1018}
1019
1020fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1021    extracted
1022        .into_iter()
1023        .map(|e| NewEntity {
1024            name: e.name,
1025            entity_type: e.entity_type,
1026            description: None,
1027        })
1028        .collect()
1029}
1030
1031pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1032    let regex_entities = apply_regex_prefilter(body);
1033
1034    let mut bert_used = false;
1035    let ner_entities = match get_or_init_model(paths) {
1036        Some(model) => match run_ner_sliding_window(model, body, paths) {
1037            Ok(ents) => {
1038                bert_used = true;
1039                ents
1040            }
1041            Err(e) => {
1042                tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1043                Vec::new()
1044            }
1045        },
1046        None => Vec::new(),
1047    };
1048
1049    let merged = merge_and_deduplicate(regex_entities, ner_entities);
1050    // v1.0.22: estender entidades NER com sufixos numéricos do body (GPT-5, Claude 4, Python 3).
1051    let extended = extend_with_numeric_suffix(merged, body);
1052    // v1.0.23: capture versioned model names that BERT NER does not detect on its own
1053    // (e.g. "Claude 4", "Llama 3"). Hyphenated variants like "GPT-5" are already covered
1054    // by the NER+suffix pipeline above, but space-separated names need a dedicated pass.
1055    let with_models = augment_versioned_model_names(extended, body);
1056    // v1.0.25 P0-4: augment_versioned_model_names matches any capitalised word followed by a
1057    // digit, which inadvertently captures PT-BR section markers ("Etapa 3", "Fase 1"). Strip
1058    // them here as a final guard after the full augmentation pipeline.
1059    let with_models: Vec<ExtractedEntity> = with_models
1060        .into_iter()
1061        .filter(|e| !regex_section_marker().is_match(&e.name))
1062        .collect();
1063    let entities = to_new_entities(with_models);
1064    let (relationships, relationships_truncated) = build_relationships(&entities);
1065
1066    let extraction_method = if bert_used {
1067        "bert+regex-batch".to_string()
1068    } else {
1069        "regex-only".to_string()
1070    };
1071
1072    let urls = extract_urls(body);
1073
1074    Ok(ExtractionResult {
1075        entities,
1076        relationships,
1077        relationships_truncated,
1078        extraction_method,
1079        urls,
1080    })
1081}
1082
/// Zero-state extractor that runs only the regex prefilter — never loads or
/// queries the BERT NER model.
pub struct RegexExtractor;
1084
1085impl Extractor for RegexExtractor {
1086    fn extract(&self, body: &str) -> Result<ExtractionResult> {
1087        let regex_entities = apply_regex_prefilter(body);
1088        let entities = to_new_entities(regex_entities);
1089        let (relationships, relationships_truncated) = build_relationships(&entities);
1090        let urls = extract_urls(body);
1091        Ok(ExtractionResult {
1092            entities,
1093            relationships,
1094            relationships_truncated,
1095            extraction_method: "regex-only".to_string(),
1096            urls,
1097        })
1098    }
1099}
1100
1101#[cfg(test)]
1102mod tests {
1103    use super::*;
1104
1105    fn make_paths() -> AppPaths {
1106        use std::path::PathBuf;
1107        AppPaths {
1108            db: PathBuf::from("/tmp/test.sqlite"),
1109            models: PathBuf::from("/tmp/test_models"),
1110        }
1111    }
1112
1113    #[test]
1114    fn regex_email_captura_endereco() {
1115        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
1116        // v1.0.20: emails são classificados como "concept" (regex sozinho não distingue pessoa de role).
1117        assert!(ents
1118            .iter()
1119            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
1120    }
1121
1122    #[test]
1123    fn regex_all_caps_filtra_palavra_regra_pt() {
1124        // v1.0.20 fix P1: NUNCA, PROIBIDO, DEVE não devem virar "entidades".
1125        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
1126        assert!(
1127            !ents.iter().any(|e| e.name == "NUNCA"),
1128            "NUNCA deveria ser filtrado como stopword"
1129        );
1130        assert!(
1131            !ents.iter().any(|e| e.name == "PROIBIDO"),
1132            "PROIBIDO deveria ser filtrado"
1133        );
1134        assert!(
1135            !ents.iter().any(|e| e.name == "DEVE"),
1136            "DEVE deveria ser filtrado"
1137        );
1138    }
1139
1140    #[test]
1141    fn regex_all_caps_aceita_constante_com_underscore() {
1142        // Constantes técnicas tipo MAX_RETRY, TIMEOUT_MS sempre devem ser aceitas.
1143        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
1144        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1145        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1146    }
1147
1148    #[test]
1149    fn regex_all_caps_aceita_acronimo_dominio() {
1150        // Acrônimos legítimos (não-stopword) devem passar: OPENAI, NVIDIA, GOOGLE.
1151        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
1152        assert!(ents.iter().any(|e| e.name == "OPENAI"));
1153        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1154    }
1155
1156    #[test]
1157    fn regex_url_nao_aparece_em_apply_regex_prefilter() {
1158        // v1.0.24 P0-2: URLs foram removidas de apply_regex_prefilter e agora vão para extract_urls.
1159        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
1160        assert!(
1161            !ents.iter().any(|e| e.name.starts_with("https://")),
1162            "URLs não devem aparecer como entidades após split P0-2"
1163        );
1164    }
1165
1166    #[test]
1167    fn extract_urls_captura_https() {
1168        let urls = extract_urls("veja https://docs.rs/crate para detalhes");
1169        assert_eq!(urls.len(), 1);
1170        assert_eq!(urls[0].url, "https://docs.rs/crate");
1171        assert!(urls[0].offset > 0);
1172    }
1173
1174    #[test]
1175    fn extract_urls_trim_sufixo_pontuacao() {
1176        let urls = extract_urls("link: https://example.com/path. fim");
1177        assert!(!urls.is_empty());
1178        assert!(
1179            !urls[0].url.ends_with('.'),
1180            "sufixo ponto deve ser removido"
1181        );
1182    }
1183
1184    #[test]
1185    fn extract_urls_deduplica_repetidas() {
1186        let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1187        let urls = extract_urls(body);
1188        assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1189    }
1190
1191    #[test]
1192    fn regex_uuid_captura_identificador() {
1193        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1194        assert!(ents.iter().any(|e| e.entity_type == "concept"));
1195    }
1196
1197    #[test]
1198    fn regex_all_caps_captura_constante() {
1199        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1200        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1201        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1202    }
1203
1204    #[test]
1205    fn regex_all_caps_ignora_palavras_curtas() {
1206        let ents = apply_regex_prefilter("use AI em seu projeto");
1207        assert!(
1208            !ents.iter().any(|e| e.name == "AI"),
1209            "AI tem apenas 2 chars, deve ser ignorado"
1210        );
1211    }
1212
1213    #[test]
1214    fn iob_decodifica_per_para_person() {
1215        let tokens = vec![
1216            "John".to_string(),
1217            "Doe".to_string(),
1218            "trabalhou".to_string(),
1219        ];
1220        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
1221        let ents = iob_to_entities(&tokens, &labels);
1222        assert_eq!(ents.len(), 1);
1223        assert_eq!(ents[0].entity_type, "person");
1224        assert!(ents[0].name.contains("John"));
1225    }
1226
1227    #[test]
1228    fn iob_strip_subword_b_prefix() {
1229        // v1.0.21 P0: BERT às vezes emite ##AI com B-prefix (subword confuso).
1230        // Deve mergear na entidade ativa em vez de criar entidade fantasma "##AI".
1231        let tokens = vec!["Open".to_string(), "##AI".to_string()];
1232        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
1233        let ents = iob_to_entities(&tokens, &labels);
1234        assert!(
1235            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
1236            "deveria mergear ##AI ou descartar"
1237        );
1238    }
1239
1240    #[test]
1241    fn iob_subword_orphan_descarta() {
1242        // v1.0.21 P0: subword órfão sem entidade ativa não deve virar entidade.
1243        let tokens = vec!["##AI".to_string()];
1244        let labels = vec!["B-ORG".to_string()];
1245        let ents = iob_to_entities(&tokens, &labels);
1246        assert!(
1247            ents.is_empty(),
1248            "subword órfão sem entidade ativa deve ser descartado"
1249        );
1250    }
1251
1252    #[test]
1253    fn iob_mapeia_date_para_date_v1025() {
1254        // v1.0.25 V008: DATE is now emitted instead of discarded.
1255        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
1256        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
1257        let ents = iob_to_entities(&tokens, &labels);
1258        assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
1259        assert_eq!(ents[0].entity_type, "date");
1260    }
1261
1262    #[test]
1263    fn iob_mapeia_org_para_organization_v1025() {
1264        // v1.0.25 V008: B-ORG without tool keywords maps to "organization" not "project".
1265        let tokens = vec!["Empresa".to_string()];
1266        let labels = vec!["B-ORG".to_string()];
1267        let ents = iob_to_entities(&tokens, &labels);
1268        assert_eq!(ents[0].entity_type, "organization");
1269    }
1270
1271    #[test]
1272    fn iob_mapeia_org_sdk_para_tool() {
1273        let tokens = vec!["tokio-sdk".to_string()];
1274        let labels = vec!["B-ORG".to_string()];
1275        let ents = iob_to_entities(&tokens, &labels);
1276        assert_eq!(ents[0].entity_type, "tool");
1277    }
1278
1279    #[test]
1280    fn iob_mapeia_loc_para_location_v1025() {
1281        // v1.0.25 V008: B-LOC maps to "location" not "concept".
1282        let tokens = vec!["Brasil".to_string()];
1283        let labels = vec!["B-LOC".to_string()];
1284        let ents = iob_to_entities(&tokens, &labels);
1285        assert_eq!(ents[0].entity_type, "location");
1286    }
1287
1288    #[test]
1289    fn build_relationships_respeitam_max_rels() {
1290        let entities: Vec<NewEntity> = (0..20)
1291            .map(|i| NewEntity {
1292                name: format!("entidade_{i}"),
1293                entity_type: "concept".to_string(),
1294                description: None,
1295            })
1296            .collect();
1297        let (rels, truncated) = build_relationships(&entities);
1298        let max_rels = crate::constants::max_relationships_per_memory();
1299        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1300        if rels.len() == max_rels {
1301            assert!(truncated, "truncated deve ser true quando atingiu o cap");
1302        }
1303    }
1304
1305    #[test]
1306    fn build_relationships_sem_duplicatas() {
1307        let entities: Vec<NewEntity> = (0..5)
1308            .map(|i| NewEntity {
1309                name: format!("ent_{i}"),
1310                entity_type: "concept".to_string(),
1311                description: None,
1312            })
1313            .collect();
1314        let (rels, _truncated) = build_relationships(&entities);
1315        let mut pares: std::collections::HashSet<(String, String)> =
1316            std::collections::HashSet::new();
1317        for r in &rels {
1318            let par = (r.source.clone(), r.target.clone());
1319            assert!(pares.insert(par), "par duplicado encontrado");
1320        }
1321    }
1322
1323    #[test]
1324    fn merge_deduplica_por_nome_lowercase() {
1325        // v1.0.25: collision detection is scoped per entity_type; same name + same type
1326        // must deduplicate to one entry. Different types are kept separately.
1327        let a = vec![ExtractedEntity {
1328            name: "Rust".to_string(),
1329            entity_type: "concept".to_string(),
1330        }];
1331        let b = vec![ExtractedEntity {
1332            name: "rust".to_string(),
1333            entity_type: "concept".to_string(),
1334        }];
1335        let merged = merge_and_deduplicate(a, b);
1336        assert_eq!(
1337            merged.len(),
1338            1,
1339            "rust e Rust com mesmo tipo são a mesma entidade"
1340        );
1341    }
1342
1343    #[test]
1344    fn regex_extractor_implementa_trait() {
1345        let extractor = RegexExtractor;
1346        let result = extractor
1347            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1348            .unwrap();
1349        assert!(!result.entities.is_empty());
1350    }
1351
1352    #[test]
1353    fn extract_retorna_ok_sem_modelo() {
1354        // Sem modelo baixado, deve retornar Ok com apenas entidades regex
1355        let paths = make_paths();
1356        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1357        let result = extract_graph_auto(body, &paths).unwrap();
1358        assert!(result
1359            .entities
1360            .iter()
1361            .any(|e| e.name.contains("teste@exemplo.com")));
1362    }
1363
1364    #[test]
1365    fn stopwords_filter_v1024_terms() {
1366        // v1.0.24: verify that all 17 new stopwords added in P0-3 are filtered
1367        // by apply_regex_prefilter so they do not appear as entities.
1368        let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1369                    DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1370        let ents = apply_regex_prefilter(body);
1371        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1372        for word in &[
1373            "ACEITE",
1374            "ACK",
1375            "ACL",
1376            "BORDA",
1377            "CHECKLIST",
1378            "COMPLETED",
1379            "CONFIRME",
1380            "DEVEMOS",
1381            "DONE",
1382            "FIXED",
1383            "NEGUE",
1384            "PENDING",
1385            "PLAN",
1386            "PODEMOS",
1387            "RECUSE",
1388            "TOKEN",
1389            "VAMOS",
1390        ] {
1391            assert!(
1392                !names.contains(word),
1393                "v1.0.24 stopword {word} should be filtered but was found in entities"
1394            );
1395        }
1396    }
1397
1398    #[test]
1399    fn dedup_normalizes_unicode_combining_marks() {
1400        // v1.0.24 P1-E: "Café" (NFC precomposed) and "Cafe\u{301}" (NFD with
1401        // combining acute accent) must deduplicate to a single entity after NFKC
1402        // normalization.
1403        let nfc = vec![ExtractedEntity {
1404            name: "Café".to_string(),
1405            entity_type: "concept".to_string(),
1406        }];
1407        // Build the NFD form: 'e' followed by combining acute accent U+0301
1408        let nfd_name = "Cafe\u{301}".to_string();
1409        let nfd = vec![ExtractedEntity {
1410            name: nfd_name,
1411            entity_type: "concept".to_string(),
1412        }];
1413        let merged = merge_and_deduplicate(nfc, nfd);
1414        assert_eq!(
1415            merged.len(),
1416            1,
1417            "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1418        );
1419    }
1420
1421    // ── predict_batch regression tests ──────────────────────────────────────
1422
    #[test]
    fn predict_batch_output_count_matches_input() {
        // Verify that predict_batch returns exactly one Vec<String> per window
        // without requiring a real model.  We test the shape contract by
        // constructing the padding logic manually and asserting counts.
        //
        // Two windows of different lengths: 3 tokens and 5 tokens.
        let w1_ids: Vec<u32> = vec![101, 100, 102];
        let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
        let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
        let w2_tok: Vec<String> = vec![
            "[CLS]".into(),
            "world".into(),
            "foo".into(),
            "bar".into(),
            "[SEP]".into(),
        ];
        let windows: Vec<(Vec<u32>, Vec<String>)> =
            vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

        // Verify padding logic and output length contracts using tensor operations
        // that do NOT require BertModel::forward.
        let device = Device::Cpu;
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
        assert_eq!(max_len, 5, "max_len deve ser 5");

        // Right-pad every window with zeros up to max_len, mirroring predict_batch.
        let mut padded_ids: Vec<Tensor> = Vec::new();
        for (ids, _) in &windows {
            let len = ids.len();
            let pad_right = max_len - len;
            // Widen u32 → i64 before building the tensor (candle id dtype).
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
            let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
            assert_eq!(
                t.dims(),
                &[max_len],
                "cada janela deve ter shape (max_len,) após padding"
            );
            padded_ids.push(t);
        }

        let stacked = Tensor::stack(&padded_ids, 0).unwrap();
        assert_eq!(
            stacked.dims(),
            &[2, max_len],
            "stack deve produzir (batch_size=2, max_len=5)"
        );

        // Verify narrow preserves only real tokens for each window
        // (simulates what predict_batch does after classifier.forward)
        let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9]; // batch×seq×num_labels=9
        let fake_logits =
            Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
        for (i, (ids, _)) in windows.iter().enumerate() {
            let real_len = ids.len();
            let example = fake_logits.get(i).unwrap();
            let sliced = example.narrow(0, 0, real_len).unwrap();
            assert_eq!(
                sliced.dims(),
                &[real_len, 9],
                "narrow deve preservar apenas {real_len} tokens reais"
            );
        }
    }
1487
1488    #[test]
1489    fn predict_batch_empty_windows_returns_empty() {
1490        // predict_batch with no windows must return an empty Vec, not panic.
1491        // We test the guard logic directly on the batch size/max_len path.
1492        let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
1493        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
1494        assert_eq!(max_len, 0, "zero windows → max_len 0");
1495        // The real predict_batch returns Ok(vec![]) when max_len == 0.
1496        // We assert the expected output shape by reproducing the guard here.
1497        let result: Vec<Vec<String>> = if max_len == 0 {
1498            Vec::new()
1499        } else {
1500            unreachable!()
1501        };
1502        assert!(result.is_empty());
1503    }
1504
    #[test]
    fn ner_batch_size_default_is_8() {
        // Verify that ner_batch_size() returns the documented default when the
        // env var is absent.  We clear the var to avoid cross-test contamination.
        // NOTE(review): Rust tests run in parallel by default, so another test
        // mutating GRAPHRAG_NER_BATCH_SIZE (e.g. the clamping test below) can
        // still race with this one — consider serializing the env-var tests.
        // Also, `remove_var` is `unsafe` as of the 2024 edition; revisit when
        // bumping the edition.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }
1512
    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Override via env var; values outside [1, 32] must be clamped.
        // NOTE(review): mutating process-wide env vars races with other tests
        // reading GRAPHRAG_NER_BATCH_SIZE when tests run in parallel; consider a
        // shared lock or running these with --test-threads=1. `set_var` and
        // `remove_var` are `unsafe` as of the 2024 edition.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        // Leave the environment clean for other tests.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }
1531
1532    #[test]
1533    fn extraction_method_regex_only_unchanged() {
1534        // RegexExtractor always returns "regex-only" regardless of NER_MODEL OnceLock state.
1535        // This guards against accidentally changing the regex-only fallback string.
1536        let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
1537        assert_eq!(
1538            result.extraction_method, "regex-only",
1539            "RegexExtractor deve retornar regex-only"
1540        );
1541    }
1542
1543    // --- P2-E: extend_with_numeric_suffix alphanumeric suffix ---
1544
1545    #[test]
1546    fn extend_suffix_pure_numeric_unchanged() {
1547        // Existing behaviour: pure-numeric suffix must still work after P2-E.
1548        let ents = vec![ExtractedEntity {
1549            name: "GPT".to_string(),
1550            entity_type: "concept".to_string(),
1551        }];
1552        let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
1553        assert_eq!(
1554            result[0].name, "GPT-5",
1555            "sufixo puramente numérico deve ser estendido"
1556        );
1557    }
1558
1559    #[test]
1560    fn extend_suffix_alphanumeric_letter_after_digit() {
1561        // P2-E: "4o" suffix (digit + lowercase letter) must be captured.
1562        let ents = vec![ExtractedEntity {
1563            name: "GPT".to_string(),
1564            entity_type: "concept".to_string(),
1565        }];
1566        let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
1567        assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
1568    }
1569
1570    #[test]
1571    fn extend_suffix_alphanumeric_b_suffix() {
1572        // P2-E: "5b" suffix (digit + 'b') must be captured.
1573        let ents = vec![ExtractedEntity {
1574            name: "Llama".to_string(),
1575            entity_type: "concept".to_string(),
1576        }];
1577        let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
1578        assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
1579    }
1580
1581    #[test]
1582    fn extend_suffix_alphanumeric_x_suffix() {
1583        // P2-E: "8x" suffix (digit + 'x') must be captured.
1584        let ents = vec![ExtractedEntity {
1585            name: "Mistral".to_string(),
1586            entity_type: "concept".to_string(),
1587        }];
1588        let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
1589        assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
1590    }
1591
1592    // --- P2-D: augment_versioned_model_names extended regex ---
1593
1594    #[test]
1595    fn augment_versioned_gpt4o() {
1596        // P2-D: "GPT-4o" must be captured with alphanumeric suffix.
1597        let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
1598        assert!(
1599            result.iter().any(|e| e.name == "GPT-4o"),
1600            "GPT-4o deve ser capturado pelo augment, achados: {:?}",
1601            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1602        );
1603    }
1604
1605    #[test]
1606    fn augment_versioned_claude_4_sonnet() {
1607        // P2-D: "Claude 4 Sonnet" must be captured with release tier.
1608        let result =
1609            augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
1610        assert!(
1611            result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1612            "Claude 4 Sonnet deve ser capturado, achados: {:?}",
1613            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1614        );
1615    }
1616
1617    #[test]
1618    fn augment_versioned_llama_3_pro() {
1619        // P2-D: "Llama 3 Pro" must be captured with release tier.
1620        let result =
1621            augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1622        assert!(
1623            result.iter().any(|e| e.name == "Llama 3 Pro"),
1624            "Llama 3 Pro deve ser capturado, achados: {:?}",
1625            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1626        );
1627    }
1628
1629    #[test]
1630    fn augment_versioned_mixtral_8x7b() {
1631        // P2-D: "Mixtral 8x7B" composite version must be captured.
1632        let result =
1633            augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1634        assert!(
1635            result.iter().any(|e| e.name == "Mixtral 8x7B"),
1636            "Mixtral 8x7B deve ser capturado, achados: {:?}",
1637            result.iter().map(|e| &e.name).collect::<Vec<_>>()
1638        );
1639    }
1640
1641    #[test]
1642    fn augment_versioned_does_not_duplicate_existing() {
1643        // P2-D back-compat: entities already present must not be duplicated.
1644        let existing = vec![ExtractedEntity {
1645            name: "Claude 4".to_string(),
1646            entity_type: "concept".to_string(),
1647        }];
1648        let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
1649        let count = result.iter().filter(|e| e.name == "Claude 4").count();
1650        assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
1651    }
1652
1653    // ── v1.0.25 P0-4: new stopwords (API, CLI, HTTP, HTTPS, JWT, LLM, REST, UI, URL) ──
1654
1655    #[test]
1656    fn stopwords_filter_url_jwt_api_v1025() {
1657        // Verify that v1.0.25 tech-acronym stopwords do not leak as entities.
1658        let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1659        let ents = apply_regex_prefilter(body);
1660        let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1661        for blocked in &[
1662            "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1663        ] {
1664            assert!(
1665                !names.contains(blocked),
1666                "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1667            );
1668        }
1669    }
1670
1671    // ── v1.0.25 P0-4: section-marker regex strips "Etapa N", "Fase N", etc. ──
1672
1673    #[test]
1674    fn section_markers_etapa_fase_filtered_v1025() {
1675        // "Etapa 3" and "Fase 1" are document-structure labels, not entities.
1676        let body = "Etapa 3 do plano: implementar Fase 1 da Migração.";
1677        let ents = apply_regex_prefilter(body);
1678        assert!(
1679            !ents
1680                .iter()
1681                .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1682            "section markers must be stripped; entities: {:?}",
1683            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1684        );
1685    }
1686
1687    #[test]
1688    fn section_markers_passo_secao_filtered_v1025() {
1689        let body = "Siga Passo 2 conforme Seção 3 do manual.";
1690        let ents = apply_regex_prefilter(body);
1691        assert!(
1692            !ents
1693                .iter()
1694                .any(|e| e.name.contains("Passo") || e.name.contains("Seção")),
1695            "Passo/Seção section markers must be stripped; entities: {:?}",
1696            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1697        );
1698    }
1699
1700    // ── v1.0.25 P0-2: CamelCase brand names extracted as organization ──
1701
1702    #[test]
1703    fn brand_camelcase_extracted_as_organization_v1025() {
1704        // "OpenAI" is a CamelCase brand that BERT NER often misses.
1705        let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1706        let ents = apply_regex_prefilter(body);
1707        let openai = ents.iter().find(|e| e.name == "OpenAI");
1708        assert!(
1709            openai.is_some(),
1710            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1711            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1712        );
1713        assert_eq!(
1714            openai.unwrap().entity_type,
1715            "organization",
1716            "brand CamelCase must map to organization (V008)"
1717        );
1718    }
1719
1720    #[test]
1721    fn brand_postgresql_extracted_as_organization_v1025() {
1722        let body = "migrating from MySQL to PostgreSQL for better performance.";
1723        let ents = apply_regex_prefilter(body);
1724        assert!(
1725            ents.iter()
1726                .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization"),
1727            "PostgreSQL must be extracted as organization; entities: {:?}",
1728            ents.iter()
1729                .map(|e| (&e.name, &e.entity_type))
1730                .collect::<Vec<_>>()
1731        );
1732    }
1733
1734    // ── v1.0.25 V008 alignment ──
1735
1736    #[test]
1737    fn iob_org_maps_to_organization_not_project_v1025() {
1738        // B-ORG without tool keywords must emit "organization" (V008), not "project".
1739        let tokens = vec!["Microsoft".to_string()];
1740        let labels = vec!["B-ORG".to_string()];
1741        let ents = iob_to_entities(&tokens, &labels);
1742        assert_eq!(
1743            ents[0].entity_type, "organization",
1744            "B-ORG must map to organization (V008); got {}",
1745            ents[0].entity_type
1746        );
1747    }
1748
1749    #[test]
1750    fn iob_loc_maps_to_location_not_concept_v1025() {
1751        // B-LOC must emit "location" (V008), not "concept".
1752        let tokens = vec!["São".to_string(), "Paulo".to_string()];
1753        let labels = vec!["B-LOC".to_string(), "I-LOC".to_string()];
1754        let ents = iob_to_entities(&tokens, &labels);
1755        assert_eq!(
1756            ents[0].entity_type, "location",
1757            "B-LOC must map to location (V008); got {}",
1758            ents[0].entity_type
1759        );
1760    }
1761
1762    #[test]
1763    fn iob_date_maps_to_date_not_discarded_v1025() {
1764        // B-DATE must emit "date" (V008) instead of being discarded.
1765        let tokens = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
1766        let labels = vec![
1767            "B-DATE".to_string(),
1768            "I-DATE".to_string(),
1769            "I-DATE".to_string(),
1770        ];
1771        let ents = iob_to_entities(&tokens, &labels);
1772        assert_eq!(
1773            ents.len(),
1774            1,
1775            "DATE entity must be emitted (V008); entities: {ents:?}"
1776        );
1777        assert_eq!(ents[0].entity_type, "date");
1778    }
1779
1780    // ── v1.0.25 P0-2: PT verb false-positive filter ──
1781
1782    #[test]
1783    fn pt_verb_le_filtered_as_per_v1025() {
1784        // "Lê" is a PT monosyllabic verb; when tagged B-PER it must be dropped.
1785        let tokens = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
1786        let labels = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
1787        let ents = iob_to_entities(&tokens, &labels);
1788        assert!(
1789            !ents
1790                .iter()
1791                .any(|e| e.name == "Lê" && e.entity_type == "person"),
1792            "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
1793        );
1794    }
1795
1796    #[test]
1797    fn pt_verb_ver_filtered_as_per_v1025() {
1798        // "Ver" is a PT verb that BERT sometimes tags B-PER; must be filtered.
1799        let tokens = vec!["Ver".to_string()];
1800        let labels = vec!["B-PER".to_string()];
1801        let ents = iob_to_entities(&tokens, &labels);
1802        assert!(
1803            ents.is_empty(),
1804            "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
1805        );
1806    }
1807
1808    // --- P0-3 longest-wins v1.0.25 ---
1809
1810    fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1811        ExtractedEntity {
1812            name: name.to_string(),
1813            entity_type: entity_type.to_string(),
1814        }
1815    }
1816
1817    #[test]
1818    fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1819        // "Sonne" is a substring of "Sonnet" — longest-wins must keep "Sonnet".
1820        let regex = vec![entity("Sonne", "concept")];
1821        let ner = vec![entity("Sonnet", "concept")];
1822        let result = merge_and_deduplicate(regex, ner);
1823        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1824        assert_eq!(result[0].name, "Sonnet");
1825    }
1826
1827    #[test]
1828    fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1829        // "Open" is a substring of "OpenAI" — longest-wins must keep "OpenAI".
1830        let regex = vec![
1831            entity("Open", "organization"),
1832            entity("OpenAI", "organization"),
1833        ];
1834        let result = merge_and_deduplicate(regex, vec![]);
1835        assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1836        assert_eq!(result[0].name, "OpenAI");
1837    }
1838
1839    #[test]
1840    fn merge_keeps_both_when_no_containment_v1025() {
1841        // "Alice" and "Bob" share no containment — both must be preserved.
1842        let regex = vec![entity("Alice", "person"), entity("Bob", "person")];
1843        let result = merge_and_deduplicate(regex, vec![]);
1844        assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1845    }
1846
1847    #[test]
1848    fn merge_respects_entity_type_boundary_v1025() {
1849        // Same name "Apple" but different types: both must survive independently.
1850        let regex = vec![entity("Apple", "organization"), entity("Apple", "concept")];
1851        let result = merge_and_deduplicate(regex, vec![]);
1852        assert_eq!(
1853            result.len(),
1854            2,
1855            "expected 2 entities (different types), got: {result:?}"
1856        );
1857    }
1858
1859    #[test]
1860    fn merge_case_insensitive_dedup_v1025() {
1861        // "OpenAI" and "openai" are the same entity — deduplicate to exactly one.
1862        let regex = vec![
1863            entity("OpenAI", "organization"),
1864            entity("openai", "organization"),
1865        ];
1866        let result = merge_and_deduplicate(regex, vec![]);
1867        assert_eq!(
1868            result.len(),
1869            1,
1870            "expected 1 entity after case-insensitive dedup, got: {result:?}"
1871        );
1872    }
1873
1874    // ── v1.0.25 P0-4: section markers must be filtered in iob_to_entities too ──
1875
1876    #[test]
1877    fn iob_section_marker_etapa_filtered_v1025() {
1878        // BERT may tag "Etapa" (B-MISC) + "3" (I-MISC) as a span; flush must drop it.
1879        let tokens = vec!["Etapa".to_string(), "3".to_string()];
1880        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1881        let ents = iob_to_entities(&tokens, &labels);
1882        assert!(
1883            !ents.iter().any(|e| e.name.contains("Etapa")),
1884            "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
1885        );
1886    }
1887
1888    #[test]
1889    fn iob_section_marker_fase_filtered_v1025() {
1890        // BERT may tag "Fase" (B-MISC) + "1" (I-MISC) as a span; flush must drop it.
1891        let tokens = vec!["Fase".to_string(), "1".to_string()];
1892        let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1893        let ents = iob_to_entities(&tokens, &labels);
1894        assert!(
1895            !ents.iter().any(|e| e.name.contains("Fase")),
1896            "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
1897        );
1898    }
1899}