sqlite_graphrag/
extraction.rs

1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11
12use crate::paths::AppPaths;
13use crate::storage::entities::{NewEntity, NewRelationship};
14
/// Hugging Face Hub repository the NER checkpoint is downloaded from.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Largest number of tokens handed to the model in one sliding window.
const MAX_SEQ_LEN: usize = 512;
/// Number of tokens the sliding window advances between consecutive windows.
const STRIDE: usize = 256;
/// Cap on entities kept after merging; also bounds how many entities take part in relationship building.
const MAX_ENTS: usize = 30;
/// Upper bound on relationships emitted with the same source entity.
const TOP_K_RELATIONS: usize = 5;
/// Relation label given to every generated relationship.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum accepted entity-name length; callers compare it against `str::len`, i.e. a byte count.
const MIN_ENTITY_CHARS: usize = 2;
22
// Lazily compiled patterns. Each cell is filled on first use by the accessor
// function of the matching name (`regex_email`, `regex_url`, ...).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
27
// v1.0.20: stopwords used to drop common PT-BR/EN "rule words" that the ALL_CAPS pattern captures.
// Without this filter, technical PT-BR corpora with rules written in capitals (NUNCA, PROIBIDO, DEVE)
// produced roughly 70% junk "entities". Identifiers such as MAX_RETRY (with an underscore) are kept.
// v1.0.22: list extended with terms observed in a 495-file stress test of the flowaiper corpus.
// It covers verbs (ADICIONAR, VALIDAR), adjectives (ALTA, BAIXA), common nouns (BANCO, CASO)
// and generic data formats (JSON, XML). HTTP methods are listed separately in HTTP_METHODS.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BUG",
    "CASO",
    "CONFIRMADO",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DISCO",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "MUST",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PROIBIDO",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOOLS",
    "TSV",
    "USAR",
    "VALIDAR",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
100
// v1.0.22: HTTP methods are protocol verbs, not semantically useful entities.
// They are filtered in apply_regex_prefilter (ALL_CAPS matches) and in
// iob_to_entities (single-token names), both through is_filtered_all_caps.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
106
107fn is_filtered_all_caps(token: &str) -> bool {
108    // Identificadores com underscore são preservados (ex: MAX_RETRY, FLOWAIPER_API_KEY)
109    let is_identifier = token.contains('_');
110    if is_identifier {
111        return false;
112    }
113    ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
114}
115
116fn regex_email() -> &'static Regex {
117    REGEX_EMAIL
118        .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
119}
120
121fn regex_url() -> &'static Regex {
122    REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
123}
124
125fn regex_uuid() -> &'static Regex {
126    REGEX_UUID.get_or_init(|| {
127        Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
128            .unwrap()
129    })
130}
131
132fn regex_all_caps() -> &'static Regex {
133    REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
134}
135
/// An entity candidate produced by the regex pass or by the NER model.
///
/// `Eq` and `Hash` are derived (both fields are `String`) so values can be
/// stored in hash sets and used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ExtractedEntity {
    /// Surface text of the entity.
    pub name: String,
    /// Category label, e.g. "concept", "person", "project" or "tool".
    pub entity_type: String,
}
141
/// Output of one extraction run: the entities found plus the relationships derived from them.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities ready to be stored.
    pub entities: Vec<NewEntity>,
    /// Relationships generated between the entities above.
    pub relationships: Vec<NewRelationship>,
    /// Method used for the extraction: "bert+regex" or "regex-only".
    /// Useful for auditing, metrics and reporting to the user.
    pub extraction_method: String,
}
150
/// Common interface for entity/relationship extractors.
///
/// Implementors must be `Send + Sync`.
pub trait Extractor: Send + Sync {
    /// Extracts entities and relationships from `body`.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
154
/// The fields of the checkpoint's `config.json` that this module reads directly.
#[derive(Deserialize)]
struct ModelConfig {
    /// Class index (as a string key) to label name; an empty map when the key is absent.
    #[serde(default)]
    id2label: HashMap<String, String>,
    /// Used as the second dimension of the classifier weight when it is loaded.
    hidden_size: usize,
}
161
/// BERT encoder plus a linear token-classification head, with the label table needed to decode predictions.
struct BertNerModel {
    /// Encoder loaded from the "bert" prefix of the weights file.
    bert: BertModel,
    /// Linear head loaded from the "classifier" prefix of the weights file.
    classifier: Linear,
    /// Device the input tensors are created on (set to the CPU by `load`).
    device: Device,
    /// Class index to label name, e.g. "B-PER"; indices missing here decode to "O".
    id2label: HashMap<usize, String>,
}
168
169impl BertNerModel {
170    fn load(model_dir: &Path) -> Result<Self> {
171        let config_path = model_dir.join("config.json");
172        let weights_path = model_dir.join("model.safetensors");
173
174        let config_str = std::fs::read_to_string(&config_path)
175            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
176        let model_cfg: ModelConfig =
177            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
178
179        let id2label: HashMap<usize, String> = model_cfg
180            .id2label
181            .into_iter()
182            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
183            .collect();
184
185        let num_labels = id2label.len().max(9);
186        let hidden_size = model_cfg.hidden_size;
187
188        let bert_config_str = std::fs::read_to_string(&config_path)
189            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
190        let bert_cfg: BertConfig =
191            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
192
193        let device = Device::Cpu;
194
195        let vb = unsafe {
196            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
197                .with_context(|| format!("mapeando {weights_path:?}"))?
198        };
199        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
200
201        // v1.0.20 fix P0 secundário: carregar classifier head do safetensors em vez de zeros.
202        // Em v1.0.19 usávamos Tensor::zeros, o que produzia argmax constante e inferência degenerada.
203        let cls_vb = vb.pp("classifier");
204        let weight = cls_vb
205            .get((num_labels, hidden_size), "weight")
206            .context("carregando classifier.weight do safetensors")?;
207        let bias = cls_vb
208            .get(num_labels, "bias")
209            .context("carregando classifier.bias do safetensors")?;
210        let classifier = Linear::new(weight, Some(bias));
211
212        Ok(Self {
213            bert,
214            classifier,
215            device,
216            id2label,
217        })
218    }
219
220    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
221        let len = token_ids.len();
222        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
223        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
224
225        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
226            .context("criando tensor input_ids")?;
227        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
228            .context("criando tensor token_type_ids")?;
229        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
230            .context("criando tensor attention_mask")?;
231
232        let sequence_output = self
233            .bert
234            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
235            .context("forward pass do BertModel")?;
236
237        let logits = self
238            .classifier
239            .forward(&sequence_output)
240            .context("forward pass do classificador")?;
241
242        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
243
244        let num_tokens = logits_2d.dim(0).context("dim(0)")?;
245
246        let mut labels = Vec::with_capacity(num_tokens);
247        for i in 0..num_tokens {
248            let token_logits = logits_2d.get(i).context("get token logits")?;
249            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
250            let argmax = vec
251                .iter()
252                .enumerate()
253                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
254                .map(|(idx, _)| idx)
255                .unwrap_or(0);
256            let label = self
257                .id2label
258                .get(&argmax)
259                .cloned()
260                .unwrap_or_else(|| "O".to_string());
261            labels.push(label);
262        }
263
264        Ok(labels)
265    }
266}
267
268static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
269
270fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
271    NER_MODEL
272        .get_or_init(|| match load_model(paths) {
273            Ok(m) => Some(m),
274            Err(e) => {
275                tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
276                None
277            }
278        })
279        .as_ref()
280}
281
282fn model_dir(paths: &AppPaths) -> PathBuf {
283    paths.models.join("bert-multilingual-ner")
284}
285
286fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
287    let dir = model_dir(paths);
288    std::fs::create_dir_all(&dir)
289        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
290
291    let weights = dir.join("model.safetensors");
292    let config = dir.join("config.json");
293    let tokenizer = dir.join("tokenizer.json");
294
295    if weights.exists() && config.exists() && tokenizer.exists() {
296        return Ok(dir);
297    }
298
299    tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
300    crate::output::emit_progress_i18n(
301        "Downloading NER model (first run, ~676 MB)...",
302        "Baixando modelo NER (primeira execução, ~676 MB)...",
303    );
304
305    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
306    let repo = api.model(MODEL_ID.to_string());
307
308    // v1.0.20 fix P0 primário: tokenizer.json no repo Davlan está apenas em onnx/tokenizer.json.
309    // Em v1.0.19 buscávamos da raiz e recebíamos 404, caindo em graceful degradation 100% das vezes.
310    // Mapeamos (remote_path, local_filename) para baixar do subfolder mantendo nome plano local.
311    for (remote, local) in &[
312        ("model.safetensors", "model.safetensors"),
313        ("config.json", "config.json"),
314        ("onnx/tokenizer.json", "tokenizer.json"),
315        ("tokenizer_config.json", "tokenizer_config.json"),
316    ] {
317        let dest = dir.join(local);
318        if !dest.exists() {
319            let src = repo
320                .get(remote)
321                .with_context(|| format!("baixando {remote} do HF Hub"))?;
322            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
323        }
324    }
325
326    Ok(dir)
327}
328
329fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
330    let dir = ensure_model_files(paths)?;
331    BertNerModel::load(&dir)
332}
333
334fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
335    let mut entities = Vec::new();
336    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
337
338    let add = |entities: &mut Vec<ExtractedEntity>,
339               seen: &mut std::collections::HashSet<String>,
340               name: &str,
341               entity_type: &str| {
342        let name = name.trim().to_string();
343        if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
344            entities.push(ExtractedEntity {
345                name,
346                entity_type: entity_type.to_string(),
347            });
348        }
349    };
350
351    for m in regex_email().find_iter(body) {
352        // v1.0.20: email é "concept" (regex sozinho não distingue pessoa de mailing list/role).
353        add(&mut entities, &mut seen, m.as_str(), "concept");
354    }
355    for m in regex_url().find_iter(body) {
356        // v1.0.22: URLs strip de sufixo de markdown (backtick fechando, parens, brackets).
357        // Mantidas como entity_type "concept" para preservar rastreabilidade de citações.
358        let raw = m.as_str();
359        let cleaned = raw
360            .trim_end_matches('`')
361            .trim_end_matches(',')
362            .trim_end_matches('.')
363            .trim_end_matches(';')
364            .trim_end_matches(')')
365            .trim_end_matches(']')
366            .trim_end_matches('}');
367        add(&mut entities, &mut seen, cleaned, "concept");
368    }
369    for m in regex_uuid().find_iter(body) {
370        add(&mut entities, &mut seen, m.as_str(), "concept");
371    }
372    for m in regex_all_caps().find_iter(body) {
373        let candidate = m.as_str();
374        // v1.0.22: filtro consolidado (stopwords + HTTP methods); preserva identificadores com underscore.
375        if !is_filtered_all_caps(candidate) {
376            add(&mut entities, &mut seen, candidate, "concept");
377        }
378    }
379
380    entities
381}
382
383fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
384    let mut entities: Vec<ExtractedEntity> = Vec::new();
385    let mut current_parts: Vec<String> = Vec::new();
386    let mut current_type: Option<String> = None;
387
388    let flush =
389        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
390            if let Some(t) = typ.take() {
391                let name = parts.join(" ").trim().to_string();
392                // v1.0.22: filtra single-token entities que sejam stopwords ALL CAPS ou HTTP methods.
393                // BERT NER classifica algumas dessas como B-MISC/B-ORG; pós-filtro aqui evita
394                // poluir o grafo com verbos/protocolos genéricos.
395                let is_single_caps = !name.contains(' ')
396                    && name == name.to_uppercase()
397                    && name.len() >= MIN_ENTITY_CHARS;
398                let should_skip = is_single_caps && is_filtered_all_caps(&name);
399                if name.len() >= MIN_ENTITY_CHARS && !should_skip {
400                    entities.push(ExtractedEntity {
401                        name,
402                        entity_type: t,
403                    });
404                }
405                parts.clear();
406            }
407        };
408
409    for (token, label) in tokens.iter().zip(labels.iter()) {
410        if label == "O" {
411            flush(&mut current_parts, &mut current_type, &mut entities);
412            continue;
413        }
414
415        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
416            ("B", rest)
417        } else if let Some(rest) = label.strip_prefix("I-") {
418            ("I", rest)
419        } else {
420            flush(&mut current_parts, &mut current_type, &mut entities);
421            continue;
422        };
423
424        let entity_type = match bio_type {
425            "DATE" => {
426                flush(&mut current_parts, &mut current_type, &mut entities);
427                continue;
428            }
429            "PER" => "person",
430            "ORG" => {
431                let t = token.to_lowercase();
432                if t.contains("lib")
433                    || t.contains("sdk")
434                    || t.contains("cli")
435                    || t.contains("crate")
436                    || t.contains("npm")
437                {
438                    "tool"
439                } else {
440                    "project"
441                }
442            }
443            "LOC" => "concept",
444            other => other,
445        };
446
447        if prefix == "B" {
448            if token.starts_with("##") {
449                // BERT confuso: subword com B-prefix indica continuação de entidade anterior.
450                // Anexar à última parte da entidade atual; senão descartar.
451                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
452                if let Some(last) = current_parts.last_mut() {
453                    last.push_str(clean);
454                }
455                continue;
456            }
457            flush(&mut current_parts, &mut current_type, &mut entities);
458            current_parts.push(token.clone());
459            current_type = Some(entity_type.to_string());
460        } else if prefix == "I" && current_type.is_some() {
461            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
462            if token.starts_with("##") {
463                if let Some(last) = current_parts.last_mut() {
464                    last.push_str(clean);
465                }
466            } else {
467                current_parts.push(clean.to_string());
468            }
469        }
470    }
471
472    flush(&mut current_parts, &mut current_type, &mut entities);
473    entities
474}
475
476fn build_relationships(entities: &[NewEntity]) -> Vec<NewRelationship> {
477    if entities.len() < 2 {
478        return Vec::new();
479    }
480
481    // v1.0.22: cap configurável via env var (constants::max_relationships_per_memory).
482    // Permite usuários com corpus denso aumentar além do default 50.
483    let max_rels = crate::constants::max_relationships_per_memory();
484    let n = entities.len().min(MAX_ENTS);
485    let mut rels: Vec<NewRelationship> = Vec::new();
486    let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
487
488    let mut hit_cap = false;
489    'outer: for i in 0..n {
490        if rels.len() >= max_rels {
491            hit_cap = true;
492            break;
493        }
494
495        let mut for_entity = 0usize;
496        for j in (i + 1)..n {
497            if for_entity >= TOP_K_RELATIONS {
498                break;
499            }
500            if rels.len() >= max_rels {
501                hit_cap = true;
502                break 'outer;
503            }
504
505            let src = &entities[i].name;
506            let tgt = &entities[j].name;
507            let key = (src.clone(), tgt.clone());
508
509            if seen.contains(&key) {
510                continue;
511            }
512            seen.insert(key);
513
514            rels.push(NewRelationship {
515                source: src.clone(),
516                target: tgt.clone(),
517                relation: DEFAULT_RELATION.to_string(),
518                strength: 0.5,
519                description: None,
520            });
521            for_entity += 1;
522        }
523    }
524
525    // v1.0.20: avisar quando relacionamentos foram truncados antes de cobrir todos os pares possíveis.
526    if hit_cap {
527        tracing::warn!(
528            "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
529            n.saturating_sub(1)
530        );
531    }
532
533    rels
534}
535
536fn run_ner_sliding_window(
537    model: &BertNerModel,
538    body: &str,
539    paths: &AppPaths,
540) -> Result<Vec<ExtractedEntity>> {
541    let tokenizer_path = model_dir(paths).join("tokenizer.json");
542    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
543        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;
544
545    let encoding = tokenizer
546        .encode(body, false)
547        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;
548
549    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
550    let all_tokens: Vec<String> = encoding
551        .get_tokens()
552        .iter()
553        .map(|s| s.to_string())
554        .collect();
555
556    if all_ids.is_empty() {
557        return Ok(Vec::new());
558    }
559
560    let mut entities: Vec<ExtractedEntity> = Vec::new();
561    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
562
563    let mut start = 0usize;
564    loop {
565        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
566        let window_ids = &all_ids[start..end];
567        let window_tokens = &all_tokens[start..end];
568        let attention_mask: Vec<u32> = vec![1u32; window_ids.len()];
569
570        match model.predict(window_ids, &attention_mask) {
571            Ok(labels) => {
572                let window_ents = iob_to_entities(window_tokens, &labels);
573                for ent in window_ents {
574                    if seen.insert(ent.name.clone()) {
575                        entities.push(ent);
576                    }
577                }
578            }
579            Err(e) => {
580                tracing::warn!("janela NER falhou (start={start}): {e:#}");
581            }
582        }
583
584        if end >= all_ids.len() {
585            break;
586        }
587        start += STRIDE;
588    }
589
590    Ok(entities)
591}
592
593/// v1.0.22 P1: estende entidades com sufixos numéricos hifenizados ou separados por espaço.
594/// Casos: GPT extraído mas body contém "GPT-5" → reescreve para "GPT-5".
595/// Casos: Claude extraído mas body contém "Claude 4" → reescreve para "Claude 4".
596/// Conservador: só estende se sufixo tiver até 6 caracteres e for puramente numérico.
597fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
598    static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
599    let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?)").unwrap());
600
601    entities
602        .into_iter()
603        .map(|ent| {
604            // Encontra a primeira ocorrência case-sensitive da entidade no body
605            if let Some(pos) = body.find(&ent.name) {
606                let after_pos = pos + ent.name.len();
607                if after_pos < body.len() {
608                    let after = &body[after_pos..];
609                    if let Some(m) = suffix_re.find(after) {
610                        let suffix = m.as_str();
611                        // Conservador: limita comprimento total do sufixo a 6 chars
612                        if suffix.len() <= 6 {
613                            let extended = format!("{}{}", ent.name, suffix);
614                            return ExtractedEntity {
615                                name: extended,
616                                entity_type: ent.entity_type,
617                            };
618                        }
619                    }
620                }
621            }
622            ent
623        })
624        .collect()
625}
626
627fn merge_and_deduplicate(
628    regex_ents: Vec<ExtractedEntity>,
629    ner_ents: Vec<ExtractedEntity>,
630) -> Vec<ExtractedEntity> {
631    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
632    let mut result: Vec<ExtractedEntity> = Vec::new();
633    let mut truncated = false;
634
635    let total_input = regex_ents.len() + ner_ents.len();
636    for ent in regex_ents.into_iter().chain(ner_ents) {
637        let key = ent.name.to_lowercase();
638        if seen.insert(key) {
639            result.push(ent);
640        }
641        if result.len() >= MAX_ENTS {
642            truncated = true;
643            break;
644        }
645    }
646
647    // v1.0.20: avisar quando truncamento silencioso descarta entidades acima do MAX_ENTS.
648    if truncated {
649        tracing::warn!(
650            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
651        );
652    }
653
654    result
655}
656
657fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
658    extracted
659        .into_iter()
660        .map(|e| NewEntity {
661            name: e.name,
662            entity_type: e.entity_type,
663            description: None,
664        })
665        .collect()
666}
667
668pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
669    let regex_entities = apply_regex_prefilter(body);
670
671    let mut bert_used = false;
672    let ner_entities = match get_or_init_model(paths) {
673        Some(model) => match run_ner_sliding_window(model, body, paths) {
674            Ok(ents) => {
675                bert_used = true;
676                ents
677            }
678            Err(e) => {
679                tracing::warn!("NER falhou, usando apenas regex: {e:#}");
680                Vec::new()
681            }
682        },
683        None => Vec::new(),
684    };
685
686    let merged = merge_and_deduplicate(regex_entities, ner_entities);
687    // v1.0.22: estender entidades NER com sufixos numéricos do body (GPT-5, Claude 4, Python 3).
688    let extended = extend_with_numeric_suffix(merged, body);
689    let entities = to_new_entities(extended);
690    let relationships = build_relationships(&entities);
691
692    let extraction_method = if bert_used {
693        "bert+regex".to_string()
694    } else {
695        "regex-only".to_string()
696    };
697
698    Ok(ExtractionResult {
699        entities,
700        relationships,
701        extraction_method,
702    })
703}
704
705pub struct RegexExtractor;
706
707impl Extractor for RegexExtractor {
708    fn extract(&self, body: &str) -> Result<ExtractionResult> {
709        let regex_entities = apply_regex_prefilter(body);
710        let entities = to_new_entities(regex_entities);
711        let relationships = build_relationships(&entities);
712        Ok(ExtractionResult {
713            entities,
714            relationships,
715            extraction_method: "regex-only".to_string(),
716        })
717    }
718}
719
// Unit tests for the regex pass, IOB decoding, relationship building and merging.
#[cfg(test)]
mod tests {
    use super::*;

    // Paths pointing at fixed locations under /tmp.
    fn make_paths() -> AppPaths {
        use std::path::PathBuf;
        AppPaths {
            db: PathBuf::from("/tmp/test.sqlite"),
            models: PathBuf::from("/tmp/test_models"),
        }
    }

    #[test]
    fn regex_email_captura_endereco() {
        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
        // v1.0.20: e-mails are classified as "concept" (a regex alone cannot tell a person from a role).
        assert!(ents
            .iter()
            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_filtra_palavra_regra_pt() {
        // v1.0.20 P1 fix: NUNCA, PROIBIDO and DEVE must not become "entities".
        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
        assert!(
            !ents.iter().any(|e| e.name == "NUNCA"),
            "NUNCA deveria ser filtrado como stopword"
        );
        assert!(
            !ents.iter().any(|e| e.name == "PROIBIDO"),
            "PROIBIDO deveria ser filtrado"
        );
        assert!(
            !ents.iter().any(|e| e.name == "DEVE"),
            "DEVE deveria ser filtrado"
        );
    }

    #[test]
    fn regex_all_caps_aceita_constante_com_underscore() {
        // Technical constants such as MAX_RETRY and TIMEOUT_MS must always be accepted.
        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
    }

    #[test]
    fn regex_all_caps_aceita_acronimo_dominio() {
        // Legitimate (non-stopword) acronyms must pass: OPENAI, NVIDIA, GOOGLE.
        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
        assert!(ents.iter().any(|e| e.name == "OPENAI"));
        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
    }

    #[test]
    fn regex_url_captura_link() {
        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
        assert!(ents
            .iter()
            .any(|e| e.name.starts_with("https://") && e.entity_type == "concept"));
    }

    #[test]
    fn regex_uuid_captura_identificador() {
        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
        assert!(ents.iter().any(|e| e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_captura_constante() {
        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
    }

    #[test]
    fn regex_all_caps_ignora_palavras_curtas() {
        let ents = apply_regex_prefilter("use AI em seu projeto");
        assert!(
            !ents.iter().any(|e| e.name == "AI"),
            "AI tem apenas 2 chars, deve ser ignorado"
        );
    }

    #[test]
    fn iob_decodifica_per_para_person() {
        let tokens = vec![
            "John".to_string(),
            "Doe".to_string(),
            "trabalhou".to_string(),
        ];
        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1);
        assert_eq!(ents[0].entity_type, "person");
        assert!(ents[0].name.contains("John"));
    }

    #[test]
    fn iob_strip_subword_b_prefix() {
        // v1.0.21 P0: BERT sometimes emits ##AI with a B- prefix (confused subword).
        // It must be merged into the active entity instead of creating a phantom "##AI" entity.
        let tokens = vec!["Open".to_string(), "##AI".to_string()];
        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
            "deveria mergear ##AI ou descartar"
        );
    }

    #[test]
    fn iob_subword_orphan_descarta() {
        // v1.0.21 P0: an orphan subword with no active entity must not become an entity.
        let tokens = vec!["##AI".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.is_empty(),
            "subword órfão sem entidade ativa deve ser descartado"
        );
    }

    #[test]
    fn iob_descarta_date() {
        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(ents.is_empty(), "DATE deve ser descartado");
    }

    #[test]
    fn iob_mapeia_org_para_project() {
        let tokens = vec!["Empresa".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "project");
    }

    #[test]
    fn iob_mapeia_org_sdk_para_tool() {
        let tokens = vec!["tokio-sdk".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "tool");
    }

    #[test]
    fn iob_mapeia_loc_para_concept() {
        let tokens = vec!["Brasil".to_string()];
        let labels = vec!["B-LOC".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "concept");
    }

    #[test]
    fn build_relationships_respeitam_max_rels() {
        let entities: Vec<NewEntity> = (0..20)
            .map(|i| NewEntity {
                name: format!("entidade_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
    }

    #[test]
    fn build_relationships_sem_duplicatas() {
        let entities: Vec<NewEntity> = (0..5)
            .map(|i| NewEntity {
                name: format!("ent_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let mut pares: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        for r in &rels {
            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "par duplicado encontrado");
        }
    }

    #[test]
    fn merge_deduplica_por_nome_lowercase() {
        let a = vec![ExtractedEntity {
            name: "Rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let b = vec![ExtractedEntity {
            name: "rust".to_string(),
            entity_type: "tool".to_string(),
        }];
        let merged = merge_and_deduplicate(a, b);
        assert_eq!(merged.len(), 1, "rust e Rust são a mesma entidade");
    }

    #[test]
    fn regex_extractor_implementa_trait() {
        let extractor = RegexExtractor;
        let result = extractor
            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
            .unwrap();
        assert!(!result.entities.is_empty());
    }

    #[test]
    fn extract_retorna_ok_sem_modelo() {
        // Without a downloaded model, the call must still return Ok with only the regex entities.
        // NOTE(review): when the model files are missing, extract_graph_auto goes through
        // ensure_model_files, which attempts a download; confirm this is acceptable in tests.
        let paths = make_paths();
        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
        let result = extract_graph_auto(body, &paths).unwrap();
        assert!(result
            .entities
            .iter()
            .any(|e| e.name.contains("teste@exemplo.com")));
    }
}
sqlite_graphrag/extraction.rs

sqlite_graphrag/
extraction.rs