1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11use unicode_normalization::UnicodeNormalization;
12
13use crate::paths::AppPaths;
14use crate::storage::entities::{NewEntity, NewRelationship};
15
/// Hugging Face Hub repo of the multilingual BERT NER model downloaded on first run.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Hard BERT input limit, in subword tokens, per inference window.
const MAX_SEQ_LEN: usize = 512;
/// Sliding-window step; half of MAX_SEQ_LEN, so consecutive windows overlap 50%
/// and entities cut at a window boundary are seen whole in the next window.
const STRIDE: usize = 256;
/// Cap on entities kept per memory after merging/deduplication.
const MAX_ENTS: usize = 30;
/// Max relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for every co-occurrence edge.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum entity name length (in bytes) to be kept.
const MIN_ENTITY_CHARS: usize = 2;

// Lazily-compiled, process-wide regexes (compiled once on first use).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
32
/// ALL-CAPS tokens that look like acronyms/constants but are actually
/// Portuguese/English prose emphasis or generic tech nouns — filtered out of
/// the ALL-CAPS entity candidates. Checked via linear `contains`, so order is
/// irrelevant for behavior (kept as-is for diff stability).
#[rustfmt::skip]
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE", "ACK", "ACL", "ACRESCENTADO", "ADICIONAR", "AGENTS", "ALL",
    "ALTA", "ALWAYS", "API", "ARTEFATOS", "ATIVO", "BAIXA", "BANCO",
    "BORDA", "BLOQUEAR", "BUG", "CAPÍTULO", "CASO", "CHECKLIST", "CLI",
    "COMPLETED", "CONFIRMADO", "CONFIRME", "CONTRATO", "CRÍTICO",
    "CRITICAL", "CSV", "DEVE", "DEVEMOS", "DISCO", "DONE", "EFEITO",
    "ENTRADA", "ERROR", "ESSA", "ESSE", "ESSENCIAL", "ESTA", "ESTE",
    "ETAPA", "EVITAR", "EXPANDIR", "EXPOR", "FALHA", "FASE", "FIXED",
    "FIXME", "FORBIDDEN", "HACK", "HEARTBEAT", "HTTP", "HTTPS", "INATIVO",
    "JAMAIS", "JSON", "JWT", "LLM", "MUST", "NEGUE", "NEVER", "NOTE",
    "NUNCA", "OBRIGATÓRIO", "PADRÃO", "PASSO", "PENDING", "PLAN",
    "PODEMOS", "PROIBIDO", "RECUSE", "REGRAS", "REQUIRED", "REQUISITO",
    "REST", "SEÇÃO", "SEMPRE", "SHALL", "SHOULD", "SOUL", "TODAS", "TODO",
    "TODOS", "TOKEN", "TOOLS", "TSV", "UI", "URL", "USAR", "VALIDAR",
    "VAMOS", "VOCÊ", "WARNING", "XML", "YAML",
];
143
/// HTTP verbs — common in API docs, never meaningful as entities on their own.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
149
150fn is_filtered_all_caps(token: &str) -> bool {
151 let is_identifier = token.contains('_');
153 if is_identifier {
154 return false;
155 }
156 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
157}
158
159fn regex_email() -> &'static Regex {
160 REGEX_EMAIL
161 .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
162}
163
164fn regex_url() -> &'static Regex {
165 REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
166}
167
168fn regex_uuid() -> &'static Regex {
169 REGEX_UUID.get_or_init(|| {
170 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
171 .unwrap()
172 })
173}
174
175fn regex_all_caps() -> &'static Regex {
176 REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
177}
178
179fn regex_section_marker() -> &'static Regex {
180 REGEX_SECTION_MARKER.get_or_init(|| {
181 Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
183 })
184}
185
186fn regex_brand_camel() -> &'static Regex {
187 REGEX_BRAND_CAMEL.get_or_init(|| {
188 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
191 })
192}
193
/// Entity candidate produced by the regex prefilter or by NER decoding,
/// before conversion into a storage `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    // Surface form as found in the text (trimmed).
    pub name: String,
    // Coarse category: "person", "organization", "location", "date", "tool",
    // "concept" — or a raw IOB type passed through unmapped (see iob_to_entities).
    pub entity_type: String,
}

/// A URL found in the body, with its byte offset in the original text.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    pub url: String,
    // Byte offset of the (untrimmed) match start within the input body.
    pub offset: usize,
}

/// Full output of one extraction pass over a memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    // True when the relationship cap was hit and edges were dropped.
    pub relationships_truncated: bool,
    // "bert+regex-batch" or "regex-only", for observability.
    pub extraction_method: String,
    pub urls: Vec<ExtractedUrl>,
}

/// Strategy interface so callers can swap NER-backed and regex-only extraction.
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}

/// Subset of the HF `config.json` this module needs.
#[derive(Deserialize)]
struct ModelConfig {
    // Label map keyed by stringified index, e.g. {"0": "O", "1": "B-PER", ...}.
    // Defaults to empty when absent; load() then falls back to 9 labels.
    #[serde(default)]
    id2label: HashMap<String, String>,
    hidden_size: usize,
}

/// BERT encoder plus a manually-loaded token-classification head.
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    device: Device,
    // Numeric class index -> IOB label string (e.g. 3 -> "B-ORG").
    id2label: HashMap<usize, String>,
}
239
240impl BertNerModel {
241 fn load(model_dir: &Path) -> Result<Self> {
242 let config_path = model_dir.join("config.json");
243 let weights_path = model_dir.join("model.safetensors");
244
245 let config_str = std::fs::read_to_string(&config_path)
246 .with_context(|| format!("lendo config.json em {config_path:?}"))?;
247 let model_cfg: ModelConfig =
248 serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
249
250 let id2label: HashMap<usize, String> = model_cfg
251 .id2label
252 .into_iter()
253 .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
254 .collect();
255
256 let num_labels = id2label.len().max(9);
257 let hidden_size = model_cfg.hidden_size;
258
259 let bert_config_str = std::fs::read_to_string(&config_path)
260 .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
261 let bert_cfg: BertConfig =
262 serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
263
264 let device = Device::Cpu;
265
266 let vb = unsafe {
267 VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
268 .with_context(|| format!("mapeando {weights_path:?}"))?
269 };
270 let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
271
272 let cls_vb = vb.pp("classifier");
275 let weight = cls_vb
276 .get((num_labels, hidden_size), "weight")
277 .context("carregando classifier.weight do safetensors")?;
278 let bias = cls_vb
279 .get(num_labels, "bias")
280 .context("carregando classifier.bias do safetensors")?;
281 let classifier = Linear::new(weight, Some(bias));
282
283 Ok(Self {
284 bert,
285 classifier,
286 device,
287 id2label,
288 })
289 }
290
291 fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
292 let len = token_ids.len();
293 let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
294 let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
295
296 let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
297 .context("criando tensor input_ids")?;
298 let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
299 .context("criando tensor token_type_ids")?;
300 let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
301 .context("criando tensor attention_mask")?;
302
303 let sequence_output = self
304 .bert
305 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
306 .context("forward pass do BertModel")?;
307
308 let logits = self
309 .classifier
310 .forward(&sequence_output)
311 .context("forward pass do classificador")?;
312
313 let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
314
315 let num_tokens = logits_2d.dim(0).context("dim(0)")?;
316
317 let mut labels = Vec::with_capacity(num_tokens);
318 for i in 0..num_tokens {
319 let token_logits = logits_2d.get(i).context("get token logits")?;
320 let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
321 let argmax = vec
322 .iter()
323 .enumerate()
324 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
325 .map(|(idx, _)| idx)
326 .unwrap_or(0);
327 let label = self
328 .id2label
329 .get(&argmax)
330 .cloned()
331 .unwrap_or_else(|| "O".to_string());
332 labels.push(label);
333 }
334
335 Ok(labels)
336 }
337
338 fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
347 let batch_size = windows.len();
348 let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
349 if max_len == 0 {
350 return Ok(vec![vec![]; batch_size]);
351 }
352
353 let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
354 let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);
355
356 for (ids, _) in windows {
357 let len = ids.len();
358 let pad_right = max_len - len;
359
360 let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
361 let t = Tensor::from_vec(ids_i64, len, &self.device)
363 .context("criando tensor de ids para batch")?;
364 let t = t
365 .pad_with_zeros(0, 0, pad_right)
366 .context("padding tensor de ids")?;
367 padded_ids.push(t);
368
369 let mut mask_i64 = vec![1i64; len];
371 mask_i64.extend(vec![0i64; pad_right]);
372 let m = Tensor::from_vec(mask_i64, max_len, &self.device)
373 .context("criando tensor de máscara para batch")?;
374 padded_masks.push(m);
375 }
376
377 let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
379 let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
380 let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
381 .context("criando token_type_ids batch")?;
382
383 let sequence_output = self
385 .bert
386 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
387 .context("forward pass batch BertModel")?;
388 let logits = self
391 .classifier
392 .forward(&sequence_output)
393 .context("forward pass batch classificador")?;
394 let mut results = Vec::with_capacity(batch_size);
397 for (i, (window_ids, _)) in windows.iter().enumerate() {
398 let example_logits = logits.get(i).context("get logits exemplo")?;
399 let real_len = window_ids.len();
401 let example_slice = example_logits
402 .narrow(0, 0, real_len)
403 .context("narrow para tokens reais")?;
404 let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;
405
406 let labels: Vec<String> = logits_2d
407 .iter()
408 .map(|token_logits| {
409 let argmax = token_logits
410 .iter()
411 .enumerate()
412 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
413 .map(|(idx, _)| idx)
414 .unwrap_or(0);
415 self.id2label
416 .get(&argmax)
417 .cloned()
418 .unwrap_or_else(|| "O".to_string())
419 })
420 .collect();
421
422 results.push(labels);
423 }
424
425 Ok(results)
426 }
427}
428
429static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
430
431fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
432 NER_MODEL
433 .get_or_init(|| match load_model(paths) {
434 Ok(m) => Some(m),
435 Err(e) => {
436 tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
437 None
438 }
439 })
440 .as_ref()
441}
442
443fn model_dir(paths: &AppPaths) -> PathBuf {
444 paths.models.join("bert-multilingual-ner")
445}
446
447fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
448 let dir = model_dir(paths);
449 std::fs::create_dir_all(&dir)
450 .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
451
452 let weights = dir.join("model.safetensors");
453 let config = dir.join("config.json");
454 let tokenizer = dir.join("tokenizer.json");
455
456 if weights.exists() && config.exists() && tokenizer.exists() {
457 return Ok(dir);
458 }
459
460 tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
461 crate::output::emit_progress_i18n(
462 "Downloading NER model (first run, ~676 MB)...",
463 "Baixando modelo NER (primeira execução, ~676 MB)...",
464 );
465
466 let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
467 let repo = api.model(MODEL_ID.to_string());
468
469 for (remote, local) in &[
473 ("model.safetensors", "model.safetensors"),
474 ("config.json", "config.json"),
475 ("onnx/tokenizer.json", "tokenizer.json"),
476 ("tokenizer_config.json", "tokenizer_config.json"),
477 ] {
478 let dest = dir.join(local);
479 if !dest.exists() {
480 let src = repo
481 .get(remote)
482 .with_context(|| format!("baixando {remote} do HF Hub"))?;
483 std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
484 }
485 }
486
487 Ok(dir)
488}
489
490fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
491 let dir = ensure_model_files(paths)?;
492 BertNerModel::load(&dir)
493}
494
495fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
496 let mut entities = Vec::new();
497 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
498
499 let add = |entities: &mut Vec<ExtractedEntity>,
500 seen: &mut std::collections::HashSet<String>,
501 name: &str,
502 entity_type: &str| {
503 let name = name.trim().to_string();
504 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
505 entities.push(ExtractedEntity {
506 name,
507 entity_type: entity_type.to_string(),
508 });
509 }
510 };
511
512 let cleaned = regex_section_marker().replace_all(body, " ");
515 let cleaned = cleaned.as_ref();
516
517 for m in regex_email().find_iter(cleaned) {
518 add(&mut entities, &mut seen, m.as_str(), "concept");
520 }
521 for m in regex_uuid().find_iter(cleaned) {
522 add(&mut entities, &mut seen, m.as_str(), "concept");
523 }
524 for m in regex_all_caps().find_iter(cleaned) {
525 let candidate = m.as_str();
526 if !is_filtered_all_caps(candidate) {
528 add(&mut entities, &mut seen, candidate, "concept");
529 }
530 }
531 for m in regex_brand_camel().find_iter(cleaned) {
534 let name = m.as_str();
535 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
537 add(&mut entities, &mut seen, name, "organization");
538 }
539 }
540
541 entities
542}
543
544pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
548 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
549 let mut result = Vec::new();
550 for m in regex_url().find_iter(body) {
551 let raw = m.as_str();
552 let cleaned = raw
553 .trim_end_matches('`')
554 .trim_end_matches(',')
555 .trim_end_matches('.')
556 .trim_end_matches(';')
557 .trim_end_matches(')')
558 .trim_end_matches(']')
559 .trim_end_matches('}');
560 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
561 result.push(ExtractedUrl {
562 url: cleaned.to_string(),
563 offset: m.start(),
564 });
565 }
566 }
567 result
568}
569
/// Decodes parallel (token, IOB2 label) streams into entities.
///
/// Merges WordPiece "##" subwords into the previous part, maps model types
/// (PER/ORG/LOC/DATE) onto storage types, filters stopword ALL-CAPS names,
/// section markers, and a short list of Portuguese verbs commonly mislabeled
/// as PER. Unknown types are passed through unchanged.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    // Accumulator for the entity currently being built.
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Closes the current entity: joins parts with spaces, applies the final
    // filters, and clears the accumulator. No-op when no entity is open.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // Single ALL-CAPS word → subject to the stopword filter.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                let is_section_marker = regex_section_marker().is_match(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        // "O" (outside) ends any open entity.
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        // Split "B-XXX"/"I-XXX"; any other label shape also closes the entity.
        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // Capitalized Portuguese verbs/short words the model tends to tag as PER.
        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
        ];

        let entity_type = match bio_type {
            "DATE" => "date",
            "PER" => {
                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
                    flush(&mut current_parts, &mut current_type, &mut entities);
                    continue;
                }
                "person"
            }
            "ORG" => {
                // ORG tokens that look like software artifacts become "tool".
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    "organization"
                }
            }
            "LOC" => "location",
            // Unknown model types are passed through as-is.
            other => other,
        };

        if prefix == "B" {
            // A "B-" on a "##" subword continues the previous word instead of
            // starting a new entity; orphan subwords (no open entity) are dropped.
            if token.starts_with("##") {
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // "I-" continues the open entity: subwords glue onto the last part,
            // full tokens become a new space-separated part.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Close any entity still open at end of input.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
681
682fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
686 if entities.len() < 2 {
687 return (Vec::new(), false);
688 }
689
690 let max_rels = crate::constants::max_relationships_per_memory();
693 let n = entities.len().min(MAX_ENTS);
694 let mut rels: Vec<NewRelationship> = Vec::new();
695 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
696
697 let mut hit_cap = false;
698 'outer: for i in 0..n {
699 if rels.len() >= max_rels {
700 hit_cap = true;
701 break;
702 }
703
704 let mut for_entity = 0usize;
705 for j in (i + 1)..n {
706 if for_entity >= TOP_K_RELATIONS {
707 break;
708 }
709 if rels.len() >= max_rels {
710 hit_cap = true;
711 break 'outer;
712 }
713
714 let src = &entities[i].name;
715 let tgt = &entities[j].name;
716 let key = (src.clone(), tgt.clone());
717
718 if seen.contains(&key) {
719 continue;
720 }
721 seen.insert(key);
722
723 rels.push(NewRelationship {
724 source: src.clone(),
725 target: tgt.clone(),
726 relation: DEFAULT_RELATION.to_string(),
727 strength: 0.5,
728 description: None,
729 });
730 for_entity += 1;
731 }
732 }
733
734 if hit_cap {
736 tracing::warn!(
737 "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
738 n.saturating_sub(1)
739 );
740 }
741
742 (rels, hit_cap)
743}
744
/// Runs NER over `body` with overlapping sliding windows and returns the
/// deduplicated entities.
///
/// Tokenizes the whole body once (no special tokens), splits the token stream
/// into MAX_SEQ_LEN windows advancing by STRIDE (50% overlap, so entities cut
/// at a boundary appear whole in the next window), and runs windows in
/// batches. If a whole batch fails, each window is retried individually;
/// windows that still fail are logged and skipped.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // `false` = no special tokens ([CLS]/[SEP]) added.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Build overlapping (ids, tokens) windows over the full token stream.
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Group similar lengths together to minimize padding waste per batch.
    // Window order doesn't matter: entities are deduplicated by name below.
    windows.sort_by_key(|(ids, _)| ids.len());

    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                // Batch failed as a whole — retry each window on its own so one
                // bad window doesn't discard the entire chunk.
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                for (ids, tokens) in chunk {
                    // No padding in single-window mode → mask is all ones.
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            tracing::warn!("janela NER fallback também falhou: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
830
831fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
838 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
839 let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
842
843 entities
844 .into_iter()
845 .map(|ent| {
846 if let Some(pos) = body.find(&ent.name) {
848 let after_pos = pos + ent.name.len();
849 if after_pos < body.len() {
850 let after = &body[after_pos..];
851 if let Some(m) = suffix_re.find(after) {
852 let suffix = m.as_str();
853 if suffix.len() <= 7 {
856 let extended = format!("{}{}", ent.name, suffix);
857 return ExtractedEntity {
858 name: extended,
859 entity_type: ent.entity_type,
860 };
861 }
862 }
863 }
864 }
865 ent
866 })
867 .collect()
868}
869
870fn augment_versioned_model_names(
890 entities: Vec<ExtractedEntity>,
891 body: &str,
892) -> Vec<ExtractedEntity> {
893 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
894 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
901 Regex::new(
902 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
903 )
904 .unwrap()
905 });
906
907 let mut existing_lc: std::collections::HashSet<String> =
908 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
909 let mut result = entities;
910
911 for caps in model_re.captures_iter(body) {
912 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
913 if full_match.is_empty() || full_match.len() > 24 {
916 continue;
917 }
918 let normalized_lc = full_match.to_lowercase();
919 if existing_lc.contains(&normalized_lc) {
920 continue;
921 }
922 if result.len() >= MAX_ENTS {
925 break;
926 }
927 existing_lc.insert(normalized_lc);
928 result.push(ExtractedEntity {
929 name: full_match.to_string(),
930 entity_type: "concept".to_string(),
931 });
932 }
933
934 result
935}
936
/// Merges regex- and NER-derived candidates, deduplicating by
/// (type, NFKC-lowercased name) and by substring containment within the same
/// type — e.g. "GPT" and "GPT-4" collapse, keeping the longer surface form.
/// Output is capped at `MAX_ENTS`; regex entities get priority by being
/// chained first.
fn merge_and_deduplicate(
    regex_ents: Vec<ExtractedEntity>,
    ner_ents: Vec<ExtractedEntity>,
) -> Vec<ExtractedEntity> {
    // key = "<type>\0<nfkc-lowercased name>" -> index into `result`.
    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut result: Vec<ExtractedEntity> = Vec::new();
    let mut truncated = false;

    let total_input = regex_ents.len() + ner_ents.len();
    for ent in regex_ents.into_iter().chain(ner_ents) {
        // NFKC normalization so visually-equivalent Unicode forms compare equal.
        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
        let key = format!("{}\0{}", ent.entity_type, name_lc);

        // Linear scan for an equal-or-containing name of the same type.
        // O(n²) overall, but n is bounded by MAX_ENTS so it stays cheap.
        let mut collision_idx: Option<usize> = None;
        for (existing_key, idx) in &by_lc {
            let type_prefix = format!("{}\0", ent.entity_type);
            if !existing_key.starts_with(&type_prefix) {
                continue;
            }
            let existing_name_lc = &existing_key[type_prefix.len()..];
            if existing_name_lc == name_lc
                || existing_name_lc.contains(name_lc.as_str())
                || name_lc.contains(existing_name_lc)
            {
                collision_idx = Some(*idx);
                break;
            }
        }
        match collision_idx {
            Some(idx) => {
                // On collision keep the longer surface form; re-key the slot so
                // the map keeps pointing at the entity actually stored.
                if ent.name.len() > result[idx].name.len() {
                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
                    by_lc.remove(&old_key);
                    result[idx] = ent;
                    by_lc.insert(key, idx);
                }
            }
            None => {
                by_lc.insert(key, result.len());
                result.push(ent);
            }
        }
        if result.len() >= MAX_ENTS {
            truncated = true;
            break;
        }
    }

    if truncated {
        tracing::warn!(
            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
        );
    }

    result
}
1019
1020fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1021 extracted
1022 .into_iter()
1023 .map(|e| NewEntity {
1024 name: e.name,
1025 entity_type: e.entity_type,
1026 description: None,
1027 })
1028 .collect()
1029}
1030
1031pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1032 let regex_entities = apply_regex_prefilter(body);
1033
1034 let mut bert_used = false;
1035 let ner_entities = match get_or_init_model(paths) {
1036 Some(model) => match run_ner_sliding_window(model, body, paths) {
1037 Ok(ents) => {
1038 bert_used = true;
1039 ents
1040 }
1041 Err(e) => {
1042 tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1043 Vec::new()
1044 }
1045 },
1046 None => Vec::new(),
1047 };
1048
1049 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1050 let extended = extend_with_numeric_suffix(merged, body);
1052 let with_models = augment_versioned_model_names(extended, body);
1056 let with_models: Vec<ExtractedEntity> = with_models
1060 .into_iter()
1061 .filter(|e| !regex_section_marker().is_match(&e.name))
1062 .collect();
1063 let entities = to_new_entities(with_models);
1064 let (relationships, relationships_truncated) = build_relationships(&entities);
1065
1066 let extraction_method = if bert_used {
1067 "bert+regex-batch".to_string()
1068 } else {
1069 "regex-only".to_string()
1070 };
1071
1072 let urls = extract_urls(body);
1073
1074 Ok(ExtractionResult {
1075 entities,
1076 relationships,
1077 relationships_truncated,
1078 extraction_method,
1079 urls,
1080 })
1081}
1082
1083pub struct RegexExtractor;
1084
1085impl Extractor for RegexExtractor {
1086 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1087 let regex_entities = apply_regex_prefilter(body);
1088 let entities = to_new_entities(regex_entities);
1089 let (relationships, relationships_truncated) = build_relationships(&entities);
1090 let urls = extract_urls(body);
1091 Ok(ExtractionResult {
1092 entities,
1093 relationships,
1094 relationships_truncated,
1095 extraction_method: "regex-only".to_string(),
1096 urls,
1097 })
1098 }
1099}
1100
1101#[cfg(test)]
1102mod tests {
1103 use super::*;
1104
1105 fn make_paths() -> AppPaths {
1106 use std::path::PathBuf;
1107 AppPaths {
1108 db: PathBuf::from("/tmp/test.sqlite"),
1109 models: PathBuf::from("/tmp/test_models"),
1110 }
1111 }
1112
1113 #[test]
1114 fn regex_email_captura_endereco() {
1115 let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
1116 assert!(ents
1118 .iter()
1119 .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
1120 }
1121
1122 #[test]
1123 fn regex_all_caps_filtra_palavra_regra_pt() {
1124 let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
1126 assert!(
1127 !ents.iter().any(|e| e.name == "NUNCA"),
1128 "NUNCA deveria ser filtrado como stopword"
1129 );
1130 assert!(
1131 !ents.iter().any(|e| e.name == "PROIBIDO"),
1132 "PROIBIDO deveria ser filtrado"
1133 );
1134 assert!(
1135 !ents.iter().any(|e| e.name == "DEVE"),
1136 "DEVE deveria ser filtrado"
1137 );
1138 }
1139
1140 #[test]
1141 fn regex_all_caps_aceita_constante_com_underscore() {
1142 let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
1144 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1145 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1146 }
1147
1148 #[test]
1149 fn regex_all_caps_aceita_acronimo_dominio() {
1150 let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
1152 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1153 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1154 }
1155
1156 #[test]
1157 fn regex_url_nao_aparece_em_apply_regex_prefilter() {
1158 let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
1160 assert!(
1161 !ents.iter().any(|e| e.name.starts_with("https://")),
1162 "URLs não devem aparecer como entidades após split P0-2"
1163 );
1164 }
1165
1166 #[test]
1167 fn extract_urls_captura_https() {
1168 let urls = extract_urls("veja https://docs.rs/crate para detalhes");
1169 assert_eq!(urls.len(), 1);
1170 assert_eq!(urls[0].url, "https://docs.rs/crate");
1171 assert!(urls[0].offset > 0);
1172 }
1173
1174 #[test]
1175 fn extract_urls_trim_sufixo_pontuacao() {
1176 let urls = extract_urls("link: https://example.com/path. fim");
1177 assert!(!urls.is_empty());
1178 assert!(
1179 !urls[0].url.ends_with('.'),
1180 "sufixo ponto deve ser removido"
1181 );
1182 }
1183
1184 #[test]
1185 fn extract_urls_deduplica_repetidas() {
1186 let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1187 let urls = extract_urls(body);
1188 assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1189 }
1190
1191 #[test]
1192 fn regex_uuid_captura_identificador() {
1193 let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1194 assert!(ents.iter().any(|e| e.entity_type == "concept"));
1195 }
1196
1197 #[test]
1198 fn regex_all_caps_captura_constante() {
1199 let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1200 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1201 assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1202 }
1203
1204 #[test]
1205 fn regex_all_caps_ignora_palavras_curtas() {
1206 let ents = apply_regex_prefilter("use AI em seu projeto");
1207 assert!(
1208 !ents.iter().any(|e| e.name == "AI"),
1209 "AI tem apenas 2 chars, deve ser ignorado"
1210 );
1211 }
1212
1213 #[test]
1214 fn iob_decodifica_per_para_person() {
1215 let tokens = vec![
1216 "John".to_string(),
1217 "Doe".to_string(),
1218 "trabalhou".to_string(),
1219 ];
1220 let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
1221 let ents = iob_to_entities(&tokens, &labels);
1222 assert_eq!(ents.len(), 1);
1223 assert_eq!(ents[0].entity_type, "person");
1224 assert!(ents[0].name.contains("John"));
1225 }
1226
1227 #[test]
1228 fn iob_strip_subword_b_prefix() {
1229 let tokens = vec!["Open".to_string(), "##AI".to_string()];
1232 let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
1233 let ents = iob_to_entities(&tokens, &labels);
1234 assert!(
1235 ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
1236 "deveria mergear ##AI ou descartar"
1237 );
1238 }
1239
1240 #[test]
1241 fn iob_subword_orphan_descarta() {
1242 let tokens = vec!["##AI".to_string()];
1244 let labels = vec!["B-ORG".to_string()];
1245 let ents = iob_to_entities(&tokens, &labels);
1246 assert!(
1247 ents.is_empty(),
1248 "subword órfão sem entidade ativa deve ser descartado"
1249 );
1250 }
1251
1252 #[test]
1253 fn iob_mapeia_date_para_date_v1025() {
1254 let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
1256 let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
1257 let ents = iob_to_entities(&tokens, &labels);
1258 assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
1259 assert_eq!(ents[0].entity_type, "date");
1260 }
1261
1262 #[test]
1263 fn iob_mapeia_org_para_organization_v1025() {
1264 let tokens = vec!["Empresa".to_string()];
1266 let labels = vec!["B-ORG".to_string()];
1267 let ents = iob_to_entities(&tokens, &labels);
1268 assert_eq!(ents[0].entity_type, "organization");
1269 }
1270
1271 #[test]
1272 fn iob_mapeia_org_sdk_para_tool() {
1273 let tokens = vec!["tokio-sdk".to_string()];
1274 let labels = vec!["B-ORG".to_string()];
1275 let ents = iob_to_entities(&tokens, &labels);
1276 assert_eq!(ents[0].entity_type, "tool");
1277 }
1278
#[test]
fn iob_mapeia_loc_para_location_v1025() {
    // B-LOC maps to the "location" entity type.
    let toks: Vec<String> = vec!["Brasil".into()];
    let tags: Vec<String> = vec!["B-LOC".into()];
    let ents = iob_to_entities(&toks, &tags);
    assert_eq!(ents[0].entity_type, "location");
}
1287
#[test]
fn build_relationships_respeitam_max_rels() {
    // 20 entities generate many candidate pairs; output must stay at or
    // below the configured cap, and the truncation flag must be raised
    // whenever the cap is hit exactly.
    let mut pool = Vec::with_capacity(20);
    for i in 0..20 {
        pool.push(NewEntity {
            name: format!("entidade_{i}"),
            entity_type: "concept".to_string(),
            description: None,
        });
    }
    let (rels, truncated) = build_relationships(&pool);
    let max_rels = crate::constants::max_relationships_per_memory();
    assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
    if rels.len() == max_rels {
        assert!(truncated, "truncated deve ser true quando atingiu o cap");
    }
}
1304
#[test]
fn build_relationships_sem_duplicatas() {
    use std::collections::HashSet;
    // Every (source, target) pair produced must be unique.
    let pool: Vec<NewEntity> = (0..5)
        .map(|i| NewEntity {
            name: format!("ent_{i}"),
            entity_type: "concept".to_string(),
            description: None,
        })
        .collect();
    let (rels, _truncated) = build_relationships(&pool);
    let mut seen: HashSet<(String, String)> = HashSet::new();
    for rel in &rels {
        assert!(
            seen.insert((rel.source.clone(), rel.target.clone())),
            "par duplicado encontrado"
        );
    }
}
1322
#[test]
fn merge_deduplica_por_nome_lowercase() {
    // Case differences alone must not yield two entities of the same type.
    let upper = vec![ExtractedEntity {
        name: "Rust".into(),
        entity_type: "concept".into(),
    }];
    let lower = vec![ExtractedEntity {
        name: "rust".into(),
        entity_type: "concept".into(),
    }];
    let merged = merge_and_deduplicate(upper, lower);
    assert_eq!(
        merged.len(),
        1,
        "rust e Rust com mesmo tipo são a mesma entidade"
    );
}
1342
#[test]
fn regex_extractor_implementa_trait() {
    // The trait method must be callable on the unit struct and must find
    // at least one entity in text containing an email and an ALL_CAPS token.
    let out = RegexExtractor
        .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
        .unwrap();
    assert!(!out.entities.is_empty());
}
1351
#[test]
fn extract_retorna_ok_sem_modelo() {
    // Without a downloaded model the pipeline must still succeed via regex
    // and surface the email address as an entity.
    let paths = make_paths();
    let out = extract_graph_auto("contato: teste@exemplo.com com MAX_RETRY=3", &paths).unwrap();
    let has_email = out
        .entities
        .iter()
        .any(|e| e.name.contains("teste@exemplo.com"));
    assert!(has_email);
}
1363
#[test]
fn stopwords_filter_v1024_terms() {
    // None of the v1.0.24 ALL-CAPS stopwords may survive the prefilter.
    let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
        DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
    let extracted = apply_regex_prefilter(body);
    let names: Vec<&str> = extracted.iter().map(|e| e.name.as_str()).collect();
    const BLOCKED: [&str; 17] = [
        "ACEITE", "ACK", "ACL", "BORDA", "CHECKLIST", "COMPLETED", "CONFIRME", "DEVEMOS",
        "DONE", "FIXED", "NEGUE", "PENDING", "PLAN", "PODEMOS", "RECUSE", "TOKEN", "VAMOS",
    ];
    for word in &BLOCKED {
        assert!(
            !names.contains(word),
            "v1.0.24 stopword {word} should be filtered but was found in entities"
        );
    }
}
1397
#[test]
fn dedup_normalizes_unicode_combining_marks() {
    // Precomposed "Café" (NFC) and "Cafe" + combining acute accent (NFD)
    // must collapse into a single entity after normalization.
    let precomposed = vec![ExtractedEntity {
        name: "Café".into(),
        entity_type: "concept".into(),
    }];
    let decomposed = vec![ExtractedEntity {
        name: format!("Cafe{}", '\u{301}'),
        entity_type: "concept".into(),
    }];
    let merged = merge_and_deduplicate(precomposed, decomposed);
    assert_eq!(
        merged.len(),
        1,
        "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
    );
}
1420
#[test]
fn predict_batch_output_count_matches_input() {
    // Simulates the padding/stacking/narrowing shape pipeline of batched NER
    // inference: two windows of different lengths are right-padded to a common
    // max_len, stacked into one (batch, max_len) tensor, and per-window logits
    // are narrowed back to each window's real token count.
    let w1_ids: Vec<u32> = vec![101, 100, 102];
    let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
    let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
    let w2_tok: Vec<String> = vec![
        "[CLS]".into(),
        "world".into(),
        "foo".into(),
        "bar".into(),
        "[SEP]".into(),
    ];
    let windows: Vec<(Vec<u32>, Vec<String>)> =
        vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

    let device = Device::Cpu;
    // The batch is padded to the longest window (5 tokens here).
    let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
    assert_eq!(max_len, 5, "max_len deve ser 5");

    let mut padded_ids: Vec<Tensor> = Vec::new();
    for (ids, _) in &windows {
        let len = ids.len();
        let pad_right = max_len - len;
        let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
        let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
        // Right-pad with zeros so every window has identical shape (max_len,).
        let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
        assert_eq!(
            t.dims(),
            &[max_len],
            "cada janela deve ter shape (max_len,) após padding"
        );
        padded_ids.push(t);
    }

    let stacked = Tensor::stack(&padded_ids, 0).unwrap();
    assert_eq!(
        stacked.dims(),
        &[2, max_len],
        "stack deve produzir (batch_size=2, max_len=5)"
    );

    // Zero-filled fake logits with 9 labels per token stand in for real model
    // output; only the shapes matter for this test.
    let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9];
    let fake_logits =
        Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
    for (i, (ids, _)) in windows.iter().enumerate() {
        let real_len = ids.len();
        let example = fake_logits.get(i).unwrap();
        // Drop the padded tail: only the window's real tokens survive narrow().
        let sliced = example.narrow(0, 0, real_len).unwrap();
        assert_eq!(
            sliced.dims(),
            &[real_len, 9],
            "narrow deve preservar apenas {real_len} tokens reais"
        );
    }
}
1487
#[test]
fn predict_batch_empty_windows_returns_empty() {
    // Guard path: with no windows the batch predictor short-circuits and
    // returns an empty label set instead of building tensors.
    let windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
    assert_eq!(max_len, 0, "zero windows → max_len 0");
    let result: Vec<Vec<String>> = match max_len {
        0 => Vec::new(),
        _ => unreachable!(),
    };
    assert!(result.is_empty());
}
1504
#[test]
fn ner_batch_size_default_is_8() {
    // With no env override, the NER batch size falls back to the default of 8.
    // NOTE(review): mutates process-global env state; can race with
    // `ner_batch_size_env_override_clamped` under the default parallel test
    // runner — consider --test-threads=1 or a serialization guard.
    std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    assert_eq!(crate::constants::ner_batch_size(), 8);
}
1512
#[test]
fn ner_batch_size_env_override_clamped() {
    // Verifies the env override is clamped into [1, 32].
    // NOTE(review): mutates process-global env state; can race with
    // `ner_batch_size_default_is_8` under the default parallel test runner —
    // consider --test-threads=1 or a serialization guard.

    // Above the ceiling: clamped down to 32.
    std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
    assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

    // Zero is invalid: clamped up to the floor of 1.
    std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
    assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

    // In-range values pass through unchanged.
    std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
    assert_eq!(
        crate::constants::ner_batch_size(),
        4,
        "valor válido preservado"
    );

    // Restore a clean environment for other tests.
    std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
}
1531
#[test]
fn extraction_method_regex_only_unchanged() {
    // The regex-only extractor must label its results accordingly.
    let out = RegexExtractor.extract("contato: dev@acme.io").unwrap();
    assert_eq!(
        out.extraction_method, "regex-only",
        "RegexExtractor deve retornar regex-only"
    );
}
1542
#[test]
fn extend_suffix_pure_numeric_unchanged() {
    // "GPT" followed by "-5" in the body must be extended to "GPT-5".
    let seed = vec![ExtractedEntity {
        name: "GPT".into(),
        entity_type: "concept".into(),
    }];
    let out = extend_with_numeric_suffix(seed, "usando GPT-5 no projeto");
    assert_eq!(
        out[0].name, "GPT-5",
        "sufixo puramente numérico deve ser estendido"
    );
}
1558
#[test]
fn extend_suffix_alphanumeric_letter_after_digit() {
    // A digit-then-letter suffix like "4o" counts as a version suffix.
    let seed = vec![ExtractedEntity {
        name: "GPT".into(),
        entity_type: "concept".into(),
    }];
    let out = extend_with_numeric_suffix(seed, "usando GPT-4o para tarefas avançadas");
    assert_eq!(out[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
}
1569
#[test]
fn extend_suffix_alphanumeric_b_suffix() {
    // Parameter-count style suffixes like "5b" must also be accepted.
    let seed = vec![ExtractedEntity {
        name: "Llama".into(),
        entity_type: "concept".into(),
    }];
    let out = extend_with_numeric_suffix(seed, "modelo Llama-5b open-weight");
    assert_eq!(out[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
}
1580
#[test]
fn extend_suffix_alphanumeric_x_suffix() {
    // Mixture-style suffixes like "8x" must also be accepted.
    let seed = vec![ExtractedEntity {
        name: "Mistral".into(),
        entity_type: "concept".into(),
    }];
    let out = extend_with_numeric_suffix(seed, "testando Mistral-8x em produção");
    assert_eq!(out[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
}
1591
#[test]
fn augment_versioned_gpt4o() {
    // Starting from zero entities, the versioned-model pass must pick up GPT-4o.
    let found = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
    assert!(
        found.iter().any(|e| e.name == "GPT-4o"),
        "GPT-4o deve ser capturado pelo augment, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1604
#[test]
fn augment_versioned_claude_4_sonnet() {
    // Multi-word versioned model names ("Claude 4 Sonnet") must be captured whole.
    let found =
        augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
    assert!(
        found.iter().any(|e| e.name == "Claude 4 Sonnet"),
        "Claude 4 Sonnet deve ser capturado, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1616
#[test]
fn augment_versioned_llama_3_pro() {
    // "Llama 3 Pro" (name + version + tier word) must be captured whole.
    let found = augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
    assert!(
        found.iter().any(|e| e.name == "Llama 3 Pro"),
        "Llama 3 Pro deve ser capturado, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1628
#[test]
fn augment_versioned_mixtral_8x7b() {
    // Mixture-of-experts style version strings ("8x7B") must be captured whole.
    let found =
        augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
    assert!(
        found.iter().any(|e| e.name == "Mixtral 8x7B"),
        "Mixtral 8x7B deve ser capturado, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1640
#[test]
fn augment_versioned_does_not_duplicate_existing() {
    // When the versioned name is already present, augment must not re-add it.
    let seed = vec![ExtractedEntity {
        name: "Claude 4".into(),
        entity_type: "concept".into(),
    }];
    let out = augment_versioned_model_names(seed, "usando Claude 4 no projeto");
    let occurrences = out.iter().filter(|e| e.name == "Claude 4").count();
    assert_eq!(occurrences, 1, "Claude 4 não deve ser duplicado");
}
1652
#[test]
fn stopwords_filter_url_jwt_api_v1025() {
    // Common tech acronyms added as stopwords in v1.0.25 must not leak.
    let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
    let extracted = apply_regex_prefilter(body);
    let names: Vec<&str> = extracted.iter().map(|e| e.name.as_str()).collect();
    const ACRONYMS: [&str; 9] = [
        "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
    ];
    for blocked in &ACRONYMS {
        assert!(
            !names.contains(blocked),
            "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
        );
    }
}
1670
#[test]
fn section_markers_etapa_fase_filtered_v1025() {
    // Document-structure markers ("Etapa N", "Fase N") are not entities.
    let found = apply_regex_prefilter("Etapa 3 do plano: implementar Fase 1 da Migração.");
    let leaked = found
        .iter()
        .any(|e| e.name.contains("Etapa") || e.name.contains("Fase"));
    assert!(
        !leaked,
        "section markers must be stripped; entities: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1686
#[test]
fn section_markers_passo_secao_filtered_v1025() {
    // "Passo N" and "Seção N" are structural markers, never entities.
    let found = apply_regex_prefilter("Siga Passo 2 conforme Seção 3 do manual.");
    let leaked = found
        .iter()
        .any(|e| e.name.contains("Passo") || e.name.contains("Seção"));
    assert!(
        !leaked,
        "Passo/Seção section markers must be stripped; entities: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1699
#[test]
fn brand_camelcase_extracted_as_organization_v1025() {
    // CamelCase brand names must be extracted and typed "organization" (V008).
    let ents = apply_regex_prefilter("OpenAI launched GPT-4 and PostgreSQL added pgvector.");
    let hit = ents.iter().find(|e| e.name == "OpenAI");
    assert!(
        hit.is_some(),
        "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
    assert_eq!(
        hit.unwrap().entity_type,
        "organization",
        "brand CamelCase must map to organization (V008)"
    );
}
1719
#[test]
fn brand_postgresql_extracted_as_organization_v1025() {
    // PostgreSQL (internal capitals) must be picked up as an organization.
    let ents = apply_regex_prefilter("migrating from MySQL to PostgreSQL for better performance.");
    let matched = ents
        .iter()
        .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization");
    assert!(
        matched,
        "PostgreSQL must be extracted as organization; entities: {:?}",
        ents.iter()
            .map(|e| (&e.name, &e.entity_type))
            .collect::<Vec<_>>()
    );
}
1733
#[test]
fn iob_org_maps_to_organization_not_project_v1025() {
    // Regression guard: B-ORG must map to "organization", not "project".
    let toks = vec!["Microsoft".to_string()];
    let tags = vec!["B-ORG".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    assert_eq!(
        ents[0].entity_type, "organization",
        "B-ORG must map to organization (V008); got {}",
        ents[0].entity_type
    );
}
1748
#[test]
fn iob_loc_maps_to_location_not_concept_v1025() {
    // Regression guard: B-LOC must map to "location", not "concept".
    let toks = vec!["São".to_string(), "Paulo".to_string()];
    let tags = vec!["B-LOC".to_string(), "I-LOC".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    assert_eq!(
        ents[0].entity_type, "location",
        "B-LOC must map to location (V008); got {}",
        ents[0].entity_type
    );
}
1761
#[test]
fn iob_date_maps_to_date_not_discarded_v1025() {
    // A three-token DATE span survives as exactly one "date" entity.
    let toks = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
    let tags: Vec<String> = ["B-DATE", "I-DATE", "I-DATE"]
        .iter()
        .map(|s| s.to_string())
        .collect();
    let ents = iob_to_entities(&toks, &tags);
    assert_eq!(
        ents.len(),
        1,
        "DATE entity must be emitted (V008); entities: {ents:?}"
    );
    assert_eq!(ents[0].entity_type, "date");
}
1779
#[test]
fn pt_verb_le_filtered_as_per_v1025() {
    // A capitalized Portuguese verb mislabeled B-PER must not become a person.
    let toks = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
    let tags = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    let leaked = ents
        .iter()
        .any(|e| e.name == "Lê" && e.entity_type == "person");
    assert!(
        !leaked,
        "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
    );
}
1795
#[test]
fn pt_verb_ver_filtered_as_per_v1025() {
    // A single mislabeled verb token yields no entity at all.
    let toks = vec!["Ver".to_string()];
    let tags = vec!["B-PER".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    assert!(
        ents.is_empty(),
        "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
    );
}
1807
1808 fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1811 ExtractedEntity {
1812 name: name.to_string(),
1813 entity_type: entity_type.to_string(),
1814 }
1815 }
1816
#[test]
fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
    // A strict-prefix duplicate collapses to the longer surface form.
    let result =
        merge_and_deduplicate(vec![entity("Sonne", "concept")], vec![entity("Sonnet", "concept")]);
    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "Sonnet");
}
1826
#[test]
fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
    // "Open" is contained in "OpenAI"; containment dedup keeps the longer one.
    let result = merge_and_deduplicate(
        vec![
            entity("Open", "organization"),
            entity("OpenAI", "organization"),
        ],
        vec![],
    );
    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "OpenAI");
}
1838
#[test]
fn merge_keeps_both_when_no_containment_v1025() {
    // Unrelated names must both survive the merge.
    let result =
        merge_and_deduplicate(vec![entity("Alice", "person"), entity("Bob", "person")], vec![]);
    assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
}
1846
#[test]
fn merge_respects_entity_type_boundary_v1025() {
    // Same name but different types: dedup must not cross the type boundary.
    let result = merge_and_deduplicate(
        vec![entity("Apple", "organization"), entity("Apple", "concept")],
        vec![],
    );
    assert_eq!(
        result.len(),
        2,
        "expected 2 entities (different types), got: {result:?}"
    );
}
1858
#[test]
fn merge_case_insensitive_dedup_v1025() {
    // Identical names differing only by case collapse into one entity.
    let result = merge_and_deduplicate(
        vec![
            entity("OpenAI", "organization"),
            entity("openai", "organization"),
        ],
        vec![],
    );
    assert_eq!(
        result.len(),
        1,
        "expected 1 entity after case-insensitive dedup, got: {result:?}"
    );
}
1873
#[test]
fn iob_section_marker_etapa_filtered_v1025() {
    // MISC spans that are document section markers must not become entities.
    let toks = vec!["Etapa".to_string(), "3".to_string()];
    let tags = vec!["B-MISC".to_string(), "I-MISC".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    assert!(
        !ents.iter().any(|e| e.name.contains("Etapa")),
        "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
    );
}
1887
#[test]
fn iob_section_marker_fase_filtered_v1025() {
    // "Fase N" coming out of the NER model must be filtered the same way.
    let toks = vec!["Fase".to_string(), "1".to_string()];
    let tags = vec!["B-MISC".to_string(), "I-MISC".to_string()];
    let ents = iob_to_entities(&toks, &tags);
    assert!(
        !ents.iter().any(|e| e.name.contains("Fase")),
        "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
    );
}
1899}