1use std::collections::HashMap;
7use std::path::{Path, PathBuf};
8use std::sync::OnceLock;
9
10use anyhow::{Context, Result};
11use candle_core::{DType, Device, Tensor};
12use candle_nn::{Linear, Module, VarBuilder};
13use candle_transformers::models::bert::{BertModel, Config as BertConfig};
14use regex::Regex;
15use serde::Deserialize;
16use unicode_normalization::UnicodeNormalization;
17
18use crate::paths::AppPaths;
19use crate::storage::entities::{NewEntity, NewRelationship};
20
/// Hugging Face repo of the multilingual cased NER model used for extraction.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// BERT's hard input limit, in wordpiece tokens, per window.
const MAX_SEQ_LEN: usize = 512;
/// Step between sliding windows (50% overlap with MAX_SEQ_LEN).
const STRIDE: usize = 256;
/// Cap on entities kept per extraction run (enforced in merge/augment).
const MAX_ENTS: usize = 30;
/// Max relationships each entity fans out to in build_relationships.
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for all co-occurrence edges.
const DEFAULT_RELATION: &str = "mentions";
/// Entity names shorter than this (in bytes) are discarded.
const MIN_ENTITY_CHARS: usize = 2;

// Lazily-compiled regexes; see the matching regex_*() accessors below.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
37
/// ALL-CAPS tokens that are common words/markup rather than entities: a mix
/// of Portuguese and English rule words, status labels, and ubiquitous tech
/// acronyms. Checked (exact match) by `is_filtered_all_caps`.
/// NOTE(review): the list is mostly alphabetical but not strictly sorted
/// ("BORDA" precedes "BLOQUEAR") — harmless, lookup is a linear `contains`.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "API",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BORDA",
    "BLOQUEAR",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CHECKLIST",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRME",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FIXED",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LLM",
    "MUST",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PASSO",
    "PENDING",
    "PLAN",
    "PODEMOS",
    "PROIBIDO",
    "RECUSE",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
148
/// HTTP verbs, filtered so text like "GET /users" doesn't yield entities.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
154
155fn is_filtered_all_caps(token: &str) -> bool {
156 let is_identifier = token.contains('_');
158 if is_identifier {
159 return false;
160 }
161 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
162}
163
164fn regex_email() -> &'static Regex {
165 REGEX_EMAIL
166 .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
167}
168
169fn regex_url() -> &'static Regex {
170 REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
171}
172
173fn regex_uuid() -> &'static Regex {
174 REGEX_UUID.get_or_init(|| {
175 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
176 .unwrap()
177 })
178}
179
180fn regex_all_caps() -> &'static Regex {
181 REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
182}
183
184fn regex_section_marker() -> &'static Regex {
185 REGEX_SECTION_MARKER.get_or_init(|| {
186 Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
188 })
189}
190
191fn regex_brand_camel() -> &'static Regex {
192 REGEX_BRAND_CAMEL.get_or_init(|| {
193 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
196 })
197}
198
/// An entity candidate produced by any extraction stage (regex or NER).
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Surface text of the entity, trimmed.
    pub name: String,
    /// Category, e.g. "person", "organization", "location", "date", "tool",
    /// "concept" (see `iob_to_entities` and `apply_regex_prefilter`).
    pub entity_type: String,
}

/// A URL found in the body, with its byte offset into the original text.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    pub url: String,
    /// Byte offset of the match start within the source body.
    pub offset: usize,
}

/// Aggregate output of one extraction run.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    /// True when the relationship cap was hit and some pairs were dropped.
    pub relationships_truncated: bool,
    /// "bert+regex-batch" or "regex-only" (see `extract_graph_auto`).
    pub extraction_method: String,
    pub urls: Vec<ExtractedUrl>,
}

/// Common interface for entity/relationship extractors.
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
230
/// Minimal slice of the HF `config.json` we need beyond candle's BertConfig.
#[derive(Deserialize)]
struct ModelConfig {
    /// Label map keyed by stringified index (non-numeric keys are ignored
    /// when building the usize-keyed map in `BertNerModel::load`).
    #[serde(default)]
    id2label: HashMap<String, String>,
    hidden_size: usize,
}

/// BERT encoder + linear token-classification head, ready for inference.
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    /// Always `Device::Cpu` (set in `load`).
    device: Device,
    /// Label index → IOB tag string (e.g. "O", "B-PER").
    id2label: HashMap<usize, String>,
}
244
245impl BertNerModel {
246 fn load(model_dir: &Path) -> Result<Self> {
247 let config_path = model_dir.join("config.json");
248 let weights_path = model_dir.join("model.safetensors");
249
250 let config_str = std::fs::read_to_string(&config_path)
251 .with_context(|| format!("lendo config.json em {config_path:?}"))?;
252 let model_cfg: ModelConfig =
253 serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
254
255 let id2label: HashMap<usize, String> = model_cfg
256 .id2label
257 .into_iter()
258 .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
259 .collect();
260
261 let num_labels = id2label.len().max(9);
262 let hidden_size = model_cfg.hidden_size;
263
264 let bert_config_str = std::fs::read_to_string(&config_path)
265 .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
266 let bert_cfg: BertConfig =
267 serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
268
269 let device = Device::Cpu;
270
271 let vb = unsafe {
279 VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
280 .with_context(|| format!("mapeando {weights_path:?}"))?
281 };
282 let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
283
284 let cls_vb = vb.pp("classifier");
287 let weight = cls_vb
288 .get((num_labels, hidden_size), "weight")
289 .context("carregando classifier.weight do safetensors")?;
290 let bias = cls_vb
291 .get(num_labels, "bias")
292 .context("carregando classifier.bias do safetensors")?;
293 let classifier = Linear::new(weight, Some(bias));
294
295 Ok(Self {
296 bert,
297 classifier,
298 device,
299 id2label,
300 })
301 }
302
303 fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
304 let len = token_ids.len();
305 let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
306 let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
307
308 let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
309 .context("criando tensor input_ids")?;
310 let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
311 .context("criando tensor token_type_ids")?;
312 let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
313 .context("criando tensor attention_mask")?;
314
315 let sequence_output = self
316 .bert
317 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
318 .context("forward pass do BertModel")?;
319
320 let logits = self
321 .classifier
322 .forward(&sequence_output)
323 .context("forward pass do classificador")?;
324
325 let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
326
327 let num_tokens = logits_2d.dim(0).context("dim(0)")?;
328
329 let mut labels = Vec::with_capacity(num_tokens);
330 for i in 0..num_tokens {
331 let token_logits = logits_2d.get(i).context("get token logits")?;
332 let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
333 let argmax = vec
334 .iter()
335 .enumerate()
336 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
337 .map(|(idx, _)| idx)
338 .unwrap_or(0);
339 let label = self
340 .id2label
341 .get(&argmax)
342 .cloned()
343 .unwrap_or_else(|| "O".to_string());
344 labels.push(label);
345 }
346
347 Ok(labels)
348 }
349
350 fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
359 let batch_size = windows.len();
360 let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
361 if max_len == 0 {
362 return Ok(vec![vec![]; batch_size]);
363 }
364
365 let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
366 let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);
367
368 for (ids, _) in windows {
369 let len = ids.len();
370 let pad_right = max_len - len;
371
372 let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
373 let t = Tensor::from_vec(ids_i64, len, &self.device)
375 .context("criando tensor de ids para batch")?;
376 let t = t
377 .pad_with_zeros(0, 0, pad_right)
378 .context("padding tensor de ids")?;
379 padded_ids.push(t);
380
381 let mut mask_i64 = vec![1i64; len];
383 mask_i64.extend(vec![0i64; pad_right]);
384 let m = Tensor::from_vec(mask_i64, max_len, &self.device)
385 .context("criando tensor de máscara para batch")?;
386 padded_masks.push(m);
387 }
388
389 let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
391 let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
392 let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
393 .context("criando token_type_ids batch")?;
394
395 let sequence_output = self
397 .bert
398 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
399 .context("forward pass batch BertModel")?;
400 let logits = self
403 .classifier
404 .forward(&sequence_output)
405 .context("forward pass batch classificador")?;
406 let mut results = Vec::with_capacity(batch_size);
409 for (i, (window_ids, _)) in windows.iter().enumerate() {
410 let example_logits = logits.get(i).context("get logits exemplo")?;
411 let real_len = window_ids.len();
413 let example_slice = example_logits
414 .narrow(0, 0, real_len)
415 .context("narrow para tokens reais")?;
416 let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;
417
418 let labels: Vec<String> = logits_2d
419 .iter()
420 .map(|token_logits| {
421 let argmax = token_logits
422 .iter()
423 .enumerate()
424 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
425 .map(|(idx, _)| idx)
426 .unwrap_or(0);
427 self.id2label
428 .get(&argmax)
429 .cloned()
430 .unwrap_or_else(|| "O".to_string())
431 })
432 .collect();
433
434 results.push(labels);
435 }
436
437 Ok(results)
438 }
439}
440
441static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
442
443fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
444 NER_MODEL
445 .get_or_init(|| match load_model(paths) {
446 Ok(m) => Some(m),
447 Err(e) => {
448 tracing::warn!("NER model unavailable (graceful degradation): {e:#}");
449 None
450 }
451 })
452 .as_ref()
453}
454
455fn model_dir(paths: &AppPaths) -> PathBuf {
456 paths.models.join("bert-multilingual-ner")
457}
458
459fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
460 let dir = model_dir(paths);
461 std::fs::create_dir_all(&dir)
462 .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
463
464 let weights = dir.join("model.safetensors");
465 let config = dir.join("config.json");
466 let tokenizer = dir.join("tokenizer.json");
467
468 if weights.exists() && config.exists() && tokenizer.exists() {
469 return Ok(dir);
470 }
471
472 tracing::info!("Downloading NER model (first run, ~676 MB)...");
473 crate::output::emit_progress_i18n(
474 "Downloading NER model (first run, ~676 MB)...",
475 "Baixando modelo NER (primeira execução, ~676 MB)...",
476 );
477
478 let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
479 let repo = api.model(MODEL_ID.to_string());
480
481 for (remote, local) in &[
485 ("model.safetensors", "model.safetensors"),
486 ("config.json", "config.json"),
487 ("onnx/tokenizer.json", "tokenizer.json"),
488 ("tokenizer_config.json", "tokenizer_config.json"),
489 ] {
490 let dest = dir.join(local);
491 if !dest.exists() {
492 let src = repo
493 .get(remote)
494 .with_context(|| format!("baixando {remote} do HF Hub"))?;
495 std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
496 }
497 }
498
499 Ok(dir)
500}
501
502fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
503 let dir = ensure_model_files(paths)?;
504 BertNerModel::load(&dir)
505}
506
507fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
508 let mut entities = Vec::new();
509 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
510
511 let add = |entities: &mut Vec<ExtractedEntity>,
512 seen: &mut std::collections::HashSet<String>,
513 name: &str,
514 entity_type: &str| {
515 let name = name.trim().to_string();
516 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
517 entities.push(ExtractedEntity {
518 name,
519 entity_type: entity_type.to_string(),
520 });
521 }
522 };
523
524 let cleaned = regex_section_marker().replace_all(body, " ");
527 let cleaned = cleaned.as_ref();
528
529 for m in regex_email().find_iter(cleaned) {
530 add(&mut entities, &mut seen, m.as_str(), "concept");
532 }
533 for m in regex_uuid().find_iter(cleaned) {
534 add(&mut entities, &mut seen, m.as_str(), "concept");
535 }
536 for m in regex_all_caps().find_iter(cleaned) {
537 let candidate = m.as_str();
538 if !is_filtered_all_caps(candidate) {
540 add(&mut entities, &mut seen, candidate, "concept");
541 }
542 }
543 for m in regex_brand_camel().find_iter(cleaned) {
546 let name = m.as_str();
547 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
549 add(&mut entities, &mut seen, name, "organization");
550 }
551 }
552
553 entities
554}
555
556pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
560 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
561 let mut result = Vec::new();
562 for m in regex_url().find_iter(body) {
563 let raw = m.as_str();
564 let cleaned = raw
565 .trim_end_matches('`')
566 .trim_end_matches(',')
567 .trim_end_matches('.')
568 .trim_end_matches(';')
569 .trim_end_matches(')')
570 .trim_end_matches(']')
571 .trim_end_matches('}');
572 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
573 result.push(ExtractedUrl {
574 url: cleaned.to_string(),
575 offset: m.start(),
576 });
577 }
578 }
579 result
580}
581
/// Decodes parallel wordpiece `tokens` / IOB `labels` into entities:
/// merges "##" subwords, maps model tags (PER/ORG/LOC/DATE) to our entity
/// types, and filters stopwords, section markers and known Portuguese
/// false positives.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    // Parts + type of the entity currently being assembled.
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Finishes the in-progress entity: joins parts with spaces, applies the
    // min-length / stopword / section-marker filters, then resets state.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // Single ALL-CAPS words go through the same stopword filter
                // used by the regex pre-filter.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                let is_section_marker = regex_section_marker().is_match(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        // "O" (outside) ends any entity in progress.
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Unknown tag format — treat like "O".
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // Capitalized Portuguese verbs/short words the model tends to
        // mislabel as person names.
        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
        ];

        let entity_type = match bio_type {
            "DATE" => "date",
            "PER" => {
                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
                    flush(&mut current_parts, &mut current_type, &mut entities);
                    continue;
                }
                "person"
            }
            "ORG" => {
                // ORGs that look like software artifacts become "tool".
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    "organization"
                }
            }
            "LOC" => "location",
            // Unrecognized tag types pass through unchanged.
            other => other,
        };

        if prefix == "B" {
            if token.starts_with("##") {
                // A "B-" on a subword is a tokenizer artifact: glue it onto
                // the previous part, or drop it when there is none.
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // Continuation: subwords are glued, whole words appended.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Flush whatever entity was still open at end of input.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
693
694fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
698 if entities.len() < 2 {
699 return (Vec::new(), false);
700 }
701
702 let max_rels = crate::constants::max_relationships_per_memory();
705 let n = entities.len().min(MAX_ENTS);
706 let mut rels: Vec<NewRelationship> = Vec::new();
707 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
708
709 let mut hit_cap = false;
710 'outer: for i in 0..n {
711 if rels.len() >= max_rels {
712 hit_cap = true;
713 break;
714 }
715
716 let mut for_entity = 0usize;
717 for j in (i + 1)..n {
718 if for_entity >= TOP_K_RELATIONS {
719 break;
720 }
721 if rels.len() >= max_rels {
722 hit_cap = true;
723 break 'outer;
724 }
725
726 let src = &entities[i].name;
727 let tgt = &entities[j].name;
728 let key = (src.clone(), tgt.clone());
729
730 if seen.contains(&key) {
731 continue;
732 }
733 seen.insert(key);
734
735 rels.push(NewRelationship {
736 source: src.clone(),
737 target: tgt.clone(),
738 relation: DEFAULT_RELATION.to_string(),
739 strength: 0.5,
740 description: None,
741 });
742 for_entity += 1;
743 }
744 }
745
746 if hit_cap {
748 tracing::warn!(
749 "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
750 n.saturating_sub(1)
751 );
752 }
753
754 (rels, hit_cap)
755}
756
/// Runs NER over `body` using overlapping token windows (`MAX_SEQ_LEN` wide,
/// `STRIDE` apart), batching windows for the model and deduplicating the
/// resulting entities by name. Falls back to per-window prediction when a
/// whole batch fails.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // `false` = no special tokens; windows carry raw wordpieces only.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Build overlapping (ids, tokens) windows covering the whole input.
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Group similar lengths together so each batch needs less padding.
    windows.sort_by_key(|(ids, _)| ids.len());

    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        // Overlapping windows yield duplicates; keep the first.
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                // Retry each window alone so one bad window can't sink the
                // whole chunk; windows that still fail are just logged.
                for (ids, tokens) in chunk {
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            tracing::warn!("NER window fallback also failed: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
842
843fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
850 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
851 let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
854
855 entities
856 .into_iter()
857 .map(|ent| {
858 if let Some(pos) = body.find(&ent.name) {
860 let after_pos = pos + ent.name.len();
861 if after_pos < body.len() {
862 let after = &body[after_pos..];
863 if let Some(m) = suffix_re.find(after) {
864 let suffix = m.as_str();
865 if suffix.len() <= 7 {
868 let extended = format!("{}{}", ent.name, suffix);
869 return ExtractedEntity {
870 name: extended,
871 entity_type: ent.entity_type,
872 };
873 }
874 }
875 }
876 }
877 ent
878 })
879 .collect()
880}
881
882fn augment_versioned_model_names(
902 entities: Vec<ExtractedEntity>,
903 body: &str,
904) -> Vec<ExtractedEntity> {
905 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
906 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
913 Regex::new(
914 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
915 )
916 .unwrap()
917 });
918
919 let mut existing_lc: std::collections::HashSet<String> =
920 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
921 let mut result = entities;
922
923 for caps in model_re.captures_iter(body) {
924 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
925 if full_match.is_empty() || full_match.len() > 24 {
928 continue;
929 }
930 let normalized_lc = full_match.to_lowercase();
931 if existing_lc.contains(&normalized_lc) {
932 continue;
933 }
934 if result.len() >= MAX_ENTS {
937 break;
938 }
939 existing_lc.insert(normalized_lc);
940 result.push(ExtractedEntity {
941 name: full_match.to_string(),
942 entity_type: "concept".to_string(),
943 });
944 }
945
946 result
947}
948
/// Merges regex and NER candidates, deduplicating case-insensitively
/// (NFKC-normalized) within the same entity type. Substring collisions keep
/// the longer surface form. Output is capped at `MAX_ENTS`.
fn merge_and_deduplicate(
    regex_ents: Vec<ExtractedEntity>,
    ner_ents: Vec<ExtractedEntity>,
) -> Vec<ExtractedEntity> {
    // Key format: "<type>\0<normalized lowercase name>" → index in `result`.
    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut result: Vec<ExtractedEntity> = Vec::new();
    let mut truncated = false;

    let total_input = regex_ents.len() + ner_ents.len();
    // Regex entities are processed first, so they claim slots on ties.
    for ent in regex_ents.into_iter().chain(ner_ents) {
        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
        let key = format!("{}\0{}", ent.entity_type, name_lc);

        // Linear scan over keys of the same type looking for an exact or
        // substring match in either direction (O(n) per entity; n ≤ MAX_ENTS).
        let mut collision_idx: Option<usize> = None;
        for (existing_key, idx) in &by_lc {
            let type_prefix = format!("{}\0", ent.entity_type);
            if !existing_key.starts_with(&type_prefix) {
                continue;
            }
            let existing_name_lc = &existing_key[type_prefix.len()..];
            if existing_name_lc == name_lc
                || existing_name_lc.contains(name_lc.as_str())
                || name_lc.contains(existing_name_lc)
            {
                collision_idx = Some(*idx);
                break;
            }
        }
        match collision_idx {
            Some(idx) => {
                // On collision prefer the longer name; re-key the slot so
                // future lookups see the kept name.
                if ent.name.len() > result[idx].name.len() {
                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
                    by_lc.remove(&old_key);
                    result[idx] = ent;
                    by_lc.insert(key, idx);
                }
            }
            None => {
                by_lc.insert(key, result.len());
                result.push(ent);
            }
        }
        if result.len() >= MAX_ENTS {
            truncated = true;
            break;
        }
    }

    if truncated {
        tracing::warn!(
            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
        );
    }

    result
}
1031
1032fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1033 extracted
1034 .into_iter()
1035 .map(|e| NewEntity {
1036 name: e.name,
1037 entity_type: e.entity_type,
1038 description: None,
1039 })
1040 .collect()
1041}
1042
1043pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1044 let regex_entities = apply_regex_prefilter(body);
1045
1046 let mut bert_used = false;
1047 let ner_entities = match get_or_init_model(paths) {
1048 Some(model) => match run_ner_sliding_window(model, body, paths) {
1049 Ok(ents) => {
1050 bert_used = true;
1051 ents
1052 }
1053 Err(e) => {
1054 tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1055 Vec::new()
1056 }
1057 },
1058 None => Vec::new(),
1059 };
1060
1061 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1062 let extended = extend_with_numeric_suffix(merged, body);
1064 let with_models = augment_versioned_model_names(extended, body);
1068 let with_models: Vec<ExtractedEntity> = with_models
1072 .into_iter()
1073 .filter(|e| !regex_section_marker().is_match(&e.name))
1074 .collect();
1075 let entities = to_new_entities(with_models);
1076 let (relationships, relationships_truncated) = build_relationships(&entities);
1077
1078 let extraction_method = if bert_used {
1079 "bert+regex-batch".to_string()
1080 } else {
1081 "regex-only".to_string()
1082 };
1083
1084 let urls = extract_urls(body);
1085
1086 Ok(ExtractionResult {
1087 entities,
1088 relationships,
1089 relationships_truncated,
1090 extraction_method,
1091 urls,
1092 })
1093}
1094
1095pub struct RegexExtractor;
1096
1097impl Extractor for RegexExtractor {
1098 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1099 let regex_entities = apply_regex_prefilter(body);
1100 let entities = to_new_entities(regex_entities);
1101 let (relationships, relationships_truncated) = build_relationships(&entities);
1102 let urls = extract_urls(body);
1103 Ok(ExtractionResult {
1104 entities,
1105 relationships,
1106 relationships_truncated,
1107 extraction_method: "regex-only".to_string(),
1108 urls,
1109 })
1110 }
1111}
1112
1113#[cfg(test)]
1114mod tests {
1115 use super::*;
1116
1117 fn make_paths() -> AppPaths {
1118 use std::path::PathBuf;
1119 AppPaths {
1120 db: PathBuf::from("/tmp/test.sqlite"),
1121 models: PathBuf::from("/tmp/test_models"),
1122 }
1123 }
1124
1125 #[test]
1126 fn regex_email_captura_endereco() {
1127 let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
1128 assert!(ents
1130 .iter()
1131 .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
1132 }
1133
1134 #[test]
1135 fn regex_all_caps_filtra_palavra_regra_pt() {
1136 let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
1138 assert!(
1139 !ents.iter().any(|e| e.name == "NUNCA"),
1140 "NUNCA deveria ser filtrado como stopword"
1141 );
1142 assert!(
1143 !ents.iter().any(|e| e.name == "PROIBIDO"),
1144 "PROIBIDO deveria ser filtrado"
1145 );
1146 assert!(
1147 !ents.iter().any(|e| e.name == "DEVE"),
1148 "DEVE deveria ser filtrado"
1149 );
1150 }
1151
1152 #[test]
1153 fn regex_all_caps_aceita_constante_com_underscore() {
1154 let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
1156 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1157 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1158 }
1159
1160 #[test]
1161 fn regex_all_caps_aceita_acronimo_dominio() {
1162 let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
1164 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1165 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1166 }
1167
1168 #[test]
1169 fn regex_url_nao_aparece_em_apply_regex_prefilter() {
1170 let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
1172 assert!(
1173 !ents.iter().any(|e| e.name.starts_with("https://")),
1174 "URLs não devem aparecer como entidades após split P0-2"
1175 );
1176 }
1177
1178 #[test]
1179 fn extract_urls_captura_https() {
1180 let urls = extract_urls("veja https://docs.rs/crate para detalhes");
1181 assert_eq!(urls.len(), 1);
1182 assert_eq!(urls[0].url, "https://docs.rs/crate");
1183 assert!(urls[0].offset > 0);
1184 }
1185
1186 #[test]
1187 fn extract_urls_trim_sufixo_pontuacao() {
1188 let urls = extract_urls("link: https://example.com/path. fim");
1189 assert!(!urls.is_empty());
1190 assert!(
1191 !urls[0].url.ends_with('.'),
1192 "sufixo ponto deve ser removido"
1193 );
1194 }
1195
1196 #[test]
1197 fn extract_urls_deduplica_repetidas() {
1198 let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1199 let urls = extract_urls(body);
1200 assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1201 }
1202
1203 #[test]
1204 fn regex_uuid_captura_identificador() {
1205 let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1206 assert!(ents.iter().any(|e| e.entity_type == "concept"));
1207 }
1208
1209 #[test]
1210 fn regex_all_caps_captura_constante() {
1211 let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1212 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1213 assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1214 }
1215
1216 #[test]
1217 fn regex_all_caps_ignora_palavras_curtas() {
1218 let ents = apply_regex_prefilter("use AI em seu projeto");
1219 assert!(
1220 !ents.iter().any(|e| e.name == "AI"),
1221 "AI tem apenas 2 chars, deve ser ignorado"
1222 );
1223 }
1224
1225 #[test]
1226 fn iob_decodifica_per_para_person() {
1227 let tokens = vec![
1228 "John".to_string(),
1229 "Doe".to_string(),
1230 "trabalhou".to_string(),
1231 ];
1232 let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
1233 let ents = iob_to_entities(&tokens, &labels);
1234 assert_eq!(ents.len(), 1);
1235 assert_eq!(ents[0].entity_type, "person");
1236 assert!(ents[0].name.contains("John"));
1237 }
1238
1239 #[test]
1240 fn iob_strip_subword_b_prefix() {
1241 let tokens = vec!["Open".to_string(), "##AI".to_string()];
1244 let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
1245 let ents = iob_to_entities(&tokens, &labels);
1246 assert!(
1247 ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
1248 "deveria mergear ##AI ou descartar"
1249 );
1250 }
1251
1252 #[test]
1253 fn iob_subword_orphan_descarta() {
1254 let tokens = vec!["##AI".to_string()];
1256 let labels = vec!["B-ORG".to_string()];
1257 let ents = iob_to_entities(&tokens, &labels);
1258 assert!(
1259 ents.is_empty(),
1260 "subword órfão sem entidade ativa deve ser descartado"
1261 );
1262 }
1263
1264 #[test]
1265 fn iob_mapeia_date_para_date_v1025() {
1266 let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
1268 let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
1269 let ents = iob_to_entities(&tokens, &labels);
1270 assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
1271 assert_eq!(ents[0].entity_type, "date");
1272 }
1273
1274 #[test]
1275 fn iob_mapeia_org_para_organization_v1025() {
1276 let tokens = vec!["Empresa".to_string()];
1278 let labels = vec!["B-ORG".to_string()];
1279 let ents = iob_to_entities(&tokens, &labels);
1280 assert_eq!(ents[0].entity_type, "organization");
1281 }
1282
1283 #[test]
1284 fn iob_mapeia_org_sdk_para_tool() {
1285 let tokens = vec!["tokio-sdk".to_string()];
1286 let labels = vec!["B-ORG".to_string()];
1287 let ents = iob_to_entities(&tokens, &labels);
1288 assert_eq!(ents[0].entity_type, "tool");
1289 }
1290
1291 #[test]
1292 fn iob_mapeia_loc_para_location_v1025() {
1293 let tokens = vec!["Brasil".to_string()];
1295 let labels = vec!["B-LOC".to_string()];
1296 let ents = iob_to_entities(&tokens, &labels);
1297 assert_eq!(ents[0].entity_type, "location");
1298 }
1299
1300 #[test]
1301 fn build_relationships_respeitam_max_rels() {
1302 let entities: Vec<NewEntity> = (0..20)
1303 .map(|i| NewEntity {
1304 name: format!("entidade_{i}"),
1305 entity_type: "concept".to_string(),
1306 description: None,
1307 })
1308 .collect();
1309 let (rels, truncated) = build_relationships(&entities);
1310 let max_rels = crate::constants::max_relationships_per_memory();
1311 assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1312 if rels.len() == max_rels {
1313 assert!(truncated, "truncated deve ser true quando atingiu o cap");
1314 }
1315 }
1316
1317 #[test]
1318 fn build_relationships_sem_duplicatas() {
1319 let entities: Vec<NewEntity> = (0..5)
1320 .map(|i| NewEntity {
1321 name: format!("ent_{i}"),
1322 entity_type: "concept".to_string(),
1323 description: None,
1324 })
1325 .collect();
1326 let (rels, _truncated) = build_relationships(&entities);
1327 let mut pares: std::collections::HashSet<(String, String)> =
1328 std::collections::HashSet::new();
1329 for r in &rels {
1330 let par = (r.source.clone(), r.target.clone());
1331 assert!(pares.insert(par), "par duplicado encontrado");
1332 }
1333 }
1334
1335 #[test]
1336 fn merge_deduplica_por_nome_lowercase() {
1337 let a = vec![ExtractedEntity {
1340 name: "Rust".to_string(),
1341 entity_type: "concept".to_string(),
1342 }];
1343 let b = vec![ExtractedEntity {
1344 name: "rust".to_string(),
1345 entity_type: "concept".to_string(),
1346 }];
1347 let merged = merge_and_deduplicate(a, b);
1348 assert_eq!(
1349 merged.len(),
1350 1,
1351 "rust e Rust com mesmo tipo são a mesma entidade"
1352 );
1353 }
1354
1355 #[test]
1356 fn regex_extractor_implementa_trait() {
1357 let extractor = RegexExtractor;
1358 let result = extractor
1359 .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1360 .unwrap();
1361 assert!(!result.entities.is_empty());
1362 }
1363
1364 #[test]
1365 fn extract_retorna_ok_sem_modelo() {
1366 let paths = make_paths();
1368 let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1369 let result = extract_graph_auto(body, &paths).unwrap();
1370 assert!(result
1371 .entities
1372 .iter()
1373 .any(|e| e.name.contains("teste@exemplo.com")));
1374 }
1375
1376 #[test]
1377 fn stopwords_filter_v1024_terms() {
1378 let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1381 DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1382 let ents = apply_regex_prefilter(body);
1383 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1384 for word in &[
1385 "ACEITE",
1386 "ACK",
1387 "ACL",
1388 "BORDA",
1389 "CHECKLIST",
1390 "COMPLETED",
1391 "CONFIRME",
1392 "DEVEMOS",
1393 "DONE",
1394 "FIXED",
1395 "NEGUE",
1396 "PENDING",
1397 "PLAN",
1398 "PODEMOS",
1399 "RECUSE",
1400 "TOKEN",
1401 "VAMOS",
1402 ] {
1403 assert!(
1404 !names.contains(word),
1405 "v1.0.24 stopword {word} should be filtered but was found in entities"
1406 );
1407 }
1408 }
1409
1410 #[test]
1411 fn dedup_normalizes_unicode_combining_marks() {
1412 let nfc = vec![ExtractedEntity {
1416 name: "Café".to_string(),
1417 entity_type: "concept".to_string(),
1418 }];
1419 let nfd_name = "Cafe\u{301}".to_string();
1421 let nfd = vec![ExtractedEntity {
1422 name: nfd_name,
1423 entity_type: "concept".to_string(),
1424 }];
1425 let merged = merge_and_deduplicate(nfc, nfd);
1426 assert_eq!(
1427 merged.len(),
1428 1,
1429 "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1430 );
1431 }
1432
    // Shape-level rehearsal of the batched NER forward pass: windows of
    // unequal length are right-padded to the batch max, stacked into a
    // (batch, max_len) tensor, and per-window logits are narrowed back to
    // each window's real token count — so outputs match inputs 1:1.
    #[test]
    fn predict_batch_output_count_matches_input() {
        // Two windows of different lengths: 3 and 5 tokens (incl. [CLS]/[SEP]).
        let w1_ids: Vec<u32> = vec![101, 100, 102];
        let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
        let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
        let w2_tok: Vec<String> = vec![
            "[CLS]".into(),
            "world".into(),
            "foo".into(),
            "bar".into(),
            "[SEP]".into(),
        ];
        let windows: Vec<(Vec<u32>, Vec<String>)> =
            vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];

        let device = Device::Cpu;
        // The longest window fixes the padding target for the whole batch.
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
        assert_eq!(max_len, 5, "max_len deve ser 5");

        // Right-pad every window with zeros up to max_len.
        let mut padded_ids: Vec<Tensor> = Vec::new();
        for (ids, _) in &windows {
            let len = ids.len();
            let pad_right = max_len - len;
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
            let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
            assert_eq!(
                t.dims(),
                &[max_len],
                "cada janela deve ter shape (max_len,) após padding"
            );
            padded_ids.push(t);
        }

        // Stacking along dim 0 yields the (batch, max_len) input tensor.
        let stacked = Tensor::stack(&padded_ids, 0).unwrap();
        assert_eq!(
            stacked.dims(),
            &[2, max_len],
            "stack deve produzir (batch_size=2, max_len=5)"
        );

        // Fake logits with 9 classes per token; narrowing to real_len must
        // discard exactly the padding rows for each window.
        let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9];
        let fake_logits =
            Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
        for (i, (ids, _)) in windows.iter().enumerate() {
            let real_len = ids.len();
            let example = fake_logits.get(i).unwrap();
            let sliced = example.narrow(0, 0, real_len).unwrap();
            assert_eq!(
                sliced.dims(),
                &[real_len, 9],
                "narrow deve preservar apenas {real_len} tokens reais"
            );
        }
    }
1499
1500 #[test]
1501 fn predict_batch_empty_windows_returns_empty() {
1502 let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
1505 let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
1506 assert_eq!(max_len, 0, "zero windows → max_len 0");
1507 let result: Vec<Vec<String>> = if max_len == 0 {
1510 Vec::new()
1511 } else {
1512 unreachable!()
1513 };
1514 assert!(result.is_empty());
1515 }
1516
    // Default batch size is 8 when the env override is absent.
    //
    // NOTE(review): this test and `ner_batch_size_env_override_clamped` both
    // mutate the process-global GRAPHRAG_NER_BATCH_SIZE variable, and Rust
    // runs tests in parallel by default — they can race and flake. Consider
    // serializing them behind a shared lock or a serial-test harness.
    #[test]
    fn ner_batch_size_default_is_8() {
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }
1524
    // Env override is clamped to [1, 32]; in-range values pass through.
    //
    // NOTE(review): mutates the same process-global env var as
    // `ner_batch_size_default_is_8`; parallel test execution can interleave
    // these and flake. Consider a shared lock or --test-threads=1.
    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Above the cap → clamped down to 32.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        // Zero → clamped up to 1.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        // In-range value passes through unchanged.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        // Clean up so later tests see the default.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }
1543
1544 #[test]
1545 fn extraction_method_regex_only_unchanged() {
1546 let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
1549 assert_eq!(
1550 result.extraction_method, "regex-only",
1551 "RegexExtractor deve retornar regex-only"
1552 );
1553 }
1554
1555 #[test]
1558 fn extend_suffix_pure_numeric_unchanged() {
1559 let ents = vec![ExtractedEntity {
1561 name: "GPT".to_string(),
1562 entity_type: "concept".to_string(),
1563 }];
1564 let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
1565 assert_eq!(
1566 result[0].name, "GPT-5",
1567 "sufixo puramente numérico deve ser estendido"
1568 );
1569 }
1570
1571 #[test]
1572 fn extend_suffix_alphanumeric_letter_after_digit() {
1573 let ents = vec![ExtractedEntity {
1575 name: "GPT".to_string(),
1576 entity_type: "concept".to_string(),
1577 }];
1578 let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
1579 assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
1580 }
1581
1582 #[test]
1583 fn extend_suffix_alphanumeric_b_suffix() {
1584 let ents = vec![ExtractedEntity {
1586 name: "Llama".to_string(),
1587 entity_type: "concept".to_string(),
1588 }];
1589 let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
1590 assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
1591 }
1592
1593 #[test]
1594 fn extend_suffix_alphanumeric_x_suffix() {
1595 let ents = vec![ExtractedEntity {
1597 name: "Mistral".to_string(),
1598 entity_type: "concept".to_string(),
1599 }];
1600 let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
1601 assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
1602 }
1603
1604 #[test]
1607 fn augment_versioned_gpt4o() {
1608 let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
1610 assert!(
1611 result.iter().any(|e| e.name == "GPT-4o"),
1612 "GPT-4o deve ser capturado pelo augment, achados: {:?}",
1613 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1614 );
1615 }
1616
1617 #[test]
1618 fn augment_versioned_claude_4_sonnet() {
1619 let result =
1621 augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
1622 assert!(
1623 result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1624 "Claude 4 Sonnet deve ser capturado, achados: {:?}",
1625 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1626 );
1627 }
1628
1629 #[test]
1630 fn augment_versioned_llama_3_pro() {
1631 let result =
1633 augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1634 assert!(
1635 result.iter().any(|e| e.name == "Llama 3 Pro"),
1636 "Llama 3 Pro deve ser capturado, achados: {:?}",
1637 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1638 );
1639 }
1640
1641 #[test]
1642 fn augment_versioned_mixtral_8x7b() {
1643 let result =
1645 augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1646 assert!(
1647 result.iter().any(|e| e.name == "Mixtral 8x7B"),
1648 "Mixtral 8x7B deve ser capturado, achados: {:?}",
1649 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1650 );
1651 }
1652
1653 #[test]
1654 fn augment_versioned_does_not_duplicate_existing() {
1655 let existing = vec![ExtractedEntity {
1657 name: "Claude 4".to_string(),
1658 entity_type: "concept".to_string(),
1659 }];
1660 let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
1661 let count = result.iter().filter(|e| e.name == "Claude 4").count();
1662 assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
1663 }
1664
1665 #[test]
1668 fn stopwords_filter_url_jwt_api_v1025() {
1669 let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1671 let ents = apply_regex_prefilter(body);
1672 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1673 for blocked in &[
1674 "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1675 ] {
1676 assert!(
1677 !names.contains(blocked),
1678 "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1679 );
1680 }
1681 }
1682
1683 #[test]
1686 fn section_markers_etapa_fase_filtered_v1025() {
1687 let body = "Etapa 3 do plano: implementar Fase 1 da Migração.";
1689 let ents = apply_regex_prefilter(body);
1690 assert!(
1691 !ents
1692 .iter()
1693 .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1694 "section markers must be stripped; entities: {:?}",
1695 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1696 );
1697 }
1698
1699 #[test]
1700 fn section_markers_passo_secao_filtered_v1025() {
1701 let body = "Siga Passo 2 conforme Seção 3 do manual.";
1702 let ents = apply_regex_prefilter(body);
1703 assert!(
1704 !ents
1705 .iter()
1706 .any(|e| e.name.contains("Passo") || e.name.contains("Seção")),
1707 "Passo/Seção section markers must be stripped; entities: {:?}",
1708 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1709 );
1710 }
1711
1712 #[test]
1715 fn brand_camelcase_extracted_as_organization_v1025() {
1716 let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1718 let ents = apply_regex_prefilter(body);
1719 let openai = ents.iter().find(|e| e.name == "OpenAI");
1720 assert!(
1721 openai.is_some(),
1722 "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1723 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1724 );
1725 assert_eq!(
1726 openai.unwrap().entity_type,
1727 "organization",
1728 "brand CamelCase must map to organization (V008)"
1729 );
1730 }
1731
1732 #[test]
1733 fn brand_postgresql_extracted_as_organization_v1025() {
1734 let body = "migrating from MySQL to PostgreSQL for better performance.";
1735 let ents = apply_regex_prefilter(body);
1736 assert!(
1737 ents.iter()
1738 .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization"),
1739 "PostgreSQL must be extracted as organization; entities: {:?}",
1740 ents.iter()
1741 .map(|e| (&e.name, &e.entity_type))
1742 .collect::<Vec<_>>()
1743 );
1744 }
1745
1746 #[test]
1749 fn iob_org_maps_to_organization_not_project_v1025() {
1750 let tokens = vec!["Microsoft".to_string()];
1752 let labels = vec!["B-ORG".to_string()];
1753 let ents = iob_to_entities(&tokens, &labels);
1754 assert_eq!(
1755 ents[0].entity_type, "organization",
1756 "B-ORG must map to organization (V008); got {}",
1757 ents[0].entity_type
1758 );
1759 }
1760
1761 #[test]
1762 fn iob_loc_maps_to_location_not_concept_v1025() {
1763 let tokens = vec!["São".to_string(), "Paulo".to_string()];
1765 let labels = vec!["B-LOC".to_string(), "I-LOC".to_string()];
1766 let ents = iob_to_entities(&tokens, &labels);
1767 assert_eq!(
1768 ents[0].entity_type, "location",
1769 "B-LOC must map to location (V008); got {}",
1770 ents[0].entity_type
1771 );
1772 }
1773
1774 #[test]
1775 fn iob_date_maps_to_date_not_discarded_v1025() {
1776 let tokens = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
1778 let labels = vec![
1779 "B-DATE".to_string(),
1780 "I-DATE".to_string(),
1781 "I-DATE".to_string(),
1782 ];
1783 let ents = iob_to_entities(&tokens, &labels);
1784 assert_eq!(
1785 ents.len(),
1786 1,
1787 "DATE entity must be emitted (V008); entities: {ents:?}"
1788 );
1789 assert_eq!(ents[0].entity_type, "date");
1790 }
1791
1792 #[test]
1795 fn pt_verb_le_filtered_as_per_v1025() {
1796 let tokens = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
1798 let labels = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
1799 let ents = iob_to_entities(&tokens, &labels);
1800 assert!(
1801 !ents
1802 .iter()
1803 .any(|e| e.name == "Lê" && e.entity_type == "person"),
1804 "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
1805 );
1806 }
1807
1808 #[test]
1809 fn pt_verb_ver_filtered_as_per_v1025() {
1810 let tokens = vec!["Ver".to_string()];
1812 let labels = vec!["B-PER".to_string()];
1813 let ents = iob_to_entities(&tokens, &labels);
1814 assert!(
1815 ents.is_empty(),
1816 "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
1817 );
1818 }
1819
1820 fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1823 ExtractedEntity {
1824 name: name.to_string(),
1825 entity_type: entity_type.to_string(),
1826 }
1827 }
1828
1829 #[test]
1830 fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1831 let regex = vec![entity("Sonne", "concept")];
1833 let ner = vec![entity("Sonnet", "concept")];
1834 let result = merge_and_deduplicate(regex, ner);
1835 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1836 assert_eq!(result[0].name, "Sonnet");
1837 }
1838
1839 #[test]
1840 fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1841 let regex = vec![
1843 entity("Open", "organization"),
1844 entity("OpenAI", "organization"),
1845 ];
1846 let result = merge_and_deduplicate(regex, vec![]);
1847 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1848 assert_eq!(result[0].name, "OpenAI");
1849 }
1850
1851 #[test]
1852 fn merge_keeps_both_when_no_containment_v1025() {
1853 let regex = vec![entity("Alice", "person"), entity("Bob", "person")];
1855 let result = merge_and_deduplicate(regex, vec![]);
1856 assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1857 }
1858
1859 #[test]
1860 fn merge_respects_entity_type_boundary_v1025() {
1861 let regex = vec![entity("Apple", "organization"), entity("Apple", "concept")];
1863 let result = merge_and_deduplicate(regex, vec![]);
1864 assert_eq!(
1865 result.len(),
1866 2,
1867 "expected 2 entities (different types), got: {result:?}"
1868 );
1869 }
1870
1871 #[test]
1872 fn merge_case_insensitive_dedup_v1025() {
1873 let regex = vec![
1875 entity("OpenAI", "organization"),
1876 entity("openai", "organization"),
1877 ];
1878 let result = merge_and_deduplicate(regex, vec![]);
1879 assert_eq!(
1880 result.len(),
1881 1,
1882 "expected 1 entity after case-insensitive dedup, got: {result:?}"
1883 );
1884 }
1885
1886 #[test]
1889 fn iob_section_marker_etapa_filtered_v1025() {
1890 let tokens = vec!["Etapa".to_string(), "3".to_string()];
1892 let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1893 let ents = iob_to_entities(&tokens, &labels);
1894 assert!(
1895 !ents.iter().any(|e| e.name.contains("Etapa")),
1896 "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
1897 );
1898 }
1899
1900 #[test]
1901 fn iob_section_marker_fase_filtered_v1025() {
1902 let tokens = vec!["Fase".to_string(), "1".to_string()];
1904 let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1905 let ents = iob_to_entities(&tokens, &labels);
1906 assert!(
1907 !ents.iter().any(|e| e.name.contains("Fase")),
1908 "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
1909 );
1910 }
1911}