1use std::collections::HashMap;
7use std::path::{Path, PathBuf};
8use std::sync::OnceLock;
9
10use anyhow::{Context, Result};
11use candle_core::{DType, Device, Tensor};
12use candle_nn::{Linear, Module, VarBuilder};
13use candle_transformers::models::bert::{BertModel, Config as BertConfig};
14use regex::Regex;
15use serde::Deserialize;
16use unicode_normalization::UnicodeNormalization;
17
18use crate::paths::AppPaths;
19use crate::storage::entities::{NewEntity, NewRelationship};
20
/// HF Hub repo id of the multilingual NER model downloaded on first run.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Maximum tokens per BERT window (positional-embedding limit of the model).
const MAX_SEQ_LEN: usize = 512;
/// Token step between consecutive sliding windows (512-token windows overlap by 256).
const STRIDE: usize = 256;
/// Hard cap on entities kept after merge/dedup and augmentation.
const MAX_ENTS: usize = 30;
/// Max co-occurrence relationships emitted per source entity.
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for every generated relationship.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum byte length for an extracted entity name to be kept.
const MIN_ENTITY_CHARS: usize = 2;
28
// Process-wide regexes, compiled lazily exactly once via OnceLock and
// accessed through the `regex_*()` helper functions below.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
37
/// ALL-CAPS words (Portuguese and English) that look like constants/acronyms
/// to the all-caps regex but are ordinary emphasis or markup words, and must
/// never be emitted as entities.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE", "ACK", "ACL", "ACRESCENTADO", "ADICIONAR", "AGENTS",
    "ALL", "ALTA", "ALWAYS", "API", "ARTEFATOS", "ATIVO",
    "BAIXA", "BANCO", "BORDA", "BLOQUEAR", "BUG",
    "CAPÍTULO", "CASO", "CHECKLIST", "CLI", "COMPLETED", "CONFIRMADO",
    "CONFIRME", "CONTRATO", "CRÍTICO", "CRITICAL", "CSV",
    "DEVE", "DEVEMOS", "DISCO", "DONE",
    "EFEITO", "ENTRADA", "ERROR", "ESSA", "ESSE", "ESSENCIAL",
    "ESTA", "ESTE", "ETAPA", "EVITAR", "EXPANDIR", "EXPOR",
    "FALHA", "FASE", "FIXED", "FIXME", "FORBIDDEN",
    "HACK", "HEARTBEAT", "HTTP", "HTTPS",
    "INATIVO", "JAMAIS", "JSON", "JWT", "LLM", "MUST",
    "NEGUE", "NEVER", "NOTE", "NUNCA", "OBRIGATÓRIO",
    "PADRÃO", "PASSO", "PENDING", "PLAN", "PODEMOS", "PROIBIDO",
    "RECUSE", "REGRAS", "REQUIRED", "REQUISITO", "REST",
    "SEÇÃO", "SEMPRE", "SHALL", "SHOULD", "SOUL",
    "TODAS", "TODO", "TODOS", "TOKEN", "TOOLS", "TSV",
    "UI", "URL", "USAR", "VALIDAR", "VAMOS", "VOCÊ",
    "WARNING", "XML", "YAML",
];

/// HTTP verbs, filtered for the same reason as the stopwords above.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];

/// Returns true when an ALL-CAPS token should be dropped as noise.
///
/// Tokens containing '_' are SCREAMING_SNAKE identifiers (real constants
/// like MAX_RETRY) and are always kept.
fn is_filtered_all_caps(token: &str) -> bool {
    if token.contains('_') {
        return false;
    }
    ALL_CAPS_STOPWORDS
        .iter()
        .chain(HTTP_METHODS.iter())
        .any(|&stop| stop == token)
}
163
164fn regex_email() -> &'static Regex {
165 REGEX_EMAIL
166 .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
167}
168
169fn regex_url() -> &'static Regex {
170 REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
171}
172
173fn regex_uuid() -> &'static Regex {
174 REGEX_UUID.get_or_init(|| {
175 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
176 .unwrap()
177 })
178}
179
180fn regex_all_caps() -> &'static Regex {
181 REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
182}
183
184fn regex_section_marker() -> &'static Regex {
185 REGEX_SECTION_MARKER.get_or_init(|| {
186 Regex::new(r"\b(?:Etapa|Fase|Passo|Seção|Capítulo)\s+\d+\b").unwrap()
188 })
189}
190
191fn regex_brand_camel() -> &'static Regex {
192 REGEX_BRAND_CAMEL.get_or_init(|| {
193 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b").unwrap()
196 })
197}
198
/// Entity candidate produced by the regex prefilter or the NER pass, before
/// conversion to the storage type `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    // Surface form as found in the text (trimmed).
    pub name: String,
    // Coarse tag assigned by the extractors: "person", "organization",
    // "location", "date", "tool" or "concept" (unknown NER tags pass through).
    pub entity_type: String,
}

/// A URL found in the body, with its original position.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    // URL text after trailing punctuation has been trimmed.
    pub url: String,
    // Byte offset of the raw match start within the source body.
    pub offset: usize,
}

/// Full output of one extraction run over a memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    // True when the per-memory relationship cap was hit and pairs were dropped.
    pub relationships_truncated: bool,
    // Pipeline that produced the result: "bert+regex-batch" or "regex-only".
    pub extraction_method: String,
    pub urls: Vec<ExtractedUrl>,
}

/// Common interface over extraction strategies (regex-only, BERT+regex).
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
230
/// Minimal slice of the HF `config.json` needed to wire the classifier head.
#[derive(Deserialize)]
struct ModelConfig {
    // Label-id (stringified int) → IOB tag name (e.g. "1" → "B-PER");
    // defaults to empty when the key is absent.
    #[serde(default)]
    id2label: HashMap<String, String>,
    hidden_size: usize,
}

/// BERT encoder plus linear token-classification head, run on CPU.
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    device: Device,
    // Label-id → IOB tag, parsed from config.json in `load`.
    id2label: HashMap<usize, String>,
}
244
impl BertNerModel {
    /// Loads the BERT encoder and the token-classification head from
    /// `model_dir`, which must contain `config.json` and `model.safetensors`.
    /// Every failing step carries context identifying the file/operation.
    fn load(model_dir: &Path) -> Result<Self> {
        let config_path = model_dir.join("config.json");
        let weights_path = model_dir.join("model.safetensors");

        let config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("lendo config.json em {config_path:?}"))?;
        let model_cfg: ModelConfig =
            serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;

        // HF configs key id2label by stringified ints; silently drop any
        // key that does not parse as usize.
        let id2label: HashMap<usize, String> = model_cfg
            .id2label
            .into_iter()
            .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
            .collect();

        // Floor of 9 labels when id2label is missing/partial — presumably
        // the model's standard O + B/I tag layout; TODO confirm against the
        // published config of this checkpoint.
        let num_labels = id2label.len().max(9);
        let hidden_size = model_cfg.hidden_size;

        // The same file is parsed a second time, now as candle's BertConfig.
        let bert_config_str = std::fs::read_to_string(&config_path)
            .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
        let bert_cfg: BertConfig =
            serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;

        // CPU-only inference.
        let device = Device::Cpu;

        // SAFETY: from_mmaped_safetensors memory-maps the weights file; this
        // is sound as long as the file is not truncated/modified while mapped.
        let vb = unsafe {
            VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
                .with_context(|| format!("mapeando {weights_path:?}"))?
        };
        let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;

        // candle's BertModel carries no classification head, so the
        // "classifier" weight/bias tensors are loaded by hand.
        let cls_vb = vb.pp("classifier");
        let weight = cls_vb
            .get((num_labels, hidden_size), "weight")
            .context("carregando classifier.weight do safetensors")?;
        let bias = cls_vb
            .get(num_labels, "bias")
            .context("carregando classifier.bias do safetensors")?;
        let classifier = Linear::new(weight, Some(bias));

        Ok(Self {
            bert,
            classifier,
            device,
            id2label,
        })
    }

    /// Runs NER over a single window, returning one IOB label per token.
    ///
    /// Decoding is greedy argmax over the classifier logits; label ids not
    /// present in `id2label` decode to "O".
    fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
        let len = token_ids.len();
        let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
        let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();

        // Shape everything as a batch of one: (1, len).
        let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
            .context("criando tensor input_ids")?;
        let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
            .context("criando tensor token_type_ids")?;
        let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
            .context("criando tensor attention_mask")?;

        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass do BertModel")?;

        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass do classificador")?;

        let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;

        let num_tokens = logits_2d.dim(0).context("dim(0)")?;

        let mut labels = Vec::with_capacity(num_tokens);
        for i in 0..num_tokens {
            let token_logits = logits_2d.get(i).context("get token logits")?;
            let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
            // NOTE(review): partial_cmp().unwrap() panics on NaN logits;
            // assumed unreachable with finite model outputs.
            let argmax = vec
                .iter()
                .enumerate()
                .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                .map(|(idx, _)| idx)
                .unwrap_or(0);
            let label = self
                .id2label
                .get(&argmax)
                .cloned()
                .unwrap_or_else(|| "O".to_string());
            labels.push(label);
        }

        Ok(labels)
    }

    /// Batched variant of `predict`: pads every window to the longest one in
    /// the batch, masks the padding, runs a single forward pass, then decodes
    /// only each window's real (unpadded) tokens.
    ///
    /// The `Vec<String>` half of each tuple (token texts) is unused here; it
    /// travels with the ids so callers can pair labels back to tokens.
    fn predict_batch(&self, windows: &[(Vec<u32>, Vec<String>)]) -> Result<Vec<Vec<String>>> {
        let batch_size = windows.len();
        let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
        if max_len == 0 {
            return Ok(vec![vec![]; batch_size]);
        }

        let mut padded_ids: Vec<Tensor> = Vec::with_capacity(batch_size);
        let mut padded_masks: Vec<Tensor> = Vec::with_capacity(batch_size);

        for (ids, _) in windows {
            let len = ids.len();
            let pad_right = max_len - len;

            // Right-pad ids with zeros up to max_len.
            let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
            let t = Tensor::from_vec(ids_i64, len, &self.device)
                .context("criando tensor de ids para batch")?;
            let t = t
                .pad_with_zeros(0, 0, pad_right)
                .context("padding tensor de ids")?;
            padded_ids.push(t);

            // Mask: 1 for real tokens, 0 for padding.
            let mut mask_i64 = vec![1i64; len];
            mask_i64.extend(vec![0i64; pad_right]);
            let m = Tensor::from_vec(mask_i64, max_len, &self.device)
                .context("criando tensor de máscara para batch")?;
            padded_masks.push(m);
        }

        let input_ids = Tensor::stack(&padded_ids, 0).context("stack input_ids")?;
        let attn_mask = Tensor::stack(&padded_masks, 0).context("stack attn_mask")?;
        let token_type_ids = Tensor::zeros((batch_size, max_len), DType::I64, &self.device)
            .context("criando token_type_ids batch")?;

        let sequence_output = self
            .bert
            .forward(&input_ids, &token_type_ids, Some(&attn_mask))
            .context("forward pass batch BertModel")?;
        let logits = self
            .classifier
            .forward(&sequence_output)
            .context("forward pass batch classificador")?;
        let mut results = Vec::with_capacity(batch_size);
        for (i, (window_ids, _)) in windows.iter().enumerate() {
            let example_logits = logits.get(i).context("get logits exemplo")?;
            // Drop the padded tail: only the window's real tokens are decoded.
            let real_len = window_ids.len();
            let example_slice = example_logits
                .narrow(0, 0, real_len)
                .context("narrow para tokens reais")?;
            let logits_2d: Vec<Vec<f32>> = example_slice.to_vec2().context("to_vec2 logits")?;

            // Same greedy argmax decoding as `predict`.
            let labels: Vec<String> = logits_2d
                .iter()
                .map(|token_logits| {
                    let argmax = token_logits
                        .iter()
                        .enumerate()
                        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
                        .map(|(idx, _)| idx)
                        .unwrap_or(0);
                    self.id2label
                        .get(&argmax)
                        .cloned()
                        .unwrap_or_else(|| "O".to_string())
                })
                .collect();

            results.push(labels);
        }

        Ok(results)
    }
}
433
434static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
435
436fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
437 NER_MODEL
438 .get_or_init(|| match load_model(paths) {
439 Ok(m) => Some(m),
440 Err(e) => {
441 tracing::warn!("NER model unavailable (graceful degradation): {e:#}");
442 None
443 }
444 })
445 .as_ref()
446}
447
448fn model_dir(paths: &AppPaths) -> PathBuf {
449 paths.models.join("bert-multilingual-ner")
450}
451
452fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
453 let dir = model_dir(paths);
454 std::fs::create_dir_all(&dir)
455 .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
456
457 let weights = dir.join("model.safetensors");
458 let config = dir.join("config.json");
459 let tokenizer = dir.join("tokenizer.json");
460
461 if weights.exists() && config.exists() && tokenizer.exists() {
462 return Ok(dir);
463 }
464
465 tracing::info!("Downloading NER model (first run, ~676 MB)...");
466 crate::output::emit_progress_i18n(
467 "Downloading NER model (first run, ~676 MB)...",
468 "Baixando modelo NER (primeira execução, ~676 MB)...",
469 );
470
471 let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
472 let repo = api.model(MODEL_ID.to_string());
473
474 for (remote, local) in &[
478 ("model.safetensors", "model.safetensors"),
479 ("config.json", "config.json"),
480 ("onnx/tokenizer.json", "tokenizer.json"),
481 ("tokenizer_config.json", "tokenizer_config.json"),
482 ] {
483 let dest = dir.join(local);
484 if !dest.exists() {
485 let src = repo
486 .get(remote)
487 .with_context(|| format!("baixando {remote} do HF Hub"))?;
488 std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
489 }
490 }
491
492 Ok(dir)
493}
494
495fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
496 let dir = ensure_model_files(paths)?;
497 BertNerModel::load(&dir)
498}
499
500fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
501 let mut entities = Vec::new();
502 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
503
504 let add = |entities: &mut Vec<ExtractedEntity>,
505 seen: &mut std::collections::HashSet<String>,
506 name: &str,
507 entity_type: &str| {
508 let name = name.trim().to_string();
509 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
510 entities.push(ExtractedEntity {
511 name,
512 entity_type: entity_type.to_string(),
513 });
514 }
515 };
516
517 let cleaned = regex_section_marker().replace_all(body, " ");
520 let cleaned = cleaned.as_ref();
521
522 for m in regex_email().find_iter(cleaned) {
523 add(&mut entities, &mut seen, m.as_str(), "concept");
525 }
526 for m in regex_uuid().find_iter(cleaned) {
527 add(&mut entities, &mut seen, m.as_str(), "concept");
528 }
529 for m in regex_all_caps().find_iter(cleaned) {
530 let candidate = m.as_str();
531 if !is_filtered_all_caps(candidate) {
533 add(&mut entities, &mut seen, candidate, "concept");
534 }
535 }
536 for m in regex_brand_camel().find_iter(cleaned) {
539 let name = m.as_str();
540 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
542 add(&mut entities, &mut seen, name, "organization");
543 }
544 }
545
546 entities
547}
548
549pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
553 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
554 let mut result = Vec::new();
555 for m in regex_url().find_iter(body) {
556 let raw = m.as_str();
557 let cleaned = raw
558 .trim_end_matches('`')
559 .trim_end_matches(',')
560 .trim_end_matches('.')
561 .trim_end_matches(';')
562 .trim_end_matches(')')
563 .trim_end_matches(']')
564 .trim_end_matches('}');
565 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
566 result.push(ExtractedUrl {
567 url: cleaned.to_string(),
568 offset: m.start(),
569 });
570 }
571 }
572 result
573}
574
/// Decodes parallel token/IOB-label sequences into entities: "B-X" starts an
/// entity, "I-X" continues it, "O" (or anything unrecognized) ends it.
/// WordPiece subwords ("##...") are glued onto the preceding part, and the
/// model's tags (PER/ORG/LOC/DATE) are mapped to storage entity types.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Closes the in-progress entity (if any): joins its parts with spaces,
    // drops single ALL-CAPS stopwords and section markers, resets the buffer.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // A one-word, fully-uppercase name goes through the same
                // stopword filter the regex pass uses.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                let is_section_marker = regex_section_marker().is_match(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip && !is_section_marker {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        // "O" (outside) ends any open entity.
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Unknown label shape: treat it like "O".
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        // Capitalized Portuguese verbs/short words the model tends to
        // mislabel as PER; scoped const, no per-iteration cost.
        const PT_VERB_FALSE_POSITIVES: &[&str] = &[
            "Lê", "Vê", "Cá", "Pôr", "Ser", "Vir", "Ver", "Dar", "Ler", "Ter",
        ];

        let entity_type = match bio_type {
            "DATE" => "date",
            "PER" => {
                if PT_VERB_FALSE_POSITIVES.contains(&token.as_str()) {
                    flush(&mut current_parts, &mut current_type, &mut entities);
                    continue;
                }
                "person"
            }
            "ORG" => {
                // Heuristic: ORG tokens that look like software artifacts
                // (lib/sdk/cli/crate/npm substrings) become "tool" instead.
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    "organization"
                }
            }
            "LOC" => "location",
            // Any other tag passes through verbatim as the entity type.
            other => other,
        };

        if prefix == "B" {
            if token.starts_with("##") {
                // "B-" on a subword: glue it onto the previous part, or drop
                // it when there is no open entity (orphan subword).
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // Continuation: subwords are glued to the last part, whole words
            // appended as a new part.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Close whatever entity is still open at end of sequence.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
686
687fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
691 if entities.len() < 2 {
692 return (Vec::new(), false);
693 }
694
695 let max_rels = crate::constants::max_relationships_per_memory();
698 let n = entities.len().min(MAX_ENTS);
699 let mut rels: Vec<NewRelationship> = Vec::new();
700 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
701
702 let mut hit_cap = false;
703 'outer: for i in 0..n {
704 if rels.len() >= max_rels {
705 hit_cap = true;
706 break;
707 }
708
709 let mut for_entity = 0usize;
710 for j in (i + 1)..n {
711 if for_entity >= TOP_K_RELATIONS {
712 break;
713 }
714 if rels.len() >= max_rels {
715 hit_cap = true;
716 break 'outer;
717 }
718
719 let src = &entities[i].name;
720 let tgt = &entities[j].name;
721 let key = (src.clone(), tgt.clone());
722
723 if seen.contains(&key) {
724 continue;
725 }
726 seen.insert(key);
727
728 rels.push(NewRelationship {
729 source: src.clone(),
730 target: tgt.clone(),
731 relation: DEFAULT_RELATION.to_string(),
732 strength: 0.5,
733 description: None,
734 });
735 for_entity += 1;
736 }
737 }
738
739 if hit_cap {
741 tracing::warn!(
742 "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
743 n.saturating_sub(1)
744 );
745 }
746
747 (rels, hit_cap)
748}
749
/// Tokenizes `body` and runs NER over overlapping windows of MAX_SEQ_LEN
/// tokens (step STRIDE), batching windows for throughput. A failed batch
/// falls back to per-window inference; a failed window is skipped. Entities
/// are deduplicated by exact name (first occurrence wins).
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    // tokenizers' error type is not anyhow-compatible; wrap it manually.
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    // Overlapping windows: each starts STRIDE tokens after the previous, so
    // an entity cut at one window's edge is seen whole by the next window.
    let mut windows: Vec<(Vec<u32>, Vec<String>)> = Vec::new();
    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        windows.push((
            all_ids[start..end].to_vec(),
            all_tokens[start..end].to_vec(),
        ));
        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    // Sorting by token count groups similarly-sized windows into the same
    // chunk, reducing padding waste in predict_batch.
    windows.sort_by_key(|(ids, _)| ids.len());

    let batch_size = crate::constants::ner_batch_size();
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    for chunk in windows.chunks(batch_size) {
        match model.predict_batch(chunk) {
            Ok(batch_labels) => {
                for (labels, (_, tokens)) in batch_labels.iter().zip(chunk.iter()) {
                    for ent in iob_to_entities(tokens, labels) {
                        if seen.insert(ent.name.clone()) {
                            entities.push(ent);
                        }
                    }
                }
            }
            Err(e) => {
                // Batch failure is not fatal: retry each window on its own.
                tracing::warn!(
                    "batch NER falhou (chunk de {} janelas): {e:#} — fallback single-window",
                    chunk.len()
                );
                for (ids, tokens) in chunk {
                    // All tokens are real in single-window mode (no padding).
                    let mask = vec![1u32; ids.len()];
                    match model.predict(ids, &mask) {
                        Ok(labels) => {
                            for ent in iob_to_entities(tokens, &labels) {
                                if seen.insert(ent.name.clone()) {
                                    entities.push(ent);
                                }
                            }
                        }
                        Err(e2) => {
                            // Give up on this window only; others still run.
                            tracing::warn!("NER window fallback also failed: {e2:#}");
                        }
                    }
                }
            }
        }
    }

    Ok(entities)
}
835
836fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
843 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
844 let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)").unwrap());
847
848 entities
849 .into_iter()
850 .map(|ent| {
851 if let Some(pos) = body.find(&ent.name) {
853 let after_pos = pos + ent.name.len();
854 if after_pos < body.len() {
855 let after = &body[after_pos..];
856 if let Some(m) = suffix_re.find(after) {
857 let suffix = m.as_str();
858 if suffix.len() <= 7 {
861 let extended = format!("{}{}", ent.name, suffix);
862 return ExtractedEntity {
863 name: extended,
864 entity_type: ent.entity_type,
865 };
866 }
867 }
868 }
869 }
870 ent
871 })
872 .collect()
873}
874
875fn augment_versioned_model_names(
895 entities: Vec<ExtractedEntity>,
896 body: &str,
897) -> Vec<ExtractedEntity> {
898 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
899 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
906 Regex::new(
907 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
908 )
909 .unwrap()
910 });
911
912 let mut existing_lc: std::collections::HashSet<String> =
913 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
914 let mut result = entities;
915
916 for caps in model_re.captures_iter(body) {
917 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
918 if full_match.is_empty() || full_match.len() > 24 {
921 continue;
922 }
923 let normalized_lc = full_match.to_lowercase();
924 if existing_lc.contains(&normalized_lc) {
925 continue;
926 }
927 if result.len() >= MAX_ENTS {
930 break;
931 }
932 existing_lc.insert(normalized_lc);
933 result.push(ExtractedEntity {
934 name: full_match.to_string(),
935 entity_type: "concept".to_string(),
936 });
937 }
938
939 result
940}
941
/// Merges regex and NER candidates (regex first), deduplicating
/// case-insensitively (NFKC-lowercased) within each entity type. Two names
/// of the same type collide when they are equal or one contains the other;
/// on collision the longer surface form wins. Output is capped at MAX_ENTS.
fn merge_and_deduplicate(
    regex_ents: Vec<ExtractedEntity>,
    ner_ents: Vec<ExtractedEntity>,
) -> Vec<ExtractedEntity> {
    // Key format "type\0name_lc" → index into `result`; '\0' cannot occur
    // in either field, making the key unambiguous.
    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut result: Vec<ExtractedEntity> = Vec::new();
    let mut truncated = false;

    let total_input = regex_ents.len() + ner_ents.len();
    for ent in regex_ents.into_iter().chain(ner_ents) {
        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
        let key = format!("{}\0{}", ent.entity_type, name_lc);

        // Linear scan for a same-type entry whose name contains (or is
        // contained by) this one. Quadratic overall, but bounded by MAX_ENTS.
        let mut collision_idx: Option<usize> = None;
        for (existing_key, idx) in &by_lc {
            let type_prefix = format!("{}\0", ent.entity_type);
            if !existing_key.starts_with(&type_prefix) {
                continue;
            }
            let existing_name_lc = &existing_key[type_prefix.len()..];
            if existing_name_lc == name_lc
                || existing_name_lc.contains(name_lc.as_str())
                || name_lc.contains(existing_name_lc)
            {
                collision_idx = Some(*idx);
                break;
            }
        }
        match collision_idx {
            Some(idx) => {
                // On collision keep the longer (more specific) surface form,
                // swapping the index entry to the new key.
                if ent.name.len() > result[idx].name.len() {
                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
                    let old_key = format!("{}\0{}", result[idx].entity_type, old_name_lc);
                    by_lc.remove(&old_key);
                    result[idx] = ent;
                    by_lc.insert(key, idx);
                }
            }
            None => {
                by_lc.insert(key, result.len());
                result.push(ent);
            }
        }
        if result.len() >= MAX_ENTS {
            truncated = true;
            break;
        }
    }

    if truncated {
        tracing::warn!(
            "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
        );
    }

    result
}
1024
1025fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1026 extracted
1027 .into_iter()
1028 .map(|e| NewEntity {
1029 name: e.name,
1030 entity_type: e.entity_type,
1031 description: None,
1032 })
1033 .collect()
1034}
1035
1036pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
1037 let regex_entities = apply_regex_prefilter(body);
1038
1039 let mut bert_used = false;
1040 let ner_entities = match get_or_init_model(paths) {
1041 Some(model) => match run_ner_sliding_window(model, body, paths) {
1042 Ok(ents) => {
1043 bert_used = true;
1044 ents
1045 }
1046 Err(e) => {
1047 tracing::warn!("NER falhou, usando apenas regex: {e:#}");
1048 Vec::new()
1049 }
1050 },
1051 None => Vec::new(),
1052 };
1053
1054 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1055 let extended = extend_with_numeric_suffix(merged, body);
1057 let with_models = augment_versioned_model_names(extended, body);
1061 let with_models: Vec<ExtractedEntity> = with_models
1065 .into_iter()
1066 .filter(|e| !regex_section_marker().is_match(&e.name))
1067 .collect();
1068 let entities = to_new_entities(with_models);
1069 let (relationships, relationships_truncated) = build_relationships(&entities);
1070
1071 let extraction_method = if bert_used {
1072 "bert+regex-batch".to_string()
1073 } else {
1074 "regex-only".to_string()
1075 };
1076
1077 let urls = extract_urls(body);
1078
1079 Ok(ExtractionResult {
1080 entities,
1081 relationships,
1082 relationships_truncated,
1083 extraction_method,
1084 urls,
1085 })
1086}
1087
1088pub struct RegexExtractor;
1089
1090impl Extractor for RegexExtractor {
1091 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1092 let regex_entities = apply_regex_prefilter(body);
1093 let entities = to_new_entities(regex_entities);
1094 let (relationships, relationships_truncated) = build_relationships(&entities);
1095 let urls = extract_urls(body);
1096 Ok(ExtractionResult {
1097 entities,
1098 relationships,
1099 relationships_truncated,
1100 extraction_method: "regex-only".to_string(),
1101 urls,
1102 })
1103 }
1104}
1105
1106#[cfg(test)]
1107mod tests {
1108 use super::*;
1109
1110 fn make_paths() -> AppPaths {
1111 use std::path::PathBuf;
1112 AppPaths {
1113 db: PathBuf::from("/tmp/test.sqlite"),
1114 models: PathBuf::from("/tmp/test_models"),
1115 }
1116 }
1117
1118 #[test]
1119 fn regex_email_captura_endereco() {
1120 let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
1121 assert!(ents
1123 .iter()
1124 .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
1125 }
1126
1127 #[test]
1128 fn regex_all_caps_filtra_palavra_regra_pt() {
1129 let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
1131 assert!(
1132 !ents.iter().any(|e| e.name == "NUNCA"),
1133 "NUNCA deveria ser filtrado como stopword"
1134 );
1135 assert!(
1136 !ents.iter().any(|e| e.name == "PROIBIDO"),
1137 "PROIBIDO deveria ser filtrado"
1138 );
1139 assert!(
1140 !ents.iter().any(|e| e.name == "DEVE"),
1141 "DEVE deveria ser filtrado"
1142 );
1143 }
1144
1145 #[test]
1146 fn regex_all_caps_aceita_constante_com_underscore() {
1147 let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
1149 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1150 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1151 }
1152
1153 #[test]
1154 fn regex_all_caps_aceita_acronimo_dominio() {
1155 let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
1157 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1158 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1159 }
1160
1161 #[test]
1162 fn regex_url_nao_aparece_em_apply_regex_prefilter() {
1163 let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
1165 assert!(
1166 !ents.iter().any(|e| e.name.starts_with("https://")),
1167 "URLs não devem aparecer como entidades após split P0-2"
1168 );
1169 }
1170
1171 #[test]
1172 fn extract_urls_captura_https() {
1173 let urls = extract_urls("veja https://docs.rs/crate para detalhes");
1174 assert_eq!(urls.len(), 1);
1175 assert_eq!(urls[0].url, "https://docs.rs/crate");
1176 assert!(urls[0].offset > 0);
1177 }
1178
1179 #[test]
1180 fn extract_urls_trim_sufixo_pontuacao() {
1181 let urls = extract_urls("link: https://example.com/path. fim");
1182 assert!(!urls.is_empty());
1183 assert!(
1184 !urls[0].url.ends_with('.'),
1185 "sufixo ponto deve ser removido"
1186 );
1187 }
1188
1189 #[test]
1190 fn extract_urls_deduplica_repetidas() {
1191 let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1192 let urls = extract_urls(body);
1193 assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1194 }
1195
1196 #[test]
1197 fn regex_uuid_captura_identificador() {
1198 let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1199 assert!(ents.iter().any(|e| e.entity_type == "concept"));
1200 }
1201
1202 #[test]
1203 fn regex_all_caps_captura_constante() {
1204 let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1205 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1206 assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1207 }
1208
1209 #[test]
1210 fn regex_all_caps_ignora_palavras_curtas() {
1211 let ents = apply_regex_prefilter("use AI em seu projeto");
1212 assert!(
1213 !ents.iter().any(|e| e.name == "AI"),
1214 "AI tem apenas 2 chars, deve ser ignorado"
1215 );
1216 }
1217
1218 #[test]
1219 fn iob_decodifica_per_para_person() {
1220 let tokens = vec![
1221 "John".to_string(),
1222 "Doe".to_string(),
1223 "trabalhou".to_string(),
1224 ];
1225 let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
1226 let ents = iob_to_entities(&tokens, &labels);
1227 assert_eq!(ents.len(), 1);
1228 assert_eq!(ents[0].entity_type, "person");
1229 assert!(ents[0].name.contains("John"));
1230 }
1231
1232 #[test]
1233 fn iob_strip_subword_b_prefix() {
1234 let tokens = vec!["Open".to_string(), "##AI".to_string()];
1237 let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
1238 let ents = iob_to_entities(&tokens, &labels);
1239 assert!(
1240 ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
1241 "deveria mergear ##AI ou descartar"
1242 );
1243 }
1244
1245 #[test]
1246 fn iob_subword_orphan_descarta() {
1247 let tokens = vec!["##AI".to_string()];
1249 let labels = vec!["B-ORG".to_string()];
1250 let ents = iob_to_entities(&tokens, &labels);
1251 assert!(
1252 ents.is_empty(),
1253 "subword órfão sem entidade ativa deve ser descartado"
1254 );
1255 }
1256
1257 #[test]
1258 fn iob_mapeia_date_para_date_v1025() {
1259 let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
1261 let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
1262 let ents = iob_to_entities(&tokens, &labels);
1263 assert_eq!(ents.len(), 1, "DATE deve ser emitido como entidade v1.0.25");
1264 assert_eq!(ents[0].entity_type, "date");
1265 }
1266
1267 #[test]
1268 fn iob_mapeia_org_para_organization_v1025() {
1269 let tokens = vec!["Empresa".to_string()];
1271 let labels = vec!["B-ORG".to_string()];
1272 let ents = iob_to_entities(&tokens, &labels);
1273 assert_eq!(ents[0].entity_type, "organization");
1274 }
1275
1276 #[test]
1277 fn iob_mapeia_org_sdk_para_tool() {
1278 let tokens = vec!["tokio-sdk".to_string()];
1279 let labels = vec!["B-ORG".to_string()];
1280 let ents = iob_to_entities(&tokens, &labels);
1281 assert_eq!(ents[0].entity_type, "tool");
1282 }
1283
1284 #[test]
1285 fn iob_mapeia_loc_para_location_v1025() {
1286 let tokens = vec!["Brasil".to_string()];
1288 let labels = vec!["B-LOC".to_string()];
1289 let ents = iob_to_entities(&tokens, &labels);
1290 assert_eq!(ents[0].entity_type, "location");
1291 }
1292
1293 #[test]
1294 fn build_relationships_respeitam_max_rels() {
1295 let entities: Vec<NewEntity> = (0..20)
1296 .map(|i| NewEntity {
1297 name: format!("entidade_{i}"),
1298 entity_type: "concept".to_string(),
1299 description: None,
1300 })
1301 .collect();
1302 let (rels, truncated) = build_relationships(&entities);
1303 let max_rels = crate::constants::max_relationships_per_memory();
1304 assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1305 if rels.len() == max_rels {
1306 assert!(truncated, "truncated deve ser true quando atingiu o cap");
1307 }
1308 }
1309
1310 #[test]
1311 fn build_relationships_sem_duplicatas() {
1312 let entities: Vec<NewEntity> = (0..5)
1313 .map(|i| NewEntity {
1314 name: format!("ent_{i}"),
1315 entity_type: "concept".to_string(),
1316 description: None,
1317 })
1318 .collect();
1319 let (rels, _truncated) = build_relationships(&entities);
1320 let mut pares: std::collections::HashSet<(String, String)> =
1321 std::collections::HashSet::new();
1322 for r in &rels {
1323 let par = (r.source.clone(), r.target.clone());
1324 assert!(pares.insert(par), "par duplicado encontrado");
1325 }
1326 }
1327
1328 #[test]
1329 fn merge_deduplica_por_nome_lowercase() {
1330 let a = vec![ExtractedEntity {
1333 name: "Rust".to_string(),
1334 entity_type: "concept".to_string(),
1335 }];
1336 let b = vec![ExtractedEntity {
1337 name: "rust".to_string(),
1338 entity_type: "concept".to_string(),
1339 }];
1340 let merged = merge_and_deduplicate(a, b);
1341 assert_eq!(
1342 merged.len(),
1343 1,
1344 "rust e Rust com mesmo tipo são a mesma entidade"
1345 );
1346 }
1347
1348 #[test]
1349 fn regex_extractor_implementa_trait() {
1350 let extractor = RegexExtractor;
1351 let result = extractor
1352 .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1353 .unwrap();
1354 assert!(!result.entities.is_empty());
1355 }
1356
1357 #[test]
1358 fn extract_retorna_ok_sem_modelo() {
1359 let paths = make_paths();
1361 let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1362 let result = extract_graph_auto(body, &paths).unwrap();
1363 assert!(result
1364 .entities
1365 .iter()
1366 .any(|e| e.name.contains("teste@exemplo.com")));
1367 }
1368
1369 #[test]
1370 fn stopwords_filter_v1024_terms() {
1371 let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1374 DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1375 let ents = apply_regex_prefilter(body);
1376 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1377 for word in &[
1378 "ACEITE",
1379 "ACK",
1380 "ACL",
1381 "BORDA",
1382 "CHECKLIST",
1383 "COMPLETED",
1384 "CONFIRME",
1385 "DEVEMOS",
1386 "DONE",
1387 "FIXED",
1388 "NEGUE",
1389 "PENDING",
1390 "PLAN",
1391 "PODEMOS",
1392 "RECUSE",
1393 "TOKEN",
1394 "VAMOS",
1395 ] {
1396 assert!(
1397 !names.contains(word),
1398 "v1.0.24 stopword {word} should be filtered but was found in entities"
1399 );
1400 }
1401 }
1402
1403 #[test]
1404 fn dedup_normalizes_unicode_combining_marks() {
1405 let nfc = vec![ExtractedEntity {
1409 name: "Café".to_string(),
1410 entity_type: "concept".to_string(),
1411 }];
1412 let nfd_name = "Cafe\u{301}".to_string();
1414 let nfd = vec![ExtractedEntity {
1415 name: nfd_name,
1416 entity_type: "concept".to_string(),
1417 }];
1418 let merged = merge_and_deduplicate(nfc, nfd);
1419 assert_eq!(
1420 merged.len(),
1421 1,
1422 "NFC 'Café' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1423 );
1424 }
1425
1426 #[test]
1429 fn predict_batch_output_count_matches_input() {
1430 let w1_ids: Vec<u32> = vec![101, 100, 102];
1436 let w1_tok: Vec<String> = vec!["[CLS]".into(), "hello".into(), "[SEP]".into()];
1437 let w2_ids: Vec<u32> = vec![101, 100, 200, 300, 102];
1438 let w2_tok: Vec<String> = vec![
1439 "[CLS]".into(),
1440 "world".into(),
1441 "foo".into(),
1442 "bar".into(),
1443 "[SEP]".into(),
1444 ];
1445 let windows: Vec<(Vec<u32>, Vec<String>)> =
1446 vec![(w1_ids.clone(), w1_tok), (w2_ids.clone(), w2_tok)];
1447
1448 let device = Device::Cpu;
1451 let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap();
1452 assert_eq!(max_len, 5, "max_len deve ser 5");
1453
1454 let mut padded_ids: Vec<Tensor> = Vec::new();
1455 for (ids, _) in &windows {
1456 let len = ids.len();
1457 let pad_right = max_len - len;
1458 let ids_i64: Vec<i64> = ids.iter().map(|&x| x as i64).collect();
1459 let t = Tensor::from_vec(ids_i64, len, &device).unwrap();
1460 let t = t.pad_with_zeros(0, 0, pad_right).unwrap();
1461 assert_eq!(
1462 t.dims(),
1463 &[max_len],
1464 "cada janela deve ter shape (max_len,) após padding"
1465 );
1466 padded_ids.push(t);
1467 }
1468
1469 let stacked = Tensor::stack(&padded_ids, 0).unwrap();
1470 assert_eq!(
1471 stacked.dims(),
1472 &[2, max_len],
1473 "stack deve produzir (batch_size=2, max_len=5)"
1474 );
1475
1476 let fake_logits_data: Vec<f32> = vec![0.0f32; 2 * max_len * 9]; let fake_logits =
1480 Tensor::from_vec(fake_logits_data, (2usize, max_len, 9usize), &device).unwrap();
1481 for (i, (ids, _)) in windows.iter().enumerate() {
1482 let real_len = ids.len();
1483 let example = fake_logits.get(i).unwrap();
1484 let sliced = example.narrow(0, 0, real_len).unwrap();
1485 assert_eq!(
1486 sliced.dims(),
1487 &[real_len, 9],
1488 "narrow deve preservar apenas {real_len} tokens reais"
1489 );
1490 }
1491 }
1492
1493 #[test]
1494 fn predict_batch_empty_windows_returns_empty() {
1495 let windows: Vec<(Vec<u32>, Vec<String>)> = vec![];
1498 let max_len = windows.iter().map(|(ids, _)| ids.len()).max().unwrap_or(0);
1499 assert_eq!(max_len, 0, "zero windows → max_len 0");
1500 let result: Vec<Vec<String>> = if max_len == 0 {
1503 Vec::new()
1504 } else {
1505 unreachable!()
1506 };
1507 assert!(result.is_empty());
1508 }
1509
    // NOTE(review): this test mutates process-wide environment state, and the
    // default Rust test harness runs tests on parallel threads — it can race
    // with `ner_batch_size_env_override_clamped`, which sets the same
    // variable. Consider a shared module-level lock or `--test-threads=1`.
    #[test]
    fn ner_batch_size_default_is_8() {
        // With the override variable unset, the batch size must fall back to 8.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
        assert_eq!(crate::constants::ner_batch_size(), 8);
    }
1517
    // NOTE(review): mutates process-wide env vars; races with
    // `ner_batch_size_default_is_8` under the default parallel test runner,
    // and `std::env::set_var` is considered unsafe in multi-threaded programs
    // on newer Rust editions — a shared lock would make this deterministic.
    #[test]
    fn ner_batch_size_env_override_clamped() {
        // Values above the ceiling clamp down to 32.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "64");
        assert_eq!(crate::constants::ner_batch_size(), 32, "deve clampar em 32");

        // Zero clamps up to the minimum of 1.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "0");
        assert_eq!(crate::constants::ner_batch_size(), 1, "deve clampar em 1");

        // An in-range value passes through unchanged.
        std::env::set_var("GRAPHRAG_NER_BATCH_SIZE", "4");
        assert_eq!(
            crate::constants::ner_batch_size(),
            4,
            "valor válido preservado"
        );

        // Clean up so later tests see the default again.
        std::env::remove_var("GRAPHRAG_NER_BATCH_SIZE");
    }
1536
1537 #[test]
1538 fn extraction_method_regex_only_unchanged() {
1539 let result = RegexExtractor.extract("contato: dev@acme.io").unwrap();
1542 assert_eq!(
1543 result.extraction_method, "regex-only",
1544 "RegexExtractor deve retornar regex-only"
1545 );
1546 }
1547
1548 #[test]
1551 fn extend_suffix_pure_numeric_unchanged() {
1552 let ents = vec![ExtractedEntity {
1554 name: "GPT".to_string(),
1555 entity_type: "concept".to_string(),
1556 }];
1557 let result = extend_with_numeric_suffix(ents, "usando GPT-5 no projeto");
1558 assert_eq!(
1559 result[0].name, "GPT-5",
1560 "sufixo puramente numérico deve ser estendido"
1561 );
1562 }
1563
1564 #[test]
1565 fn extend_suffix_alphanumeric_letter_after_digit() {
1566 let ents = vec![ExtractedEntity {
1568 name: "GPT".to_string(),
1569 entity_type: "concept".to_string(),
1570 }];
1571 let result = extend_with_numeric_suffix(ents, "usando GPT-4o para tarefas avançadas");
1572 assert_eq!(result[0].name, "GPT-4o", "sufixo '4o' deve ser aceito");
1573 }
1574
1575 #[test]
1576 fn extend_suffix_alphanumeric_b_suffix() {
1577 let ents = vec![ExtractedEntity {
1579 name: "Llama".to_string(),
1580 entity_type: "concept".to_string(),
1581 }];
1582 let result = extend_with_numeric_suffix(ents, "modelo Llama-5b open-weight");
1583 assert_eq!(result[0].name, "Llama-5b", "sufixo '5b' deve ser aceito");
1584 }
1585
1586 #[test]
1587 fn extend_suffix_alphanumeric_x_suffix() {
1588 let ents = vec![ExtractedEntity {
1590 name: "Mistral".to_string(),
1591 entity_type: "concept".to_string(),
1592 }];
1593 let result = extend_with_numeric_suffix(ents, "testando Mistral-8x em produção");
1594 assert_eq!(result[0].name, "Mistral-8x", "sufixo '8x' deve ser aceito");
1595 }
1596
1597 #[test]
1600 fn augment_versioned_gpt4o() {
1601 let result = augment_versioned_model_names(vec![], "usando GPT-4o para análise");
1603 assert!(
1604 result.iter().any(|e| e.name == "GPT-4o"),
1605 "GPT-4o deve ser capturado pelo augment, achados: {:?}",
1606 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1607 );
1608 }
1609
1610 #[test]
1611 fn augment_versioned_claude_4_sonnet() {
1612 let result =
1614 augment_versioned_model_names(vec![], "melhor modelo: Claude 4 Sonnet lançado hoje");
1615 assert!(
1616 result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1617 "Claude 4 Sonnet deve ser capturado, achados: {:?}",
1618 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1619 );
1620 }
1621
1622 #[test]
1623 fn augment_versioned_llama_3_pro() {
1624 let result =
1626 augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1627 assert!(
1628 result.iter().any(|e| e.name == "Llama 3 Pro"),
1629 "Llama 3 Pro deve ser capturado, achados: {:?}",
1630 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1631 );
1632 }
1633
1634 #[test]
1635 fn augment_versioned_mixtral_8x7b() {
1636 let result =
1638 augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1639 assert!(
1640 result.iter().any(|e| e.name == "Mixtral 8x7B"),
1641 "Mixtral 8x7B deve ser capturado, achados: {:?}",
1642 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1643 );
1644 }
1645
1646 #[test]
1647 fn augment_versioned_does_not_duplicate_existing() {
1648 let existing = vec![ExtractedEntity {
1650 name: "Claude 4".to_string(),
1651 entity_type: "concept".to_string(),
1652 }];
1653 let result = augment_versioned_model_names(existing, "usando Claude 4 no projeto");
1654 let count = result.iter().filter(|e| e.name == "Claude 4").count();
1655 assert_eq!(count, 1, "Claude 4 não deve ser duplicado");
1656 }
1657
1658 #[test]
1661 fn stopwords_filter_url_jwt_api_v1025() {
1662 let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1664 let ents = apply_regex_prefilter(body);
1665 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1666 for blocked in &[
1667 "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1668 ] {
1669 assert!(
1670 !names.contains(blocked),
1671 "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1672 );
1673 }
1674 }
1675
1676 #[test]
1679 fn section_markers_etapa_fase_filtered_v1025() {
1680 let body = "Etapa 3 do plano: implementar Fase 1 da Migração.";
1682 let ents = apply_regex_prefilter(body);
1683 assert!(
1684 !ents
1685 .iter()
1686 .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1687 "section markers must be stripped; entities: {:?}",
1688 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1689 );
1690 }
1691
1692 #[test]
1693 fn section_markers_passo_secao_filtered_v1025() {
1694 let body = "Siga Passo 2 conforme Seção 3 do manual.";
1695 let ents = apply_regex_prefilter(body);
1696 assert!(
1697 !ents
1698 .iter()
1699 .any(|e| e.name.contains("Passo") || e.name.contains("Seção")),
1700 "Passo/Seção section markers must be stripped; entities: {:?}",
1701 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1702 );
1703 }
1704
1705 #[test]
1708 fn brand_camelcase_extracted_as_organization_v1025() {
1709 let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1711 let ents = apply_regex_prefilter(body);
1712 let openai = ents.iter().find(|e| e.name == "OpenAI");
1713 assert!(
1714 openai.is_some(),
1715 "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1716 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1717 );
1718 assert_eq!(
1719 openai.unwrap().entity_type,
1720 "organization",
1721 "brand CamelCase must map to organization (V008)"
1722 );
1723 }
1724
1725 #[test]
1726 fn brand_postgresql_extracted_as_organization_v1025() {
1727 let body = "migrating from MySQL to PostgreSQL for better performance.";
1728 let ents = apply_regex_prefilter(body);
1729 assert!(
1730 ents.iter()
1731 .any(|e| e.name == "PostgreSQL" && e.entity_type == "organization"),
1732 "PostgreSQL must be extracted as organization; entities: {:?}",
1733 ents.iter()
1734 .map(|e| (&e.name, &e.entity_type))
1735 .collect::<Vec<_>>()
1736 );
1737 }
1738
1739 #[test]
1742 fn iob_org_maps_to_organization_not_project_v1025() {
1743 let tokens = vec!["Microsoft".to_string()];
1745 let labels = vec!["B-ORG".to_string()];
1746 let ents = iob_to_entities(&tokens, &labels);
1747 assert_eq!(
1748 ents[0].entity_type, "organization",
1749 "B-ORG must map to organization (V008); got {}",
1750 ents[0].entity_type
1751 );
1752 }
1753
1754 #[test]
1755 fn iob_loc_maps_to_location_not_concept_v1025() {
1756 let tokens = vec!["São".to_string(), "Paulo".to_string()];
1758 let labels = vec!["B-LOC".to_string(), "I-LOC".to_string()];
1759 let ents = iob_to_entities(&tokens, &labels);
1760 assert_eq!(
1761 ents[0].entity_type, "location",
1762 "B-LOC must map to location (V008); got {}",
1763 ents[0].entity_type
1764 );
1765 }
1766
1767 #[test]
1768 fn iob_date_maps_to_date_not_discarded_v1025() {
1769 let tokens = vec!["2025".to_string(), "-".to_string(), "12".to_string()];
1771 let labels = vec![
1772 "B-DATE".to_string(),
1773 "I-DATE".to_string(),
1774 "I-DATE".to_string(),
1775 ];
1776 let ents = iob_to_entities(&tokens, &labels);
1777 assert_eq!(
1778 ents.len(),
1779 1,
1780 "DATE entity must be emitted (V008); entities: {ents:?}"
1781 );
1782 assert_eq!(ents[0].entity_type, "date");
1783 }
1784
1785 #[test]
1788 fn pt_verb_le_filtered_as_per_v1025() {
1789 let tokens = vec!["Lê".to_string(), "o".to_string(), "livro".to_string()];
1791 let labels = vec!["B-PER".to_string(), "O".to_string(), "O".to_string()];
1792 let ents = iob_to_entities(&tokens, &labels);
1793 assert!(
1794 !ents
1795 .iter()
1796 .any(|e| e.name == "Lê" && e.entity_type == "person"),
1797 "PT verb 'Lê' tagged B-PER must be filtered; entities: {ents:?}"
1798 );
1799 }
1800
1801 #[test]
1802 fn pt_verb_ver_filtered_as_per_v1025() {
1803 let tokens = vec!["Ver".to_string()];
1805 let labels = vec!["B-PER".to_string()];
1806 let ents = iob_to_entities(&tokens, &labels);
1807 assert!(
1808 ents.is_empty(),
1809 "PT verb 'Ver' tagged B-PER must be filtered; entities: {ents:?}"
1810 );
1811 }
1812
1813 fn entity(name: &str, entity_type: &str) -> ExtractedEntity {
1816 ExtractedEntity {
1817 name: name.to_string(),
1818 entity_type: entity_type.to_string(),
1819 }
1820 }
1821
1822 #[test]
1823 fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1824 let regex = vec![entity("Sonne", "concept")];
1826 let ner = vec![entity("Sonnet", "concept")];
1827 let result = merge_and_deduplicate(regex, ner);
1828 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1829 assert_eq!(result[0].name, "Sonnet");
1830 }
1831
1832 #[test]
1833 fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1834 let regex = vec![
1836 entity("Open", "organization"),
1837 entity("OpenAI", "organization"),
1838 ];
1839 let result = merge_and_deduplicate(regex, vec![]);
1840 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1841 assert_eq!(result[0].name, "OpenAI");
1842 }
1843
1844 #[test]
1845 fn merge_keeps_both_when_no_containment_v1025() {
1846 let regex = vec![entity("Alice", "person"), entity("Bob", "person")];
1848 let result = merge_and_deduplicate(regex, vec![]);
1849 assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1850 }
1851
1852 #[test]
1853 fn merge_respects_entity_type_boundary_v1025() {
1854 let regex = vec![entity("Apple", "organization"), entity("Apple", "concept")];
1856 let result = merge_and_deduplicate(regex, vec![]);
1857 assert_eq!(
1858 result.len(),
1859 2,
1860 "expected 2 entities (different types), got: {result:?}"
1861 );
1862 }
1863
1864 #[test]
1865 fn merge_case_insensitive_dedup_v1025() {
1866 let regex = vec![
1868 entity("OpenAI", "organization"),
1869 entity("openai", "organization"),
1870 ];
1871 let result = merge_and_deduplicate(regex, vec![]);
1872 assert_eq!(
1873 result.len(),
1874 1,
1875 "expected 1 entity after case-insensitive dedup, got: {result:?}"
1876 );
1877 }
1878
1879 #[test]
1882 fn iob_section_marker_etapa_filtered_v1025() {
1883 let tokens = vec!["Etapa".to_string(), "3".to_string()];
1885 let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1886 let ents = iob_to_entities(&tokens, &labels);
1887 assert!(
1888 !ents.iter().any(|e| e.name.contains("Etapa")),
1889 "section marker 'Etapa 3' from BERT must be filtered; entities: {ents:?}"
1890 );
1891 }
1892
1893 #[test]
1894 fn iob_section_marker_fase_filtered_v1025() {
1895 let tokens = vec!["Fase".to_string(), "1".to_string()];
1897 let labels = vec!["B-MISC".to_string(), "I-MISC".to_string()];
1898 let ents = iob_to_entities(&tokens, &labels);
1899 assert!(
1900 !ents.iter().any(|e| e.name.contains("Fase")),
1901 "section marker 'Fase 1' from BERT must be filtered; entities: {ents:?}"
1902 );
1903 }
1904}