1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11
12use crate::paths::AppPaths;
13use crate::storage::entities::{NewEntity, NewRelationship};
14
/// Hugging Face Hub repository id of the NER checkpoint fetched by `ensure_model_files`.
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
/// Maximum number of tokens sent to the model in one sliding window.
const MAX_SEQ_LEN: usize = 512;
/// Number of tokens the sliding window advances between consecutive windows.
const STRIDE: usize = 256;
/// Upper bound on entities kept per extraction; also bounds relationship pairing.
const MAX_ENTS: usize = 30;
/// Maximum number of relationships emitted with the same entity as source.
const TOP_K_RELATIONS: usize = 5;
/// Relation label assigned to every generated relationship.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum accepted entity-name length; note it is compared against `str::len()` (bytes).
const MIN_ENTITY_CHARS: usize = 2;
22
// Lazily compiled patterns. Each cell is filled on first use by the accessor of the
// matching name (`regex_email`, `regex_url`, `regex_uuid`, `regex_all_caps`).
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
27
/// Upper-case words (Portuguese and English) that `is_filtered_all_caps` rejects as
/// entity candidates. The list mixes rule keywords (e.g. `MUST`, `NUNCA`), status
/// words and data-format names (e.g. `JSON`, `YAML`).
///
/// Lookup is an exact, case-sensitive match against the whole token.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BUG",
    "CASO",
    "CONFIRMADO",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DISCO",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "MUST",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PROIBIDO",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOOLS",
    "TSV",
    "USAR",
    "VALIDAR",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
100
/// HTTP method names; `is_filtered_all_caps` rejects these as entity candidates too.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
106
107fn is_filtered_all_caps(token: &str) -> bool {
108 let is_identifier = token.contains('_');
110 if is_identifier {
111 return false;
112 }
113 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
114}
115
116fn regex_email() -> &'static Regex {
117 REGEX_EMAIL
118 .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
119}
120
121fn regex_url() -> &'static Regex {
122 REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
123}
124
125fn regex_uuid() -> &'static Regex {
126 REGEX_UUID.get_or_init(|| {
127 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
128 .unwrap()
129 })
130}
131
132fn regex_all_caps() -> &'static Regex {
133 REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
134}
135
/// An entity candidate produced by the regex or NER stage, before conversion
/// to the storage type `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Surface text of the entity.
    pub name: String,
    /// Category label; this file produces `"concept"`, `"person"`, `"project"`,
    /// `"tool"`, or the raw NER tag for tags it does not map.
    pub entity_type: String,
}
141
/// Output of an extraction run: entities, the relationships derived from them,
/// and a tag describing which pipeline produced them.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities found in the text.
    pub entities: Vec<NewEntity>,
    /// Relationships generated between the entities above.
    pub relationships: Vec<NewRelationship>,
    /// `"bert+regex"` when the NER model ran, `"regex-only"` otherwise.
    pub extraction_method: String,
}
150
/// Strategy interface for turning a text body into entities and relationships.
///
/// Implementations must be shareable across threads (`Send + Sync`).
pub trait Extractor: Send + Sync {
    /// Extracts entities and relationships from `body`.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
154
/// The subset of the model's `config.json` needed to build the classification head.
#[derive(Deserialize)]
struct ModelConfig {
    /// Maps a stringified class index to its label; defaults to empty when absent.
    #[serde(default)]
    id2label: HashMap<String, String>,
    /// Width of the encoder output; used as the classifier's input dimension.
    hidden_size: usize,
}
161
/// BERT encoder plus a linear token-classification head used for NER.
struct BertNerModel {
    /// The transformer encoder.
    bert: BertModel,
    /// Linear layer mapping each token's hidden state to per-label logits.
    classifier: Linear,
    /// Device on which input tensors are created.
    device: Device,
    /// Class index to label string (e.g. `B-PER`); indices missing here decode as `"O"`.
    id2label: HashMap<usize, String>,
}
168
169impl BertNerModel {
170 fn load(model_dir: &Path) -> Result<Self> {
171 let config_path = model_dir.join("config.json");
172 let weights_path = model_dir.join("model.safetensors");
173
174 let config_str = std::fs::read_to_string(&config_path)
175 .with_context(|| format!("lendo config.json em {config_path:?}"))?;
176 let model_cfg: ModelConfig =
177 serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
178
179 let id2label: HashMap<usize, String> = model_cfg
180 .id2label
181 .into_iter()
182 .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
183 .collect();
184
185 let num_labels = id2label.len().max(9);
186 let hidden_size = model_cfg.hidden_size;
187
188 let bert_config_str = std::fs::read_to_string(&config_path)
189 .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
190 let bert_cfg: BertConfig =
191 serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
192
193 let device = Device::Cpu;
194
195 let vb = unsafe {
196 VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
197 .with_context(|| format!("mapeando {weights_path:?}"))?
198 };
199 let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
200
201 let cls_vb = vb.pp("classifier");
204 let weight = cls_vb
205 .get((num_labels, hidden_size), "weight")
206 .context("carregando classifier.weight do safetensors")?;
207 let bias = cls_vb
208 .get(num_labels, "bias")
209 .context("carregando classifier.bias do safetensors")?;
210 let classifier = Linear::new(weight, Some(bias));
211
212 Ok(Self {
213 bert,
214 classifier,
215 device,
216 id2label,
217 })
218 }
219
220 fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
221 let len = token_ids.len();
222 let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
223 let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
224
225 let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
226 .context("criando tensor input_ids")?;
227 let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
228 .context("criando tensor token_type_ids")?;
229 let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
230 .context("criando tensor attention_mask")?;
231
232 let sequence_output = self
233 .bert
234 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
235 .context("forward pass do BertModel")?;
236
237 let logits = self
238 .classifier
239 .forward(&sequence_output)
240 .context("forward pass do classificador")?;
241
242 let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
243
244 let num_tokens = logits_2d.dim(0).context("dim(0)")?;
245
246 let mut labels = Vec::with_capacity(num_tokens);
247 for i in 0..num_tokens {
248 let token_logits = logits_2d.get(i).context("get token logits")?;
249 let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
250 let argmax = vec
251 .iter()
252 .enumerate()
253 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
254 .map(|(idx, _)| idx)
255 .unwrap_or(0);
256 let label = self
257 .id2label
258 .get(&argmax)
259 .cloned()
260 .unwrap_or_else(|| "O".to_string());
261 labels.push(label);
262 }
263
264 Ok(labels)
265 }
266}
267
/// Process-wide cache of the model load attempt. `Some` holds the loaded model;
/// `None` records a failed load, which is therefore not retried.
static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
269
270fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
271 NER_MODEL
272 .get_or_init(|| match load_model(paths) {
273 Ok(m) => Some(m),
274 Err(e) => {
275 tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
276 None
277 }
278 })
279 .as_ref()
280}
281
282fn model_dir(paths: &AppPaths) -> PathBuf {
283 paths.models.join("bert-multilingual-ner")
284}
285
286fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
287 let dir = model_dir(paths);
288 std::fs::create_dir_all(&dir)
289 .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;
290
291 let weights = dir.join("model.safetensors");
292 let config = dir.join("config.json");
293 let tokenizer = dir.join("tokenizer.json");
294
295 if weights.exists() && config.exists() && tokenizer.exists() {
296 return Ok(dir);
297 }
298
299 tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
300 crate::output::emit_progress_i18n(
301 "Downloading NER model (first run, ~676 MB)...",
302 "Baixando modelo NER (primeira execução, ~676 MB)...",
303 );
304
305 let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
306 let repo = api.model(MODEL_ID.to_string());
307
308 for (remote, local) in &[
312 ("model.safetensors", "model.safetensors"),
313 ("config.json", "config.json"),
314 ("onnx/tokenizer.json", "tokenizer.json"),
315 ("tokenizer_config.json", "tokenizer_config.json"),
316 ] {
317 let dest = dir.join(local);
318 if !dest.exists() {
319 let src = repo
320 .get(remote)
321 .with_context(|| format!("baixando {remote} do HF Hub"))?;
322 std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
323 }
324 }
325
326 Ok(dir)
327}
328
329fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
330 let dir = ensure_model_files(paths)?;
331 BertNerModel::load(&dir)
332}
333
334fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
335 let mut entities = Vec::new();
336 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
337
338 let add = |entities: &mut Vec<ExtractedEntity>,
339 seen: &mut std::collections::HashSet<String>,
340 name: &str,
341 entity_type: &str| {
342 let name = name.trim().to_string();
343 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
344 entities.push(ExtractedEntity {
345 name,
346 entity_type: entity_type.to_string(),
347 });
348 }
349 };
350
351 for m in regex_email().find_iter(body) {
352 add(&mut entities, &mut seen, m.as_str(), "concept");
354 }
355 for m in regex_url().find_iter(body) {
356 let raw = m.as_str();
359 let cleaned = raw
360 .trim_end_matches('`')
361 .trim_end_matches(',')
362 .trim_end_matches('.')
363 .trim_end_matches(';')
364 .trim_end_matches(')')
365 .trim_end_matches(']')
366 .trim_end_matches('}');
367 add(&mut entities, &mut seen, cleaned, "concept");
368 }
369 for m in regex_uuid().find_iter(body) {
370 add(&mut entities, &mut seen, m.as_str(), "concept");
371 }
372 for m in regex_all_caps().find_iter(body) {
373 let candidate = m.as_str();
374 if !is_filtered_all_caps(candidate) {
376 add(&mut entities, &mut seen, candidate, "concept");
377 }
378 }
379
380 entities
381}
382
383fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
384 let mut entities: Vec<ExtractedEntity> = Vec::new();
385 let mut current_parts: Vec<String> = Vec::new();
386 let mut current_type: Option<String> = None;
387
388 let flush =
389 |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
390 if let Some(t) = typ.take() {
391 let name = parts.join(" ").trim().to_string();
392 let is_single_caps = !name.contains(' ')
396 && name == name.to_uppercase()
397 && name.len() >= MIN_ENTITY_CHARS;
398 let should_skip = is_single_caps && is_filtered_all_caps(&name);
399 if name.len() >= MIN_ENTITY_CHARS && !should_skip {
400 entities.push(ExtractedEntity {
401 name,
402 entity_type: t,
403 });
404 }
405 parts.clear();
406 }
407 };
408
409 for (token, label) in tokens.iter().zip(labels.iter()) {
410 if label == "O" {
411 flush(&mut current_parts, &mut current_type, &mut entities);
412 continue;
413 }
414
415 let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
416 ("B", rest)
417 } else if let Some(rest) = label.strip_prefix("I-") {
418 ("I", rest)
419 } else {
420 flush(&mut current_parts, &mut current_type, &mut entities);
421 continue;
422 };
423
424 let entity_type = match bio_type {
425 "DATE" => {
426 flush(&mut current_parts, &mut current_type, &mut entities);
427 continue;
428 }
429 "PER" => "person",
430 "ORG" => {
431 let t = token.to_lowercase();
432 if t.contains("lib")
433 || t.contains("sdk")
434 || t.contains("cli")
435 || t.contains("crate")
436 || t.contains("npm")
437 {
438 "tool"
439 } else {
440 "project"
441 }
442 }
443 "LOC" => "concept",
444 other => other,
445 };
446
447 if prefix == "B" {
448 if token.starts_with("##") {
449 let clean = token.strip_prefix("##").unwrap_or(token.as_str());
452 if let Some(last) = current_parts.last_mut() {
453 last.push_str(clean);
454 }
455 continue;
456 }
457 flush(&mut current_parts, &mut current_type, &mut entities);
458 current_parts.push(token.clone());
459 current_type = Some(entity_type.to_string());
460 } else if prefix == "I" && current_type.is_some() {
461 let clean = token.strip_prefix("##").unwrap_or(token.as_str());
462 if token.starts_with("##") {
463 if let Some(last) = current_parts.last_mut() {
464 last.push_str(clean);
465 }
466 } else {
467 current_parts.push(clean.to_string());
468 }
469 }
470 }
471
472 flush(&mut current_parts, &mut current_type, &mut entities);
473 entities
474}
475
476fn build_relationships(entities: &[NewEntity]) -> Vec<NewRelationship> {
477 if entities.len() < 2 {
478 return Vec::new();
479 }
480
481 let max_rels = crate::constants::max_relationships_per_memory();
484 let n = entities.len().min(MAX_ENTS);
485 let mut rels: Vec<NewRelationship> = Vec::new();
486 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
487
488 let mut hit_cap = false;
489 'outer: for i in 0..n {
490 if rels.len() >= max_rels {
491 hit_cap = true;
492 break;
493 }
494
495 let mut for_entity = 0usize;
496 for j in (i + 1)..n {
497 if for_entity >= TOP_K_RELATIONS {
498 break;
499 }
500 if rels.len() >= max_rels {
501 hit_cap = true;
502 break 'outer;
503 }
504
505 let src = &entities[i].name;
506 let tgt = &entities[j].name;
507 let key = (src.clone(), tgt.clone());
508
509 if seen.contains(&key) {
510 continue;
511 }
512 seen.insert(key);
513
514 rels.push(NewRelationship {
515 source: src.clone(),
516 target: tgt.clone(),
517 relation: DEFAULT_RELATION.to_string(),
518 strength: 0.5,
519 description: None,
520 });
521 for_entity += 1;
522 }
523 }
524
525 if hit_cap {
527 tracing::warn!(
528 "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
529 n.saturating_sub(1)
530 );
531 }
532
533 rels
534}
535
536fn run_ner_sliding_window(
537 model: &BertNerModel,
538 body: &str,
539 paths: &AppPaths,
540) -> Result<Vec<ExtractedEntity>> {
541 let tokenizer_path = model_dir(paths).join("tokenizer.json");
542 let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
543 .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;
544
545 let encoding = tokenizer
546 .encode(body, false)
547 .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;
548
549 let all_ids: Vec<u32> = encoding.get_ids().to_vec();
550 let all_tokens: Vec<String> = encoding
551 .get_tokens()
552 .iter()
553 .map(|s| s.to_string())
554 .collect();
555
556 if all_ids.is_empty() {
557 return Ok(Vec::new());
558 }
559
560 let mut entities: Vec<ExtractedEntity> = Vec::new();
561 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
562
563 let mut start = 0usize;
564 loop {
565 let end = (start + MAX_SEQ_LEN).min(all_ids.len());
566 let window_ids = &all_ids[start..end];
567 let window_tokens = &all_tokens[start..end];
568 let attention_mask: Vec<u32> = vec![1u32; window_ids.len()];
569
570 match model.predict(window_ids, &attention_mask) {
571 Ok(labels) => {
572 let window_ents = iob_to_entities(window_tokens, &labels);
573 for ent in window_ents {
574 if seen.insert(ent.name.clone()) {
575 entities.push(ent);
576 }
577 }
578 }
579 Err(e) => {
580 tracing::warn!("janela NER falhou (start={start}): {e:#}");
581 }
582 }
583
584 if end >= all_ids.len() {
585 break;
586 }
587 start += STRIDE;
588 }
589
590 Ok(entities)
591}
592
593fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
598 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
599 let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?)").unwrap());
600
601 entities
602 .into_iter()
603 .map(|ent| {
604 if let Some(pos) = body.find(&ent.name) {
606 let after_pos = pos + ent.name.len();
607 if after_pos < body.len() {
608 let after = &body[after_pos..];
609 if let Some(m) = suffix_re.find(after) {
610 let suffix = m.as_str();
611 if suffix.len() <= 6 {
613 let extended = format!("{}{}", ent.name, suffix);
614 return ExtractedEntity {
615 name: extended,
616 entity_type: ent.entity_type,
617 };
618 }
619 }
620 }
621 }
622 ent
623 })
624 .collect()
625}
626
627fn augment_versioned_model_names(
643 entities: Vec<ExtractedEntity>,
644 body: &str,
645) -> Vec<ExtractedEntity> {
646 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
647 let model_re = VERSIONED_MODEL_RE
648 .get_or_init(|| Regex::new(r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?)\b").unwrap());
649
650 let mut existing_lc: std::collections::HashSet<String> =
651 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
652 let mut result = entities;
653
654 for caps in model_re.captures_iter(body) {
655 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
656 if full_match.is_empty() || full_match.len() > 24 {
659 continue;
660 }
661 let normalized_lc = full_match.to_lowercase();
662 if existing_lc.contains(&normalized_lc) {
663 continue;
664 }
665 if result.len() >= MAX_ENTS {
668 break;
669 }
670 existing_lc.insert(normalized_lc);
671 result.push(ExtractedEntity {
672 name: full_match.to_string(),
673 entity_type: "concept".to_string(),
674 });
675 }
676
677 result
678}
679
680fn merge_and_deduplicate(
681 regex_ents: Vec<ExtractedEntity>,
682 ner_ents: Vec<ExtractedEntity>,
683) -> Vec<ExtractedEntity> {
684 let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
691 let mut result: Vec<ExtractedEntity> = Vec::new();
692 let mut truncated = false;
693
694 let total_input = regex_ents.len() + ner_ents.len();
695 for ent in regex_ents.into_iter().chain(ner_ents) {
696 let key = ent.name.to_lowercase();
697 let mut collision_idx: Option<usize> = None;
701 for (existing_key, idx) in &by_lc {
702 if existing_key == &key
703 || existing_key.starts_with(&key)
704 || key.starts_with(existing_key)
705 {
706 collision_idx = Some(*idx);
707 break;
708 }
709 }
710 match collision_idx {
711 Some(idx) => {
712 if ent.name.len() > result[idx].name.len() {
716 let old_key = result[idx].name.to_lowercase();
717 by_lc.remove(&old_key);
718 result[idx] = ent;
719 by_lc.insert(key, idx);
720 }
721 }
722 None => {
723 by_lc.insert(key, result.len());
724 result.push(ent);
725 }
726 }
727 if result.len() >= MAX_ENTS {
728 truncated = true;
729 break;
730 }
731 }
732
733 if truncated {
735 tracing::warn!(
736 "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
737 );
738 }
739
740 result
741}
742
743fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
744 extracted
745 .into_iter()
746 .map(|e| NewEntity {
747 name: e.name,
748 entity_type: e.entity_type,
749 description: None,
750 })
751 .collect()
752}
753
754pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
755 let regex_entities = apply_regex_prefilter(body);
756
757 let mut bert_used = false;
758 let ner_entities = match get_or_init_model(paths) {
759 Some(model) => match run_ner_sliding_window(model, body, paths) {
760 Ok(ents) => {
761 bert_used = true;
762 ents
763 }
764 Err(e) => {
765 tracing::warn!("NER falhou, usando apenas regex: {e:#}");
766 Vec::new()
767 }
768 },
769 None => Vec::new(),
770 };
771
772 let merged = merge_and_deduplicate(regex_entities, ner_entities);
773 let extended = extend_with_numeric_suffix(merged, body);
775 let with_models = augment_versioned_model_names(extended, body);
779 let entities = to_new_entities(with_models);
780 let relationships = build_relationships(&entities);
781
782 let extraction_method = if bert_used {
783 "bert+regex".to_string()
784 } else {
785 "regex-only".to_string()
786 };
787
788 Ok(ExtractionResult {
789 entities,
790 relationships,
791 extraction_method,
792 })
793}
794
/// Stateless extractor that uses only the regex prefilter and never loads the NER model.
pub struct RegexExtractor;
796
797impl Extractor for RegexExtractor {
798 fn extract(&self, body: &str) -> Result<ExtractionResult> {
799 let regex_entities = apply_regex_prefilter(body);
800 let entities = to_new_entities(regex_entities);
801 let relationships = build_relationships(&entities);
802 Ok(ExtractionResult {
803 entities,
804 relationships,
805 extraction_method: "regex-only".to_string(),
806 })
807 }
808}
809
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an `AppPaths` pointing at fixed locations under `/tmp`.
    fn make_paths() -> AppPaths {
        use std::path::PathBuf;
        AppPaths {
            db: PathBuf::from("/tmp/test.sqlite"),
            models: PathBuf::from("/tmp/test_models"),
        }
    }

    /// An e-mail address is captured whole and typed as "concept".
    #[test]
    fn regex_email_captura_endereco() {
        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
        assert!(ents
            .iter()
            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
    }

    /// Portuguese rule keywords in upper case are stopwords, not entities.
    #[test]
    fn regex_all_caps_filtra_palavra_regra_pt() {
        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
        assert!(
            !ents.iter().any(|e| e.name == "NUNCA"),
            "NUNCA deveria ser filtrado como stopword"
        );
        assert!(
            !ents.iter().any(|e| e.name == "PROIBIDO"),
            "PROIBIDO deveria ser filtrado"
        );
        assert!(
            !ents.iter().any(|e| e.name == "DEVE"),
            "DEVE deveria ser filtrado"
        );
    }

    /// Upper-case identifiers containing an underscore are kept.
    #[test]
    fn regex_all_caps_aceita_constante_com_underscore() {
        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
    }

    /// Upper-case acronyms that are not on the stopword list are kept.
    #[test]
    fn regex_all_caps_aceita_acronimo_dominio() {
        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
        assert!(ents.iter().any(|e| e.name == "OPENAI"));
        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
    }

    /// An https URL is captured and typed as "concept".
    #[test]
    fn regex_url_captura_link() {
        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
        assert!(ents
            .iter()
            .any(|e| e.name.starts_with("https://") && e.entity_type == "concept"));
    }

    /// Text containing a UUID yields at least one "concept" entity.
    #[test]
    fn regex_uuid_captura_identificador() {
        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
        assert!(ents.iter().any(|e| e.entity_type == "concept"));
    }

    /// Upper-case constants are captured even without an assignment after them.
    #[test]
    fn regex_all_caps_captura_constante() {
        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
    }

    /// The upper-case pattern requires at least three characters, so "AI" is not matched.
    #[test]
    fn regex_all_caps_ignora_palavras_curtas() {
        let ents = apply_regex_prefilter("use AI em seu projeto");
        assert!(
            !ents.iter().any(|e| e.name == "AI"),
            "AI tem apenas 2 chars, deve ser ignorado"
        );
    }

    /// B-PER followed by I-PER forms a single entity typed "person".
    #[test]
    fn iob_decodifica_per_para_person() {
        let tokens = vec![
            "John".to_string(),
            "Doe".to_string(),
            "trabalhou".to_string(),
        ];
        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1);
        assert_eq!(ents[0].entity_type, "person");
        assert!(ents[0].name.contains("John"));
    }

    /// A "##" word piece tagged B- must not start a new entity of its own.
    #[test]
    fn iob_strip_subword_b_prefix() {
        let tokens = vec!["Open".to_string(), "##AI".to_string()];
        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
            "deveria mergear ##AI ou descartar"
        );
    }

    /// A "##" word piece with no entity in progress is dropped.
    #[test]
    fn iob_subword_orphan_descarta() {
        let tokens = vec!["##AI".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.is_empty(),
            "subword órfão sem entidade ativa deve ser descartado"
        );
    }

    /// DATE tags never produce entities.
    #[test]
    fn iob_descarta_date() {
        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(ents.is_empty(), "DATE deve ser descartado");
    }

    /// ORG maps to "project" when the token has no tool-like substring.
    #[test]
    fn iob_mapeia_org_para_project() {
        let tokens = vec!["Empresa".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "project");
    }

    /// ORG maps to "tool" when the token contains "sdk".
    #[test]
    fn iob_mapeia_org_sdk_para_tool() {
        let tokens = vec!["tokio-sdk".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "tool");
    }

    /// LOC maps to "concept".
    #[test]
    fn iob_mapeia_loc_para_concept() {
        let tokens = vec!["Brasil".to_string()];
        let labels = vec!["B-LOC".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "concept");
    }

    /// The number of relationships never exceeds the configured per-memory maximum.
    #[test]
    fn build_relationships_respeitam_max_rels() {
        let entities: Vec<NewEntity> = (0..20)
            .map(|i| NewEntity {
                name: format!("entidade_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
    }

    /// No (source, target) pair is emitted twice.
    #[test]
    fn build_relationships_sem_duplicatas() {
        let entities: Vec<NewEntity> = (0..5)
            .map(|i| NewEntity {
                name: format!("ent_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let mut pares: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        for r in &rels {
            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "par duplicado encontrado");
        }
    }

    /// Names that differ only in letter case are merged into one entity.
    #[test]
    fn merge_deduplica_por_nome_lowercase() {
        let a = vec![ExtractedEntity {
            name: "Rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let b = vec![ExtractedEntity {
            name: "rust".to_string(),
            entity_type: "tool".to_string(),
        }];
        let merged = merge_and_deduplicate(a, b);
        assert_eq!(merged.len(), 1, "rust e Rust são a mesma entidade");
    }

    /// `RegexExtractor` works through the `Extractor` trait and finds entities.
    #[test]
    fn regex_extractor_implementa_trait() {
        let extractor = RegexExtractor;
        let result = extractor
            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
            .unwrap();
        assert!(!result.entities.is_empty());
    }

    /// `extract_graph_auto` still returns the regex entities when NER is unavailable.
    ///
    /// NOTE(review): with no model cached under `/tmp/test_models`, this call goes
    /// through `ensure_model_files`, which may try to download the model — confirm
    /// this test is meant to be safe to run offline.
    #[test]
    fn extract_retorna_ok_sem_modelo() {
        let paths = make_paths();
        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
        let result = extract_graph_auto(body, &paths).unwrap();
        assert!(result
            .entities
            .iter()
            .any(|e| e.name.contains("teste@exemplo.com")));
    }
}