1use std::collections::HashMap;
2use std::path::{Path, PathBuf};
3use std::sync::OnceLock;
4
5use anyhow::{Context, Result};
6use candle_core::{DType, Device, Tensor};
7use candle_nn::{Linear, Module, VarBuilder};
8use candle_transformers::models::bert::{BertModel, Config as BertConfig};
9use regex::Regex;
10use serde::Deserialize;
11
12use crate::paths::AppPaths;
13use crate::storage::entities::{NewEntity, NewRelationship};
14
// Hugging Face repo of the multilingual NER model (PER/ORG/LOC/DATE tags).
const MODEL_ID: &str = "Davlan/bert-base-multilingual-cased-ner-hrl";
// BERT positional-embedding limit; sliding windows never exceed this.
const MAX_SEQ_LEN: usize = 512;
// Window step in tokens; consecutive windows overlap by MAX_SEQ_LEN - STRIDE.
const STRIDE: usize = 256;
// Hard cap on entities kept after merge/deduplication.
const MAX_ENTS: usize = 30;
// Max relationships emitted per source entity in build_relationships.
const TOP_K_RELATIONS: usize = 5;
// Relation label used for plain co-occurrence edges.
const DEFAULT_RELATION: &str = "mentions";
// Entity names shorter than this (in bytes) are discarded.
const MIN_ENTITY_CHARS: usize = 2;

// Lazily-compiled prefilter regexes, one compilation per process.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
27
// ALL-CAPS words (Portuguese and English) that commonly appear in prose or
// task notes ("NUNCA", "TODO", "REQUIRED", format acronyms, ...) and must
// NOT be treated as entities. Kept roughly alphabetical for maintenance.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACRESCENTADO",
    "ADICIONAR",
    "AGENTS",
    "ALL",
    "ALTA",
    "ALWAYS",
    "ARTEFATOS",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BUG",
    "CASO",
    "CONFIRMADO",
    "CONTRATO",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DEVE",
    "DISCO",
    "EFEITO",
    "ENTRADA",
    "ERROR",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTE",
    "EVITAR",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FIXME",
    "FORBIDDEN",
    "HACK",
    "HEARTBEAT",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "MUST",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATÓRIO",
    "PADRÃO",
    "PROIBIDO",
    "REGRAS",
    "REQUIRED",
    "REQUISITO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOOLS",
    "TSV",
    "USAR",
    "VALIDAR",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
100
// HTTP verbs frequently written in caps in docs/notes; never entities.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
106
107fn is_filtered_all_caps(token: &str) -> bool {
108 let is_identifier = token.contains('_');
110 if is_identifier {
111 return false;
112 }
113 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
114}
115
116fn regex_email() -> &'static Regex {
117 REGEX_EMAIL
118 .get_or_init(|| Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}").unwrap())
119}
120
121fn regex_url() -> &'static Regex {
122 REGEX_URL.get_or_init(|| Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#).unwrap())
123}
124
125fn regex_uuid() -> &'static Regex {
126 REGEX_UUID.get_or_init(|| {
127 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
128 .unwrap()
129 })
130}
131
132fn regex_all_caps() -> &'static Regex {
133 REGEX_ALL_CAPS.get_or_init(|| Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b").unwrap())
134}
135
/// Intermediate entity produced by the regex prefilter or the NER decoder,
/// before conversion into a storage-layer `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    pub name: String,
    // Free-form type tag; this module emits "person", "project", "tool",
    // "concept" (plus raw NER tags for unknown labels).
    pub entity_type: String,
}
141
/// Final output of an extraction pass: storage-ready entities, the
/// co-occurrence relationships between them, and which method produced
/// them ("bert+regex" or "regex-only").
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    pub extraction_method: String,
}
150
/// Strategy interface for graph extraction from free text.
/// `Send + Sync` so implementations can be shared across threads.
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
154
/// Minimal view of the model's `config.json`: only the fields needed to
/// size and label the token-classification head.
#[derive(Deserialize)]
struct ModelConfig {
    // Label map keyed by stringified index, e.g. "0" -> "B-PER".
    // Defaults to empty if the config ships without it.
    #[serde(default)]
    id2label: HashMap<String, String>,
    hidden_size: usize,
}
161
/// BERT encoder plus a linear token-classification head, ready for
/// CPU inference, with the numeric-index -> IOB-label map.
struct BertNerModel {
    bert: BertModel,
    classifier: Linear,
    device: Device,
    id2label: HashMap<usize, String>,
}
168
169impl BertNerModel {
170 fn load(model_dir: &Path) -> Result<Self> {
171 let config_path = model_dir.join("config.json");
172 let weights_path = model_dir.join("model.safetensors");
173
174 let config_str = std::fs::read_to_string(&config_path)
175 .with_context(|| format!("lendo config.json em {config_path:?}"))?;
176 let model_cfg: ModelConfig =
177 serde_json::from_str(&config_str).context("parseando config.json do modelo NER")?;
178
179 let id2label: HashMap<usize, String> = model_cfg
180 .id2label
181 .into_iter()
182 .filter_map(|(k, v)| k.parse::<usize>().ok().map(|n| (n, v)))
183 .collect();
184
185 let num_labels = id2label.len().max(9);
186 let hidden_size = model_cfg.hidden_size;
187
188 let bert_config_str = std::fs::read_to_string(&config_path)
189 .with_context(|| format!("relendo config.json para bert em {config_path:?}"))?;
190 let bert_cfg: BertConfig =
191 serde_json::from_str(&bert_config_str).context("parseando BertConfig")?;
192
193 let device = Device::Cpu;
194
195 let vb = unsafe {
196 VarBuilder::from_mmaped_safetensors(&[&weights_path], DType::F32, &device)
197 .with_context(|| format!("mapeando {weights_path:?}"))?
198 };
199 let bert = BertModel::load(vb.pp("bert"), &bert_cfg).context("carregando BertModel")?;
200
201 let cls_vb = vb.pp("classifier");
204 let weight = cls_vb
205 .get((num_labels, hidden_size), "weight")
206 .context("carregando classifier.weight do safetensors")?;
207 let bias = cls_vb
208 .get(num_labels, "bias")
209 .context("carregando classifier.bias do safetensors")?;
210 let classifier = Linear::new(weight, Some(bias));
211
212 Ok(Self {
213 bert,
214 classifier,
215 device,
216 id2label,
217 })
218 }
219
220 fn predict(&self, token_ids: &[u32], attention_mask: &[u32]) -> Result<Vec<String>> {
221 let len = token_ids.len();
222 let ids_i64: Vec<i64> = token_ids.iter().map(|&x| x as i64).collect();
223 let mask_i64: Vec<i64> = attention_mask.iter().map(|&x| x as i64).collect();
224
225 let input_ids = Tensor::from_vec(ids_i64, (1, len), &self.device)
226 .context("criando tensor input_ids")?;
227 let token_type_ids = Tensor::zeros((1, len), DType::I64, &self.device)
228 .context("criando tensor token_type_ids")?;
229 let attn_mask = Tensor::from_vec(mask_i64, (1, len), &self.device)
230 .context("criando tensor attention_mask")?;
231
232 let sequence_output = self
233 .bert
234 .forward(&input_ids, &token_type_ids, Some(&attn_mask))
235 .context("forward pass do BertModel")?;
236
237 let logits = self
238 .classifier
239 .forward(&sequence_output)
240 .context("forward pass do classificador")?;
241
242 let logits_2d = logits.squeeze(0).context("removendo dimensão batch")?;
243
244 let num_tokens = logits_2d.dim(0).context("dim(0)")?;
245
246 let mut labels = Vec::with_capacity(num_tokens);
247 for i in 0..num_tokens {
248 let token_logits = logits_2d.get(i).context("get token logits")?;
249 let vec: Vec<f32> = token_logits.to_vec1().context("to_vec1 logits")?;
250 let argmax = vec
251 .iter()
252 .enumerate()
253 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
254 .map(|(idx, _)| idx)
255 .unwrap_or(0);
256 let label = self
257 .id2label
258 .get(&argmax)
259 .cloned()
260 .unwrap_or_else(|| "O".to_string());
261 labels.push(label);
262 }
263
264 Ok(labels)
265 }
266}
267
// Process-wide cached model. `Some` = loaded; `None` records a failed load
// so the expensive initialization is never retried within this process.
static NER_MODEL: OnceLock<Option<BertNerModel>> = OnceLock::new();
269
270fn get_or_init_model(paths: &AppPaths) -> Option<&'static BertNerModel> {
271 NER_MODEL
272 .get_or_init(|| match load_model(paths) {
273 Ok(m) => Some(m),
274 Err(e) => {
275 tracing::warn!("NER model não disponível (graceful degradation): {e:#}");
276 None
277 }
278 })
279 .as_ref()
280}
281
282fn model_dir(paths: &AppPaths) -> PathBuf {
283 paths.models.join("bert-multilingual-ner")
284}
285
/// Ensures the model files exist in the local cache, downloading any
/// missing ones from the HF Hub (first run only, ~676 MB total).
///
/// Returns the cache directory on success. Individual files already
/// present are not re-downloaded.
fn ensure_model_files(paths: &AppPaths) -> Result<PathBuf> {
    let dir = model_dir(paths);
    std::fs::create_dir_all(&dir)
        .with_context(|| format!("criando diretório do modelo: {dir:?}"))?;

    let weights = dir.join("model.safetensors");
    let config = dir.join("config.json");
    let tokenizer = dir.join("tokenizer.json");

    // Fast path: the three files actually required for inference exist.
    if weights.exists() && config.exists() && tokenizer.exists() {
        return Ok(dir);
    }

    tracing::info!("Baixando modelo NER (primeira execução, ~676 MB)...");
    crate::output::emit_progress_i18n(
        "Downloading NER model (first run, ~676 MB)...",
        "Baixando modelo NER (primeira execução, ~676 MB)...",
    );

    let api = huggingface_hub::api::sync::Api::new().context("criando cliente HF Hub")?;
    let repo = api.model(MODEL_ID.to_string());

    // (remote path in repo, local filename). NOTE(review): tokenizer.json
    // is fetched from the repo's "onnx/" subfolder — presumably the root
    // has no tokenizer.json for this model; confirm if the repo layout
    // changes upstream.
    for (remote, local) in &[
        ("model.safetensors", "model.safetensors"),
        ("config.json", "config.json"),
        ("onnx/tokenizer.json", "tokenizer.json"),
        ("tokenizer_config.json", "tokenizer_config.json"),
    ] {
        let dest = dir.join(local);
        if !dest.exists() {
            // Downloads into the HF Hub cache, then copies into our dir.
            let src = repo
                .get(remote)
                .with_context(|| format!("baixando {remote} do HF Hub"))?;
            std::fs::copy(&src, &dest).with_context(|| format!("copiando {local} para cache"))?;
        }
    }

    Ok(dir)
}
328
329fn load_model(paths: &AppPaths) -> Result<BertNerModel> {
330 let dir = ensure_model_files(paths)?;
331 BertNerModel::load(&dir)
332}
333
334fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
335 let mut entities = Vec::new();
336 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
337
338 let add = |entities: &mut Vec<ExtractedEntity>,
339 seen: &mut std::collections::HashSet<String>,
340 name: &str,
341 entity_type: &str| {
342 let name = name.trim().to_string();
343 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
344 entities.push(ExtractedEntity {
345 name,
346 entity_type: entity_type.to_string(),
347 });
348 }
349 };
350
351 for m in regex_email().find_iter(body) {
352 add(&mut entities, &mut seen, m.as_str(), "concept");
354 }
355 for m in regex_url().find_iter(body) {
356 let raw = m.as_str();
359 let cleaned = raw
360 .trim_end_matches('`')
361 .trim_end_matches(',')
362 .trim_end_matches('.')
363 .trim_end_matches(';')
364 .trim_end_matches(')')
365 .trim_end_matches(']')
366 .trim_end_matches('}');
367 add(&mut entities, &mut seen, cleaned, "concept");
368 }
369 for m in regex_uuid().find_iter(body) {
370 add(&mut entities, &mut seen, m.as_str(), "concept");
371 }
372 for m in regex_all_caps().find_iter(body) {
373 let candidate = m.as_str();
374 if !is_filtered_all_caps(candidate) {
376 add(&mut entities, &mut seen, candidate, "concept");
377 }
378 }
379
380 entities
381}
382
/// Decodes IOB-tagged tokens ("B-PER", "I-ORG", "O", ...) into entities,
/// merging WordPiece subwords ("##xx") back onto the preceding token.
///
/// Type mapping: PER -> "person"; ORG -> "tool" when the token hints at a
/// library/sdk/cli/crate/npm, otherwise "project"; LOC -> "concept";
/// DATE spans are dropped entirely; any other tag passes through as-is.
fn iob_to_entities(tokens: &[String], labels: &[String]) -> Vec<ExtractedEntity> {
    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut current_parts: Vec<String> = Vec::new();
    let mut current_type: Option<String> = None;

    // Finishes the in-progress entity: joins its parts with spaces, drops
    // too-short names, and drops single ALL-CAPS words that are stopwords.
    let flush =
        |parts: &mut Vec<String>, typ: &mut Option<String>, entities: &mut Vec<ExtractedEntity>| {
            if let Some(t) = typ.take() {
                let name = parts.join(" ").trim().to_string();
                // A one-word fully-uppercase name goes through the same
                // stopword filter used by the regex prefilter.
                let is_single_caps = !name.contains(' ')
                    && name == name.to_uppercase()
                    && name.len() >= MIN_ENTITY_CHARS;
                let should_skip = is_single_caps && is_filtered_all_caps(&name);
                if name.len() >= MIN_ENTITY_CHARS && !should_skip {
                    entities.push(ExtractedEntity {
                        name,
                        entity_type: t,
                    });
                }
                parts.clear();
            }
        };

    for (token, label) in tokens.iter().zip(labels.iter()) {
        // "O" (outside) ends any entity in progress.
        if label == "O" {
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        }

        let (prefix, bio_type) = if let Some(rest) = label.strip_prefix("B-") {
            ("B", rest)
        } else if let Some(rest) = label.strip_prefix("I-") {
            ("I", rest)
        } else {
            // Malformed label: treat like "O".
            flush(&mut current_parts, &mut current_type, &mut entities);
            continue;
        };

        let entity_type = match bio_type {
            "DATE" => {
                // Dates are intentionally not stored as entities.
                flush(&mut current_parts, &mut current_type, &mut entities);
                continue;
            }
            "PER" => "person",
            "ORG" => {
                // Heuristic: ORG tokens that look like software artifacts
                // become "tool" instead of "project".
                let t = token.to_lowercase();
                if t.contains("lib")
                    || t.contains("sdk")
                    || t.contains("cli")
                    || t.contains("crate")
                    || t.contains("npm")
                {
                    "tool"
                } else {
                    "project"
                }
            }
            "LOC" => "concept",
            // NOTE(review): unknown tags (e.g. "MISC") pass through with
            // their raw uppercase tag as entity_type — confirm downstream
            // accepts types outside the person/project/tool/concept set.
            other => other,
        };

        if prefix == "B" {
            // A B-tag on a "##" subword is a tokenizer artifact: glue it to
            // the previous part (if any) instead of starting a new entity;
            // with no part in progress it is silently discarded.
            if token.starts_with("##") {
                let clean = token.strip_prefix("##").unwrap_or(token.as_str());
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
                continue;
            }
            flush(&mut current_parts, &mut current_type, &mut entities);
            current_parts.push(token.clone());
            current_type = Some(entity_type.to_string());
        } else if prefix == "I" && current_type.is_some() {
            // Continuation: subwords concatenate onto the last part; whole
            // tokens become a new space-separated part.
            let clean = token.strip_prefix("##").unwrap_or(token.as_str());
            if token.starts_with("##") {
                if let Some(last) = current_parts.last_mut() {
                    last.push_str(clean);
                }
            } else {
                current_parts.push(clean.to_string());
            }
        }
    }

    // Flush whatever entity was still open at end of input.
    flush(&mut current_parts, &mut current_type, &mut entities);
    entities
}
475
476fn build_relationships(entities: &[NewEntity]) -> Vec<NewRelationship> {
477 if entities.len() < 2 {
478 return Vec::new();
479 }
480
481 let max_rels = crate::constants::max_relationships_per_memory();
484 let n = entities.len().min(MAX_ENTS);
485 let mut rels: Vec<NewRelationship> = Vec::new();
486 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
487
488 let mut hit_cap = false;
489 'outer: for i in 0..n {
490 if rels.len() >= max_rels {
491 hit_cap = true;
492 break;
493 }
494
495 let mut for_entity = 0usize;
496 for j in (i + 1)..n {
497 if for_entity >= TOP_K_RELATIONS {
498 break;
499 }
500 if rels.len() >= max_rels {
501 hit_cap = true;
502 break 'outer;
503 }
504
505 let src = &entities[i].name;
506 let tgt = &entities[j].name;
507 let key = (src.clone(), tgt.clone());
508
509 if seen.contains(&key) {
510 continue;
511 }
512 seen.insert(key);
513
514 rels.push(NewRelationship {
515 source: src.clone(),
516 target: tgt.clone(),
517 relation: DEFAULT_RELATION.to_string(),
518 strength: 0.5,
519 description: None,
520 });
521 for_entity += 1;
522 }
523 }
524
525 if hit_cap {
527 tracing::warn!(
528 "relacionamentos truncados em {max_rels} (com {n} entidades, máx teórico era ~{}× combinações)",
529 n.saturating_sub(1)
530 );
531 }
532
533 rels
534}
535
/// Runs NER over the whole text using overlapping sliding windows.
///
/// The text is tokenized once; windows of up to `MAX_SEQ_LEN` tokens are
/// classified independently, advancing `STRIDE` tokens at a time, so
/// consecutive windows overlap by `MAX_SEQ_LEN - STRIDE`. Entities are
/// deduplicated by exact name across windows; a failed window is logged
/// and skipped rather than aborting the whole pass.
fn run_ner_sliding_window(
    model: &BertNerModel,
    body: &str,
    paths: &AppPaths,
) -> Result<Vec<ExtractedEntity>> {
    let tokenizer_path = model_dir(paths).join("tokenizer.json");
    let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
        .map_err(|e| anyhow::anyhow!("carregando tokenizer NER: {e}"))?;

    // `false` = no special tokens ([CLS]/[SEP]); windows are raw slices of
    // the token stream.
    let encoding = tokenizer
        .encode(body, false)
        .map_err(|e| anyhow::anyhow!("encoding NER: {e}"))?;

    let all_ids: Vec<u32> = encoding.get_ids().to_vec();
    let all_tokens: Vec<String> = encoding
        .get_tokens()
        .iter()
        .map(|s| s.to_string())
        .collect();

    if all_ids.is_empty() {
        return Ok(Vec::new());
    }

    let mut entities: Vec<ExtractedEntity> = Vec::new();
    let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();

    let mut start = 0usize;
    loop {
        let end = (start + MAX_SEQ_LEN).min(all_ids.len());
        let window_ids = &all_ids[start..end];
        let window_tokens = &all_tokens[start..end];
        // Every window position is a real token, so the mask is all ones.
        let attention_mask: Vec<u32> = vec![1u32; window_ids.len()];

        match model.predict(window_ids, &attention_mask) {
            Ok(labels) => {
                let window_ents = iob_to_entities(window_tokens, &labels);
                for ent in window_ents {
                    // First window to produce a given name wins.
                    if seen.insert(ent.name.clone()) {
                        entities.push(ent);
                    }
                }
            }
            Err(e) => {
                // Best-effort: skip the failing window, keep the rest.
                tracing::warn!("janela NER falhou (start={start}): {e:#}");
            }
        }

        if end >= all_ids.len() {
            break;
        }
        start += STRIDE;
    }

    Ok(entities)
}
592
593fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
598 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
599 let suffix_re = SUFFIX_RE.get_or_init(|| Regex::new(r"^([\-\s]+\d+(?:\.\d+)?)").unwrap());
600
601 entities
602 .into_iter()
603 .map(|ent| {
604 if let Some(pos) = body.find(&ent.name) {
606 let after_pos = pos + ent.name.len();
607 if after_pos < body.len() {
608 let after = &body[after_pos..];
609 if let Some(m) = suffix_re.find(after) {
610 let suffix = m.as_str();
611 if suffix.len() <= 6 {
613 let extended = format!("{}{}", ent.name, suffix);
614 return ExtractedEntity {
615 name: extended,
616 entity_type: ent.entity_type,
617 };
618 }
619 }
620 }
621 }
622 ent
623 })
624 .collect()
625}
626
627fn merge_and_deduplicate(
628 regex_ents: Vec<ExtractedEntity>,
629 ner_ents: Vec<ExtractedEntity>,
630) -> Vec<ExtractedEntity> {
631 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
632 let mut result: Vec<ExtractedEntity> = Vec::new();
633 let mut truncated = false;
634
635 let total_input = regex_ents.len() + ner_ents.len();
636 for ent in regex_ents.into_iter().chain(ner_ents) {
637 let key = ent.name.to_lowercase();
638 if seen.insert(key) {
639 result.push(ent);
640 }
641 if result.len() >= MAX_ENTS {
642 truncated = true;
643 break;
644 }
645 }
646
647 if truncated {
649 tracing::warn!(
650 "extração truncada em {MAX_ENTS} entidades (entrada tinha {total_input} candidatos antes da deduplicação)"
651 );
652 }
653
654 result
655}
656
657fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
658 extracted
659 .into_iter()
660 .map(|e| NewEntity {
661 name: e.name,
662 entity_type: e.entity_type,
663 description: None,
664 })
665 .collect()
666}
667
668pub fn extract_graph_auto(body: &str, paths: &AppPaths) -> Result<ExtractionResult> {
669 let regex_entities = apply_regex_prefilter(body);
670
671 let mut bert_used = false;
672 let ner_entities = match get_or_init_model(paths) {
673 Some(model) => match run_ner_sliding_window(model, body, paths) {
674 Ok(ents) => {
675 bert_used = true;
676 ents
677 }
678 Err(e) => {
679 tracing::warn!("NER falhou, usando apenas regex: {e:#}");
680 Vec::new()
681 }
682 },
683 None => Vec::new(),
684 };
685
686 let merged = merge_and_deduplicate(regex_entities, ner_entities);
687 let extended = extend_with_numeric_suffix(merged, body);
689 let entities = to_new_entities(extended);
690 let relationships = build_relationships(&entities);
691
692 let extraction_method = if bert_used {
693 "bert+regex".to_string()
694 } else {
695 "regex-only".to_string()
696 };
697
698 Ok(ExtractionResult {
699 entities,
700 relationships,
701 extraction_method,
702 })
703}
704
705pub struct RegexExtractor;
706
707impl Extractor for RegexExtractor {
708 fn extract(&self, body: &str) -> Result<ExtractionResult> {
709 let regex_entities = apply_regex_prefilter(body);
710 let entities = to_new_entities(regex_entities);
711 let relationships = build_relationships(&entities);
712 Ok(ExtractionResult {
713 entities,
714 relationships,
715 extraction_method: "regex-only".to_string(),
716 })
717 }
718}
719
#[cfg(test)]
mod tests {
    use super::*;

    // Builds AppPaths pointing at throwaway /tmp locations. Nothing is
    // created here; regex-only tests never touch disk.
    fn make_paths() -> AppPaths {
        use std::path::PathBuf;
        AppPaths {
            db: PathBuf::from("/tmp/test.sqlite"),
            models: PathBuf::from("/tmp/test_models"),
        }
    }

    #[test]
    fn regex_email_captura_endereco() {
        let ents = apply_regex_prefilter("contato: fulano@empresa.com.br para mais info");
        assert!(ents
            .iter()
            .any(|e| e.name == "fulano@empresa.com.br" && e.entity_type == "concept"));
    }

    // Portuguese "rule words" in caps must not become entities.
    #[test]
    fn regex_all_caps_filtra_palavra_regra_pt() {
        let ents = apply_regex_prefilter("NUNCA fazer isso. PROIBIDO usar X. DEVE seguir Y.");
        assert!(
            !ents.iter().any(|e| e.name == "NUNCA"),
            "NUNCA deveria ser filtrado como stopword"
        );
        assert!(
            !ents.iter().any(|e| e.name == "PROIBIDO"),
            "PROIBIDO deveria ser filtrado"
        );
        assert!(
            !ents.iter().any(|e| e.name == "DEVE"),
            "DEVE deveria ser filtrado"
        );
    }

    // Underscore identifiers bypass the stopword filter entirely.
    #[test]
    fn regex_all_caps_aceita_constante_com_underscore() {
        let ents = apply_regex_prefilter("configure MAX_RETRY=3 e API_TIMEOUT=30");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
    }

    // Domain acronyms not in the stopword list are kept.
    #[test]
    fn regex_all_caps_aceita_acronimo_dominio() {
        let ents = apply_regex_prefilter("OPENAI lançou GPT-5 com NVIDIA H100");
        assert!(ents.iter().any(|e| e.name == "OPENAI"));
        assert!(ents.iter().any(|e| e.name == "NVIDIA"));
    }

    #[test]
    fn regex_url_captura_link() {
        let ents = apply_regex_prefilter("veja https://docs.rs/crate para detalhes");
        assert!(ents
            .iter()
            .any(|e| e.name.starts_with("https://") && e.entity_type == "concept"));
    }

    #[test]
    fn regex_uuid_captura_identificador() {
        let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
        assert!(ents.iter().any(|e| e.entity_type == "concept"));
    }

    #[test]
    fn regex_all_caps_captura_constante() {
        let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
        assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
        assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
    }

    // The \b[A-Z][A-Z0-9_]{2,}\b pattern requires at least 3 chars.
    #[test]
    fn regex_all_caps_ignora_palavras_curtas() {
        let ents = apply_regex_prefilter("use AI em seu projeto");
        assert!(
            !ents.iter().any(|e| e.name == "AI"),
            "AI tem apenas 2 chars, deve ser ignorado"
        );
    }

    #[test]
    fn iob_decodifica_per_para_person() {
        let tokens = vec![
            "John".to_string(),
            "Doe".to_string(),
            "trabalhou".to_string(),
        ];
        let labels = vec!["B-PER".to_string(), "I-PER".to_string(), "O".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents.len(), 1);
        assert_eq!(ents[0].entity_type, "person");
        assert!(ents[0].name.contains("John"));
    }

    // A B-tag on a "##" subword should merge into the previous part.
    #[test]
    fn iob_strip_subword_b_prefix() {
        let tokens = vec!["Open".to_string(), "##AI".to_string()];
        let labels = vec!["B-ORG".to_string(), "B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.iter().any(|e| e.name == "OpenAI" || e.name == "Open"),
            "deveria mergear ##AI ou descartar"
        );
    }

    // A "##" subword with no entity in progress is silently dropped.
    #[test]
    fn iob_subword_orphan_descarta() {
        let tokens = vec!["##AI".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(
            ents.is_empty(),
            "subword órfão sem entidade ativa deve ser descartado"
        );
    }

    #[test]
    fn iob_descarta_date() {
        let tokens = vec!["Janeiro".to_string(), "2024".to_string()];
        let labels = vec!["B-DATE".to_string(), "I-DATE".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert!(ents.is_empty(), "DATE deve ser descartado");
    }

    #[test]
    fn iob_mapeia_org_para_project() {
        let tokens = vec!["Empresa".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "project");
    }

    // ORG tokens containing "sdk"/"lib"/"cli"/"crate"/"npm" become tools.
    #[test]
    fn iob_mapeia_org_sdk_para_tool() {
        let tokens = vec!["tokio-sdk".to_string()];
        let labels = vec!["B-ORG".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "tool");
    }

    #[test]
    fn iob_mapeia_loc_para_concept() {
        let tokens = vec!["Brasil".to_string()];
        let labels = vec!["B-LOC".to_string()];
        let ents = iob_to_entities(&tokens, &labels);
        assert_eq!(ents[0].entity_type, "concept");
    }

    #[test]
    fn build_relationships_respeitam_max_rels() {
        let entities: Vec<NewEntity> = (0..20)
            .map(|i| NewEntity {
                name: format!("entidade_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let max_rels = crate::constants::max_relationships_per_memory();
        assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
    }

    #[test]
    fn build_relationships_sem_duplicatas() {
        let entities: Vec<NewEntity> = (0..5)
            .map(|i| NewEntity {
                name: format!("ent_{i}"),
                entity_type: "concept".to_string(),
                description: None,
            })
            .collect();
        let rels = build_relationships(&entities);
        let mut pares: std::collections::HashSet<(String, String)> =
            std::collections::HashSet::new();
        for r in &rels {
            let par = (r.source.clone(), r.target.clone());
            assert!(pares.insert(par), "par duplicado encontrado");
        }
    }

    // Dedup key is lowercase, so "Rust" and "rust" collapse to one entity
    // (the first one seen wins).
    #[test]
    fn merge_deduplica_por_nome_lowercase() {
        let a = vec![ExtractedEntity {
            name: "Rust".to_string(),
            entity_type: "concept".to_string(),
        }];
        let b = vec![ExtractedEntity {
            name: "rust".to_string(),
            entity_type: "tool".to_string(),
        }];
        let merged = merge_and_deduplicate(a, b);
        assert_eq!(merged.len(), 1, "rust e Rust são a mesma entidade");
    }

    #[test]
    fn regex_extractor_implementa_trait() {
        let extractor = RegexExtractor;
        let result = extractor
            .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
            .unwrap();
        assert!(!result.entities.is_empty());
    }

    // NOTE(review): this goes through get_or_init_model, which on a cache
    // miss attempts a real HF Hub download (~676 MB) before degrading —
    // consider gating network access in unit tests.
    #[test]
    fn extract_retorna_ok_sem_modelo() {
        let paths = make_paths();
        let body = "contato: teste@exemplo.com com MAX_RETRY=3";
        let result = extract_graph_auto(body, &paths).unwrap();
        assert!(result
            .entities
            .iter()
            .any(|e| e.name.contains("teste@exemplo.com")));
    }
}