1use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use anyhow::{Context, Result};
10use ort::session::{builder::GraphOptimizationLevel, Session};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use unicode_normalization::UnicodeNormalization;
14
15use crate::entity_type::EntityType;
16use crate::paths::AppPaths;
17use crate::storage::entities::{NewEntity, NewRelationship};
18
/// Upper bound on the number of entities kept by a single extraction pass.
const MAX_ENTS: usize = 30;
/// Maximum number of relationships emitted per source entity by the
/// test-only `build_relationships` helper.
#[cfg(test)]
const TOP_K_RELATIONS: usize = 5;
/// Relation label assigned to every relationship generated in this module.
const DEFAULT_RELATION: &str = "mentions";
/// Minimum accepted entity name length. Callers compare it against
/// `str::len()`, i.e. a byte count rather than a character count.
const MIN_ENTITY_CHARS: usize = 2;
26
// Lazily-initialised compiled patterns. Each cell is filled on first use by
// the `regex_*` accessor of the same name and then shared for the rest of
// the process.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35
/// Upper-case words that must not become entities.
///
/// The table is consulted by exact, case-sensitive match for all-caps
/// candidates, and against the upper-cased form of CamelCase brand
/// candidates. It mixes Portuguese and English rule words, status words and
/// common technical acronyms.
///
/// NOTE(review): the all-caps pattern in this file only matches ASCII
/// `[A-Z0-9_]`, so the accented entries are presumably kept for other
/// lookups or for completeness — confirm before pruning.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACID",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADAPTER",
    "ADICIONADA",
    "ADICIONADAS",
    "ADICIONADO",
    "ADICIONADOS",
    "ADICIONAR",
    "AGENTS",
    "AINDA",
    "ALL",
    "ALTA",
    "ALWAYS",
    "APENAS",
    "API",
    "ARTEFATOS",
    "ATIVA",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CEO",
    "CHECKLIST",
    "CLARO",
    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRMARAM",
    "CONFIRME",
    "CONFIRMEI",
    "CONFIRMOU",
    "CONTRATO",
    "CRIE",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DDL",
    "DEFAULT",
    "DEFINIR",
    "DEPARTMENT",
    "DESC",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "DSL",
    "DTO",
    "EFEITO",
    "ENTRADA",
    "EOF",
    "EPERM",
    "ERROR",
    "ESCREVA",
    "ESCRITA",
    "ESRCH",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTADO",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXEMPLO",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FATO",
    "FIFO",
    "FIXED",
    "FIXME",
    "FLUXO",
    "FONTES",
    "FORBIDDEN",
    "FUNCIONA",
    "GNU",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LEITURA",
    "LLM",
    "MCP",
    "MESMO",
    "METADADOS",
    "MUST",
    "NDJSON",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATORIA",
    "OBRIGATÓRIO",
    "OBSERVEI",
    "PADRÃO",
    "PASSIVA",
    "PASSO",
    "PENDING",
    "PGID",
    "PID",
    "PLAN",
    "PODEMOS",
    "PONTEIROS",
    "PREFERIR",
    "PROIBIDO",
    "PROJETO",
    "RECUSE",
    "REGRA",
    "REGRAS",
    "REMOVIDAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SIGTERM",
    "SOMENTE",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "TUI",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
204
/// HTTP method names; all-caps candidates equal to one of these are rejected
/// as entities.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
210
211fn is_filtered_all_caps(token: &str) -> bool {
212 let is_identifier = token.contains('_');
214 if is_identifier {
215 return false;
216 }
217 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
218}
219
220fn regex_email() -> &'static Regex {
221 REGEX_EMAIL.get_or_init(|| {
223 Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224 .expect("compile-time validated email regex literal")
225 })
226}
227
228fn regex_url() -> &'static Regex {
229 REGEX_URL.get_or_init(|| {
231 Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232 .expect("compile-time validated URL regex literal")
233 })
234}
235
236fn regex_uuid() -> &'static Regex {
237 REGEX_UUID.get_or_init(|| {
239 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240 .expect("compile-time validated UUID regex literal")
241 })
242}
243
244fn regex_all_caps() -> &'static Regex {
245 REGEX_ALL_CAPS.get_or_init(|| {
246 Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247 .expect("compile-time validated all-caps regex literal")
248 })
249}
250
251fn regex_section_marker() -> &'static Regex {
252 REGEX_SECTION_MARKER.get_or_init(|| {
253 Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260 .expect("compile-time validated section marker regex literal")
261 })
262}
263
264fn regex_brand_camel() -> &'static Regex {
265 REGEX_BRAND_CAMEL.get_or_init(|| {
266 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269 .expect("compile-time validated CamelCase brand regex literal")
270 })
271}
272
/// An entity candidate produced by the regex prefilter or by GLiNER, before
/// it is converted into a storage-level `NewEntity`.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Entity text.
    pub name: String,
    /// Category assigned by the extractor that produced the candidate.
    pub entity_type: EntityType,
}
278
/// A URL found in a body of text.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// The URL with trailing punctuation removed.
    pub url: String,
    /// Start position of the URL match within the scanned body, as reported
    /// by the regex match.
    pub offset: usize,
}
286
/// Everything extracted from one body of text.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Entities ready to be stored.
    pub entities: Vec<NewEntity>,
    /// Relationships derived between the entities.
    pub relationships: Vec<NewRelationship>,
    /// `true` when relationship generation stopped at the configured cap.
    pub relationships_truncated: bool,
    /// Label of the pipeline that ran: `"regex-only"` or
    /// `"gliner-<variant>+regex"`.
    pub extraction_method: String,
    /// URLs found in the body; reported separately from `entities`.
    pub urls: Vec<ExtractedUrl>,
}
300
/// Strategy interface for turning a text body into an `ExtractionResult`.
/// Implementations must be shareable across threads (`Send + Sync`).
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships and URLs from `body`.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
304
/// Selects which ONNX build of the GLiNER model is used; see
/// `as_filename` for the weights file each variant maps to.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GlinerVariant {
    /// Maps to `model.onnx`.
    Fp32,
    /// Maps to `model_fp16.onnx`.
    Fp16,
    /// Maps to `model_quantized.onnx`.
    Int8,
    /// Maps to `model_q4.onnx`.
    Q4,
    /// Maps to `model_q4f16.onnx`.
    Q4f16,
}
314
315impl GlinerVariant {
316 pub fn as_filename(self) -> &'static str {
318 match self {
319 Self::Fp32 => "model.onnx",
320 Self::Fp16 => "model_fp16.onnx",
321 Self::Int8 => "model_quantized.onnx",
322 Self::Q4 => "model_q4.onnx",
323 Self::Q4f16 => "model_q4f16.onnx",
324 }
325 }
326
327 pub fn display_size(self) -> &'static str {
329 match self {
330 Self::Fp32 => "1.1 GB",
331 Self::Fp16 => "580 MB",
332 Self::Int8 => "349 MB",
333 Self::Q4 => "894 MB",
334 Self::Q4f16 => "472 MB",
335 }
336 }
337}
338
339impl std::fmt::Display for GlinerVariant {
340 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341 match self {
342 Self::Fp32 => f.write_str("fp32"),
343 Self::Fp16 => f.write_str("fp16"),
344 Self::Int8 => f.write_str("int8"),
345 Self::Q4 => f.write_str("q4"),
346 Self::Q4f16 => f.write_str("q4f16"),
347 }
348 }
349}
350
351impl std::str::FromStr for GlinerVariant {
352 type Err = anyhow::Error;
353 fn from_str(s: &str) -> Result<Self> {
354 match s.to_lowercase().as_str() {
355 "fp32" => Ok(Self::Fp32),
356 "fp16" => Ok(Self::Fp16),
357 "int8" => Ok(Self::Int8),
358 "q4" => Ok(Self::Q4),
359 "q4f16" => Ok(Self::Q4f16),
360 other => {
361 anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362 }
363 }
364 }
365}
366
/// Number of span slots generated per start word, i.e. the longest span
/// (in words) offered to the model.
const GLINER_MAX_WIDTH: usize = 12;
/// Budget used to cap the number of input words (label tokens and two
/// extra positions are subtracted from it before truncating).
const GLINER_MAX_SEQ_LEN: usize = 384;
/// Prompt marker placed before every entity label.
const GLINER_ENT_TOKEN: &str = "<<ENT>>";
/// Prompt marker placed between the labels and the text words.
const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371
/// Label prompts handed to GLiNER, paired with the `EntityType` assigned to
/// spans predicted for that label. The position in this table is the class
/// index used when decoding the model output.
const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
    ("person", EntityType::Person),
    ("organization", EntityType::Organization),
    ("location", EntityType::Location),
    ("date", EntityType::Date),
    ("project", EntityType::Project),
    ("tool", EntityType::Tool),
    ("file", EntityType::File),
    ("concept", EntityType::Concept),
    ("decision", EntityType::Decision),
    ("incident", EntityType::Incident),
    ("dashboard", EntityType::Dashboard),
    ("issue tracker", EntityType::IssueTracker),
    ("memory", EntityType::Memory),
];
387
/// A loaded GLiNER model: ONNX session plus its tokenizer.
struct GlinerModel {
    /// Inference session; locked for the duration of each forward pass.
    session: parking_lot::Mutex<Session>,
    /// Tokenizer loaded from `tokenizer.json` next to the model file.
    tokenizer: tokenizers::Tokenizer,
    /// Variant this model was loaded as; stored but not read in this file.
    #[allow(dead_code)]
    variant: GlinerVariant,
}
394
impl GlinerModel {
    /// Loads the ONNX session and tokenizer for `variant` from `model_dir`.
    ///
    /// `model_dir` must contain the file named by `variant.as_filename()` and
    /// a `tokenizer.json`.
    ///
    /// # Errors
    /// Fails when the session builder cannot be created or configured, when
    /// the model file cannot be loaded, or when the tokenizer cannot be read.
    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
        let model_path = model_dir.join(variant.as_filename());
        let tokenizer_path = model_dir.join("tokenizer.json");

        let session = Session::builder()
            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
            .with_optimization_level(GraphOptimizationLevel::Level3)
            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
            .commit_from_file(&model_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;

        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;

        Ok(Self {
            session: parking_lot::Mutex::new(session),
            tokenizer,
            variant,
        })
    }

    /// Runs GLiNER over `body` and returns the highest-scoring,
    /// non-overlapping spans as entities.
    ///
    /// * `body` is split on whitespace into words; words beyond the
    ///   sequence budget are dropped with a warning.
    /// * `entity_labels` supplies the label prompts and the `EntityType`
    ///   assigned to a span predicted for each label.
    /// * `threshold` is the minimum sigmoid score a span must reach.
    ///
    /// At most `MAX_ENTS` entities are returned; an empty body yields an
    /// empty vector.
    ///
    /// # Errors
    /// Fails on buffer reservation failure, tokenizer errors, tensor
    /// construction errors, inference errors, or when the `logits` output
    /// cannot be read as `f32`.
    fn predict(
        &self,
        body: &str,
        entity_labels: &[(&str, EntityType)],
        threshold: f32,
    ) -> Result<Vec<ExtractedEntity>> {
        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
        let words: Vec<&str> = body.split_whitespace().collect();
        if words.is_empty() {
            return Ok(Vec::new());
        }

        // Each label contributes two prompt tokens (marker + label) and the
        // separator adds one; two more positions are reserved on top.
        let label_token_count = label_names.len() * 2 + 1;
        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
        let words = if words.len() > max_words {
            tracing::warn!(target: "extraction",
                original_words = words.len(),
                capped_words = max_words,
                "GLiNER input truncated to fit model sequence length"
            );
            &words[..max_words]
        } else {
            &words[..]
        };
        let num_words = words.len();

        // Prompt layout: (<<ENT>>, label)* <<SEP>> word*
        let prompt_cap = label_names.len() * 2 + 1 + num_words;
        let mut prompt_tokens: Vec<String> = Vec::new();
        prompt_tokens.try_reserve(prompt_cap).map_err(|_| {
            anyhow::anyhow!(
                "allocation of {prompt_cap} prompt tokens would exceed available memory"
            )
        })?;
        for label in &label_names {
            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
            prompt_tokens.push((*label).to_string());
        }
        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
        for word in words {
            prompt_tokens.push((*word).to_string());
        }

        // Reserve for roughly three sub-tokens per prompt token; the vectors
        // still grow normally if a token needs more.
        let seq_estimate = prompt_tokens.len() * 3;
        let mut all_ids: Vec<i64> = Vec::new();
        all_ids.try_reserve(seq_estimate).map_err(|_| {
            anyhow::anyhow!("allocation of {seq_estimate} token IDs would exceed available memory")
        })?;
        let mut all_attention: Vec<i64> = Vec::new();
        all_attention.try_reserve(seq_estimate).map_err(|_| {
            anyhow::anyhow!(
                "allocation of {seq_estimate} attention masks would exceed available memory"
            )
        })?;
        let mut all_word_mask: Vec<i64> = Vec::new();
        all_word_mask.try_reserve(seq_estimate).map_err(|_| {
            anyhow::anyhow!("allocation of {seq_estimate} word masks would exceed available memory")
        })?;

        // Hard-coded leading id 1 — presumably the tokenizer's
        // start-of-sequence id; TODO confirm against tokenizer.json.
        all_ids.push(1);
        all_attention.push(1);
        all_word_mask.push(0);

        // Prompt positions at or after `text_offset` are text words.
        let text_offset = label_names.len() * 2 + 1;
        let mut word_id: i64 = 0;

        for (pos, token_str) in prompt_tokens.iter().enumerate() {
            let encoding = self
                .tokenizer
                .encode(token_str.as_str(), false)
                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
            let ids = encoding.get_ids();
            let is_text_token = pos >= text_offset;

            for (sub_idx, &id) in ids.iter().enumerate() {
                all_ids.push(id as i64);
                all_attention.push(1);
                // Only the first sub-token of a text word carries the 1-based
                // word index; label tokens and continuation pieces carry 0.
                if is_text_token && sub_idx == 0 {
                    word_id += 1;
                    all_word_mask.push(word_id);
                } else {
                    all_word_mask.push(0);
                }
            }
        }

        // Hard-coded trailing id 2 — presumably the end-of-sequence id;
        // TODO confirm against tokenizer.json.
        all_ids.push(2);
        all_attention.push(1);
        all_word_mask.push(0);

        let seq_len = all_ids.len();

        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
        let t_words_mask =
            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
        let t_text_lengths =
            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;

        // One slot per (start word, width). Slot `start * GLINER_MAX_WIDTH +
        // width` covers words `start..=start + width`; slots that would run
        // past the last word stay zeroed and masked out.
        let num_spans = num_words * GLINER_MAX_WIDTH;
        let mut span_idx_data = vec![0i64; num_spans * 2];
        let mut span_mask_data = vec![false; num_spans];

        for start in 0..num_words {
            let remaining = num_words - start;
            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
            for width in 0..actual_max_width {
                let dim = start * GLINER_MAX_WIDTH + width;
                span_idx_data[dim * 2] = start as i64;
                span_idx_data[dim * 2 + 1] = (start + width) as i64;
                span_mask_data[dim] = true;
            }
        }

        let t_span_idx =
            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
        let t_span_mask =
            ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;

        // The session is locked from here until the guard is dropped at the
        // end of the function.
        let mut session_guard = self.session.lock();
        let outputs = session_guard
            .run(ort::inputs![
                "input_ids" => t_input_ids,
                "attention_mask" => t_attention,
                "words_mask" => t_words_mask,
                "text_lengths" => t_text_lengths,
                "span_idx" => t_span_idx,
                "span_mask" => t_span_mask
            ])
            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;

        let (logits_shape, logits_data) = outputs["logits"]
            .try_extract_tensor::<f32>()
            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;

        // The index arithmetic below treats the logits as a row-major
        // [_, words, width, classes] layout; dims 2 and 3 are read from the
        // reported shape with fallbacks. NOTE(review): layout is assumed from
        // this arithmetic — confirm against the exported model.
        let num_classes = label_names.len();
        let max_width = logits_shape
            .get(2)
            .copied()
            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;

        let candidates_cap = num_words * max_width;
        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();
        candidates.try_reserve(candidates_cap).map_err(|_| {
            anyhow::anyhow!(
                "allocation of {candidates_cap} candidates would exceed available memory"
            )
        })?;

        // Collect every (start, end, class, score) whose sigmoid score
        // reaches the threshold.
        for start in 0..num_words {
            for width in 0..max_width {
                let end = start + width;
                if end >= num_words {
                    break;
                }
                for class_idx in 0..nc.min(num_classes) {
                    let flat = start * (max_width * nc) + width * nc + class_idx;
                    // Guard against a logits buffer smaller than the shape implies.
                    if flat >= logits_data.len() {
                        break;
                    }
                    let raw = logits_data[flat];
                    let score = 1.0 / (1.0 + (-raw).exp());
                    if score >= threshold {
                        candidates.push((start, end, class_idx, score));
                    }
                }
            }
        }

        // Best score first; the sort is stable, so ties keep scan order.
        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));

        // Greedy selection: a span is taken only if none of its words is
        // already claimed by a better-scoring span.
        let mut used = vec![false; num_words];
        let mut entities: Vec<ExtractedEntity> = Vec::with_capacity(candidates.len().min(MAX_ENTS));

        for (start, end, class_idx, _score) in &candidates {
            let overlap = (*start..=*end).any(|i| used[i]);
            if overlap {
                continue;
            }
            for flag in used.iter_mut().take(*end + 1).skip(*start) {
                *flag = true;
            }
            let text = words[*start..=*end].join(" ");
            // NOTE(review): the words were already marked as used above, so a
            // span rejected here still blocks overlapping spans; the length
            // is also a byte count. Confirm both are intended.
            if text.len() < MIN_ENTITY_CHARS {
                continue;
            }
            let entity_type = entity_labels[*class_idx].1;
            entities.push(ExtractedEntity {
                name: text,
                entity_type,
            });
            if entities.len() >= MAX_ENTS {
                break;
            }
        }

        Ok(entities)
    }
}
637
/// Process-wide GLiNER model slot, filled once on first use. `None` records
/// that the first load attempt failed; the slot is never retried.
static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
639
640fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
641 paths.models.join(format!("gliner-multi-v2.1/{variant}"))
642}
643
644fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
645 let dir = gliner_model_dir(paths, variant);
646 std::fs::create_dir_all(&dir)
647 .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;
648
649 let model_file = dir.join(variant.as_filename());
650 let tokenizer_file = dir.join("tokenizer.json");
651
652 if model_file.exists() && tokenizer_file.exists() {
653 return Ok(dir);
654 }
655
656 let repo = crate::constants::gliner_model_repo();
657 tracing::info!(target: "extraction",
658 "Downloading GLiNER model ({variant}, ~{})...",
659 variant.display_size()
660 );
661 crate::output::emit_progress_i18n(
662 &format!(
663 "Downloading GLiNER model ({variant}, ~{})...",
664 variant.display_size()
665 ),
666 &format!(
667 "Baixando modelo GLiNER ({variant}, ~{})...",
668 variant.display_size()
669 ),
670 );
671
672 let api = huggingface_hub::api::sync::Api::new().with_context(|| "creating HF Hub client")?;
673 let hf_repo = api.model(repo);
674
675 let remote_model = format!("onnx/{}", variant.as_filename());
676 if !model_file.exists() {
677 let src = hf_repo
678 .get(&remote_model)
679 .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
680 std::fs::copy(&src, &model_file)
681 .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
682 }
683
684 if !tokenizer_file.exists() {
685 let src = hf_repo
686 .get("tokenizer.json")
687 .with_context(|| "downloading tokenizer.json from HF Hub")?;
688 std::fs::copy(&src, &tokenizer_file).with_context(|| "copying tokenizer.json to cache")?;
689 }
690
691 Ok(dir)
692}
693
694fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
695 let dir = ensure_gliner_model_files(paths, variant)?;
696 GlinerModel::load(&dir, variant)
697}
698
699fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
700 GLINER_MODEL
701 .get_or_init(|| match load_gliner_model(paths, variant) {
702 Ok(m) => Some(m),
703 Err(e) => {
704 tracing::warn!(target: "extraction", error = %e, "GLiNER model unavailable, graceful degradation");
705 None
706 }
707 })
708 .as_ref()
709}
710
711fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
712 let mut entities = Vec::with_capacity(16);
713 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(32);
714
715 let add = |entities: &mut Vec<ExtractedEntity>,
716 seen: &mut std::collections::HashSet<String>,
717 name: &str,
718 entity_type: EntityType| {
719 let name = name.trim().to_string();
720 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
721 entities.push(ExtractedEntity { name, entity_type });
722 }
723 };
724
725 let cleaned = regex_section_marker().replace_all(body, " ");
728 let cleaned = cleaned.as_ref();
729
730 for m in regex_email().find_iter(cleaned) {
731 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
733 }
734 for m in regex_uuid().find_iter(cleaned) {
735 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
736 }
737 for m in regex_all_caps().find_iter(cleaned) {
738 let candidate = m.as_str();
739 if !is_filtered_all_caps(candidate) {
741 add(&mut entities, &mut seen, candidate, EntityType::Concept);
742 }
743 }
744 for m in regex_brand_camel().find_iter(cleaned) {
747 let name = m.as_str();
748 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
750 add(&mut entities, &mut seen, name, EntityType::Organization);
751 }
752 }
753
754 entities
755}
756
757pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
761 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::with_capacity(8);
762 let mut result = Vec::with_capacity(4);
763 for m in regex_url().find_iter(body) {
764 let raw = m.as_str();
765 let cleaned = raw
766 .trim_end_matches('`')
767 .trim_end_matches(',')
768 .trim_end_matches('.')
769 .trim_end_matches(';')
770 .trim_end_matches(')')
771 .trim_end_matches(']')
772 .trim_end_matches('}');
773 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
774 result.push(ExtractedUrl {
775 url: cleaned.to_string(),
776 offset: m.start(),
777 });
778 }
779 }
780 result
781}
782
/// Test-only relationship builder that links entities purely by their order.
///
/// Only the first `MAX_ENTS` entities are considered. Each one is linked to
/// at most `TOP_K_RELATIONS` of the entities that follow it, until the
/// configured per-memory cap is reached. Returns the relationships and
/// whether the cap stopped generation.
#[cfg(test)]
fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
    if entities.len() < 2 {
        return (Vec::new(), false);
    }

    let max_rels = crate::constants::max_relationships_per_memory();
    let considered = &entities[..entities.len().min(MAX_ENTS)];
    let n = considered.len();
    let mut rels: Vec<NewRelationship> = Vec::with_capacity(n.min(max_rels));
    let mut hit_cap = false;

    // Every (source, later target) pair is visited at most once, so no
    // duplicate bookkeeping is required.
    'sources: for (source_idx, source) in considered.iter().enumerate() {
        if rels.len() >= max_rels {
            hit_cap = true;
            break;
        }

        for target in considered[source_idx + 1..].iter().take(TOP_K_RELATIONS) {
            if rels.len() >= max_rels {
                hit_cap = true;
                break 'sources;
            }

            rels.push(NewRelationship {
                source: source.name.clone(),
                target: target.name.clone(),
                relation: DEFAULT_RELATION.to_string(),
                strength: 0.5,
                description: None,
            });
        }
    }

    if hit_cap {
        tracing::warn!(target: "extraction",
            "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
            n.saturating_sub(1)
        );
    }

    (rels, hit_cap)
}
850
851fn build_relationships_by_sentence_cooccurrence(
862 body: &str,
863 entities: &[NewEntity],
864) -> (Vec<NewRelationship>, bool) {
865 if entities.len() < 2 {
866 return (Vec::new(), false);
867 }
868
869 let max_rels = crate::constants::max_relationships_per_memory();
870 let lower_names: Vec<(usize, String)> = entities
871 .iter()
872 .take(MAX_ENTS)
873 .enumerate()
874 .map(|(i, e)| (i, e.name.to_lowercase()))
875 .collect();
876
877 let mut rels: Vec<NewRelationship> = Vec::with_capacity(max_rels);
878 let mut seen: std::collections::HashSet<(usize, usize)> =
879 std::collections::HashSet::with_capacity(max_rels);
880 let mut hit_cap = false;
881
882 for sentence in body.split(['.', '!', '?', '\n']) {
883 if sentence.trim().is_empty() {
884 continue;
885 }
886 let lower_sentence = sentence.to_lowercase();
887 let present: Vec<usize> = lower_names
888 .iter()
889 .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
890 .map(|(i, _)| *i)
891 .collect();
892
893 if present.len() < 2 {
894 continue;
895 }
896
897 let n = present.len();
898 for i in 0..n {
899 for j in (i + 1)..n {
900 if rels.len() >= max_rels {
901 hit_cap = true;
902 tracing::warn!(target: "extraction",
903 "relationships truncated to {max_rels} during sentence-level pairing"
904 );
905 return (rels, hit_cap);
906 }
907 let ei = present[i];
908 let ej = present[j];
909 let key = (ei.min(ej), ei.max(ej));
910 if seen.insert(key) {
911 rels.push(NewRelationship {
912 source: entities[ei].name.clone(),
913 target: entities[ej].name.clone(),
914 relation: DEFAULT_RELATION.to_string(),
915 strength: 0.5,
916 description: None,
917 });
918 }
919 }
920 }
921 }
922
923 (rels, hit_cap)
924}
925
926fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
933 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
934 let suffix_re = SUFFIX_RE.get_or_init(|| {
937 Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
938 .expect("compile-time validated numeric suffix regex literal")
939 });
940
941 entities
942 .into_iter()
943 .map(|ent| {
944 if let Some(pos) = body.find(&ent.name) {
946 let after_pos = pos + ent.name.len();
947 if after_pos < body.len() {
948 let after = &body[after_pos..];
949 if let Some(m) = suffix_re.find(after) {
950 let suffix = m.as_str();
951 if suffix.len() <= 7 {
954 let mut extended = String::with_capacity(ent.name.len() + suffix.len());
955 extended.push_str(&ent.name);
956 extended.push_str(suffix);
957 return ExtractedEntity {
958 name: extended,
959 entity_type: ent.entity_type,
960 };
961 }
962 }
963 }
964 }
965 ent
966 })
967 .collect()
968}
969
970fn augment_versioned_model_names(
990 entities: Vec<ExtractedEntity>,
991 body: &str,
992) -> Vec<ExtractedEntity> {
993 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
994 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
1001 Regex::new(
1002 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
1003 )
1004 .expect("compile-time validated versioned model regex literal")
1005 });
1006
1007 let mut existing_lc: std::collections::HashSet<String> =
1008 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
1009 let mut result = entities;
1010
1011 for caps in model_re.captures_iter(body) {
1012 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
1013 if full_match.is_empty() || full_match.len() > 24 {
1016 continue;
1017 }
1018 let normalized_lc = full_match.to_lowercase();
1019 if existing_lc.contains(&normalized_lc) {
1020 continue;
1021 }
1022 if result.len() >= MAX_ENTS {
1025 break;
1026 }
1027 existing_lc.insert(normalized_lc);
1028 result.push(ExtractedEntity {
1029 name: full_match.to_string(),
1030 entity_type: EntityType::Concept,
1031 });
1032 }
1033
1034 result
1035}
1036
1037fn merge_and_deduplicate(
1038 regex_ents: Vec<ExtractedEntity>,
1039 ner_ents: Vec<ExtractedEntity>,
1040) -> Vec<ExtractedEntity> {
1041 let mut by_lc: std::collections::HashMap<String, usize> =
1056 std::collections::HashMap::with_capacity(regex_ents.len() + ner_ents.len());
1057 let mut result: Vec<ExtractedEntity> = Vec::with_capacity(MAX_ENTS);
1058 let mut truncated = false;
1059
1060 let total_input = regex_ents.len() + ner_ents.len();
1061 for ent in regex_ents.into_iter().chain(ner_ents) {
1062 let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
1063 let key = {
1067 let et = ent.entity_type.as_str();
1068 let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
1069 k.push_str(et);
1070 k.push('\0');
1071 k.push_str(&name_lc);
1072 k
1073 };
1074
1075 let type_prefix = {
1080 let et = ent.entity_type.as_str();
1081 let mut p = String::with_capacity(et.len() + 1);
1082 p.push_str(et);
1083 p.push('\0');
1084 p
1085 };
1086 let mut collision_idx: Option<usize> = None;
1087 for (existing_key, idx) in &by_lc {
1088 if !existing_key.starts_with(&type_prefix) {
1090 continue;
1091 }
1092 let existing_name_lc = &existing_key[type_prefix.len()..];
1093 if existing_name_lc == name_lc
1094 || existing_name_lc.contains(name_lc.as_str())
1095 || name_lc.contains(existing_name_lc)
1096 {
1097 collision_idx = Some(*idx);
1098 break;
1099 }
1100 }
1101 match collision_idx {
1102 Some(idx) => {
1103 if ent.name.len() > result[idx].name.len() {
1106 let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1107 let old_key = {
1108 let et = result[idx].entity_type.as_str();
1109 let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
1110 k.push_str(et);
1111 k.push('\0');
1112 k.push_str(&old_name_lc);
1113 k
1114 };
1115 by_lc.remove(&old_key);
1116 result[idx] = ent;
1117 by_lc.insert(key, idx);
1118 }
1119 }
1120 None => {
1121 by_lc.insert(key, result.len());
1122 result.push(ent);
1123 }
1124 }
1125 if result.len() >= MAX_ENTS {
1126 truncated = true;
1127 break;
1128 }
1129 }
1130
1131 if truncated {
1133 tracing::warn!(target: "extraction",
1134 "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
1135 );
1136 }
1137
1138 result
1139}
1140
1141fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1142 extracted
1143 .into_iter()
1144 .map(|e| NewEntity {
1145 name: e.name,
1146 entity_type: e.entity_type,
1147 description: None,
1148 })
1149 .collect()
1150}
1151
1152pub fn extract_graph_auto(
1153 body: &str,
1154 paths: &AppPaths,
1155 variant: GlinerVariant,
1156) -> Result<ExtractionResult> {
1157 let regex_entities = apply_regex_prefilter(body);
1158 let threshold = crate::constants::gliner_confidence_threshold();
1159
1160 let mut gliner_used = false;
1161 let ner_entities = match get_or_init_gliner(paths, variant) {
1162 Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1163 Ok(ents) => {
1164 gliner_used = true;
1165 ents
1166 }
1167 Err(e) => {
1168 tracing::warn!(target: "extraction", error = %e, "GLiNER NER failed, falling back to regex-only");
1169 Vec::new()
1170 }
1171 },
1172 None => Vec::new(),
1173 };
1174
1175 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1176 let extended = extend_with_numeric_suffix(merged, body);
1177 let with_models = augment_versioned_model_names(extended, body);
1178 let with_models: Vec<ExtractedEntity> = with_models
1179 .into_iter()
1180 .filter(|e| !regex_section_marker().is_match(&e.name))
1181 .collect();
1182 let entities = to_new_entities(with_models);
1183 let (relationships, relationships_truncated) =
1184 build_relationships_by_sentence_cooccurrence(body, &entities);
1185
1186 let extraction_method = if gliner_used {
1187 format!("gliner-{variant}+regex")
1188 } else {
1189 "regex-only".to_string()
1190 };
1191
1192 let urls = extract_urls(body);
1193
1194 Ok(ExtractionResult {
1195 entities,
1196 relationships,
1197 relationships_truncated,
1198 extraction_method,
1199 urls,
1200 })
1201}
1202
/// `Extractor` that relies on the regex prefilter only and never loads the
/// GLiNER model.
pub struct RegexExtractor;
1204
1205impl Extractor for RegexExtractor {
1206 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1207 let regex_entities = apply_regex_prefilter(body);
1208 let entities = to_new_entities(regex_entities);
1209 let (relationships, relationships_truncated) =
1210 build_relationships_by_sentence_cooccurrence(body, &entities);
1211 let urls = extract_urls(body);
1212 Ok(ExtractionResult {
1213 entities,
1214 relationships,
1215 relationships_truncated,
1216 extraction_method: "regex-only".to_string(),
1217 urls,
1218 })
1219 }
1220}
1221
1222#[cfg(test)]
1223mod tests {
1224 use super::*;
1225 use crate::entity_type::EntityType;
1226
1227 fn make_paths() -> AppPaths {
1228 use std::path::PathBuf;
1229 AppPaths {
1230 db: PathBuf::from("/tmp/test.sqlite"),
1231 models: PathBuf::from("/tmp/test_models"),
1232 }
1233 }
1234
1235 #[test]
1236 fn regex_email_captures_address() {
1237 let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1238 assert!(ents
1240 .iter()
1241 .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1242 }
1243
1244 #[test]
1245 fn regex_all_caps_filters_pt_rule_word() {
1246 let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1248 assert!(
1249 !ents.iter().any(|e| e.name == "NUNCA"),
1250 "NUNCA must be filtered as a stopword"
1251 );
1252 assert!(
1253 !ents.iter().any(|e| e.name == "PROIBIDO"),
1254 "PROIBIDO must be filtered"
1255 );
1256 assert!(
1257 !ents.iter().any(|e| e.name == "DEVE"),
1258 "DEVE must be filtered"
1259 );
1260 }
1261
1262 #[test]
1263 fn regex_all_caps_accepts_underscored_constant() {
1264 let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1266 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1267 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1268 }
1269
1270 #[test]
1271 fn regex_all_caps_accepts_domain_acronym() {
1272 let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1274 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1275 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1276 }
1277
1278 #[test]
1279 fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1280 let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1282 assert!(
1283 !ents.iter().any(|e| e.name.starts_with("https://")),
1284 "URLs must not appear as entities after the P0-2 split"
1285 );
1286 }
1287
1288 #[test]
1289 fn extract_urls_captures_https() {
1290 let urls = extract_urls("see https://docs.rs/crate for details");
1291 assert_eq!(urls.len(), 1);
1292 assert_eq!(urls[0].url, "https://docs.rs/crate");
1293 assert!(urls[0].offset > 0);
1294 }
1295
/// A sentence-ending period right after a URL must not be part of the
/// extracted URL.
#[test]
fn extract_urls_trim_sufixo_pontuacao() {
    let found = extract_urls("link: https://example.com/path. fim");
    assert!(!found.is_empty());

    let first_url = &found[0].url;
    assert!(
        !first_url.ends_with('.'),
        "sufixo ponto deve ser removido"
    );
}
1305
/// The same URL appearing twice in a body is reported only once.
#[test]
fn extract_urls_dedupes_repeated() {
    let found =
        extract_urls("https://example.com referenciado aqui e depois aqui https://example.com");
    assert_eq!(found.len(), 1, "URLs repetidas devem ser deduplicadas");
}
1312
/// A body containing a UUID must yield at least one `Concept` entity.
#[test]
fn regex_uuid_captura_identificador() {
    let found = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
    let has_concept = found
        .iter()
        .any(|ent| ent.entity_type == EntityType::Concept);
    assert!(has_concept);
}
1318
/// Bare underscored constants in running text are captured by name.
#[test]
fn regex_all_caps_captura_constante() {
    let found = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
    let names: Vec<&str> = found.iter().map(|ent| ent.name.as_str()).collect();
    assert!(names.contains(&"MAX_RETRY"));
    assert!(names.contains(&"TIMEOUT_MS"));
}
1325
/// A two-letter all-caps token ("AI") must not be emitted as an entity.
#[test]
fn regex_all_caps_ignores_short_words() {
    let found = apply_regex_prefilter("use AI em seu projeto");
    let leaked = found.iter().any(|ent| ent.name == "AI");
    assert!(!leaked, "AI tem apenas 2 chars, deve ser ignorado");
}
1334
/// `build_relationships` must never return more relationships than the
/// configured per-memory cap, and must flag truncation when it returns
/// exactly that many.
#[test]
fn build_relationships_respeitam_max_rels() {
    let mut entities: Vec<NewEntity> = Vec::with_capacity(20);
    for idx in 0..20 {
        entities.push(NewEntity {
            name: format!("entidade_{idx}"),
            entity_type: EntityType::Concept,
            description: None,
        });
    }

    let (rels, truncated) = build_relationships(&entities);
    let max_rels = crate::constants::max_relationships_per_memory();
    let produced = rels.len();

    assert!(produced <= max_rels, "deve respeitar max_rels={max_rels}");
    if produced == max_rels {
        assert!(truncated, "truncated deve ser true quando atingiu o cap");
    }
}
1351
/// No `(source, target)` pair may be emitted twice by `build_relationships`.
#[test]
fn build_relationships_without_duplicates() {
    let mut entities: Vec<NewEntity> = Vec::with_capacity(5);
    for idx in 0..5 {
        entities.push(NewEntity {
            name: format!("ent_{idx}"),
            entity_type: EntityType::Concept,
            description: None,
        });
    }

    let (rels, _) = build_relationships(&entities);

    let mut seen: std::collections::HashSet<(String, String)> =
        std::collections::HashSet::with_capacity(rels.len());
    for rel in &rels {
        // `insert` returns false when the pair was already recorded.
        let first_time = seen.insert((rel.source.clone(), rel.target.clone()));
        assert!(first_time, "par duplicado encontrado");
    }
}
1369
/// Two entities of the same type whose names differ only by case merge
/// into a single entity.
#[test]
fn merge_dedupes_by_lowercase_name() {
    let capitalized = vec![entity("Rust", EntityType::Concept)];
    let lowercase = vec![entity("rust", EntityType::Concept)];

    let merged = merge_and_deduplicate(capitalized, lowercase);

    assert_eq!(
        merged.len(),
        1,
        "rust and Rust with the same type are the same entity"
    );
}
1389
/// `RegexExtractor::extract` succeeds and yields at least one entity for a
/// body holding an e-mail address and an upper-case constant.
#[test]
fn regex_extractor_implements_trait() {
    let outcome = RegexExtractor
        .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
        .unwrap();
    assert!(!outcome.entities.is_empty());
}
1398
/// `extract_graph_auto` must return `Ok` for the paths produced by
/// `make_paths()` and still surface the e-mail address as an entity.
#[test]
fn extract_returns_ok_without_model() {
    let paths = make_paths();
    let outcome = extract_graph_auto(
        "contato: teste@exemplo.com com MAX_RETRY=3",
        &paths,
        GlinerVariant::Int8,
    )
    .unwrap();

    let has_email = outcome
        .entities
        .iter()
        .any(|ent| ent.name.contains("teste@exemplo.com"));
    assert!(has_email);
}
1410
/// None of the listed v1.0.24 stopwords may be returned as an entity name
/// when they all occur in one body.
#[test]
fn stopwords_filter_v1024_terms() {
    let found = apply_regex_prefilter(
        "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
         DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS",
    );

    let stopwords = [
        "ACEITE",
        "ACK",
        "ACL",
        "BORDA",
        "CHECKLIST",
        "COMPLETED",
        "CONFIRME",
        "DEVEMOS",
        "DONE",
        "FIXED",
        "NEGUE",
        "PENDING",
        "PLAN",
        "PODEMOS",
        "RECUSE",
        "TOKEN",
        "VAMOS",
    ];
    for word in stopwords {
        let leaked = found.iter().any(|ent| ent.name == word);
        assert!(
            !leaked,
            "v1.0.24 stopword {word} should be filtered but was found in entities"
        );
    }
}
1444
/// A precomposed "é" (U+00E9) and "e" followed by a combining acute accent
/// (U+0301) must be treated as the same name when merging.
#[test]
fn dedup_normalizes_unicode_combining_marks() {
    let precomposed = vec![entity("Caf\u{e9}", EntityType::Concept)];
    let decomposed = vec![entity("Cafe\u{301}", EntityType::Concept)];

    let merged = merge_and_deduplicate(precomposed, decomposed);

    assert_eq!(
        merged.len(),
        1,
        "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
    );
}
1467
/// `RegexExtractor` labels its output with the `regex-only` method.
#[test]
fn extraction_method_regex_only_unchanged() {
    let extractor = RegexExtractor;
    let outcome = extractor.extract("contact: dev@acme.io").unwrap();
    assert_eq!(
        outcome.extraction_method, "regex-only",
        "RegexExtractor must return regex-only"
    );
}
1478
/// "GPT" followed by "-5" in the body is extended to "GPT-5".
#[test]
fn extend_suffix_pure_numeric_unchanged() {
    let extended = extend_with_numeric_suffix(
        vec![entity("GPT", EntityType::Concept)],
        "using GPT-5 in the project",
    );
    assert_eq!(
        extended[0].name, "GPT-5",
        "purely numeric suffix must be extended"
    );
}
1494
/// A digit-then-letter suffix ("4o") is accepted when extending "GPT".
#[test]
fn extend_suffix_alphanumeric_letter_after_digit() {
    let extended = extend_with_numeric_suffix(
        vec![entity("GPT", EntityType::Concept)],
        "using GPT-4o for advanced tasks",
    );
    assert_eq!(extended[0].name, "GPT-4o", "suffix '4o' must be accepted");
}
1505
/// A "5b" suffix is accepted when extending "Llama".
#[test]
fn extend_suffix_alphanumeric_b_suffix() {
    let extended = extend_with_numeric_suffix(
        vec![entity("Llama", EntityType::Concept)],
        "Llama-5b open-weight model",
    );
    assert_eq!(extended[0].name, "Llama-5b", "suffix '5b' must be accepted");
}
1516
/// An "8x" suffix is accepted when extending "Mistral".
#[test]
fn extend_suffix_alphanumeric_x_suffix() {
    let extended = extend_with_numeric_suffix(
        vec![entity("Mistral", EntityType::Concept)],
        "testing Mistral-8x in production",
    );
    assert_eq!(extended[0].name, "Mistral-8x", "suffix '8x' must be accepted");
}
1527
/// Starting from no entities, the augmentation pass must add "GPT-4o".
#[test]
fn augment_versioned_gpt4o() {
    let found = augment_versioned_model_names(Vec::new(), "using GPT-4o for analysis");
    let captured = found.iter().any(|ent| ent.name == "GPT-4o");
    assert!(
        captured,
        "GPT-4o must be captured by augment, found: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1540
/// A space-separated "name version tier" model ("Claude 4 Sonnet") is
/// captured as one entity.
#[test]
fn augment_versioned_claude_4_sonnet() {
    let found =
        augment_versioned_model_names(Vec::new(), "best model: Claude 4 Sonnet released today");
    let captured = found.iter().any(|ent| ent.name == "Claude 4 Sonnet");
    assert!(
        captured,
        "Claude 4 Sonnet must be captured, found: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1552
/// "Llama 3 Pro" inside a Portuguese sentence is captured as one entity.
#[test]
fn augment_versioned_llama_3_pro() {
    let found =
        augment_versioned_model_names(Vec::new(), "fine-tuning com Llama 3 Pro localmente");
    let captured = found.iter().any(|ent| ent.name == "Llama 3 Pro");
    assert!(
        captured,
        "Llama 3 Pro deve ser capturado, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1564
/// A mixed alphanumeric version ("Mixtral 8x7B") is captured as one entity.
#[test]
fn augment_versioned_mixtral_8x7b() {
    let found =
        augment_versioned_model_names(Vec::new(), "executando Mixtral 8x7B no servidor local");
    let captured = found.iter().any(|ent| ent.name == "Mixtral 8x7B");
    assert!(
        captured,
        "Mixtral 8x7B deve ser capturado, achados: {:?}",
        found.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1576
/// When "Claude 4" is already in the input list, augmentation must not add
/// a second copy.
#[test]
fn augment_versioned_does_not_duplicate_existing() {
    let seeded = vec![entity("Claude 4", EntityType::Concept)];
    let found = augment_versioned_model_names(seeded, "using Claude 4 in the project");

    let occurrences = found.iter().filter(|ent| ent.name == "Claude 4").count();
    assert_eq!(occurrences, 1, "Claude 4 must not be duplicated");
}
1588
/// Generic technical acronyms (URL, JWT, API, ...) must not be returned as
/// entity names.
#[test]
fn stopwords_filter_url_jwt_api_v1025() {
    let found = apply_regex_prefilter(
        "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.",
    );
    let names: Vec<&str> = found.iter().map(|ent| ent.name.as_str()).collect();

    let acronyms = [
        "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
    ];
    for blocked in acronyms {
        assert!(
            !names.contains(&blocked),
            "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
        );
    }
}
1606
/// No entity name may contain the section markers "Etapa" or "Fase".
#[test]
fn section_markers_etapa_fase_filtered_v1025() {
    let ents = apply_regex_prefilter(
        "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.",
    );
    let is_marker = |name: &str| name.contains("Etapa") || name.contains("Fase");

    assert!(
        ents.iter().all(|ent| !is_marker(ent.name.as_str())),
        "section markers must be stripped; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1624
/// No entity name may contain the section markers "Passo" or "Seção".
#[test]
fn section_markers_passo_secao_filtered_v1025() {
    let ents = apply_regex_prefilter("Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.");
    let is_marker =
        |name: &str| name.contains("Passo") || name.contains("Se\u{e7}\u{e3}o");

    assert!(
        ents.iter().all(|ent| !is_marker(ent.name.as_str())),
        "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1639
/// "OpenAI" must be extracted from the body and typed as `Organization`.
#[test]
fn brand_camelcase_extracted_as_organization_v1025() {
    let ents = apply_regex_prefilter("OpenAI launched GPT-4 and PostgreSQL added pgvector.");

    // Fail with the list of extracted names when the brand is missing.
    let Some(openai) = ents.iter().find(|ent| ent.name == "OpenAI") else {
        panic!(
            "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
            ents.iter().map(|e| &e.name).collect::<Vec<_>>()
        );
    };

    assert_eq!(
        openai.entity_type,
        EntityType::Organization,
        "brand CamelCase must map to organization (V008)"
    );
}
1659
/// "PostgreSQL" must be extracted and typed as `Organization`.
#[test]
fn brand_postgresql_extracted_as_organization_v1025() {
    let ents =
        apply_regex_prefilter("migrating from MySQL to PostgreSQL for better performance.");
    let found = ents
        .iter()
        .any(|ent| ent.entity_type == EntityType::Organization && ent.name == "PostgreSQL");

    assert!(
        found,
        "PostgreSQL must be extracted as organization; entities: {:?}",
        ents.iter()
            .map(|e| (&e.name, &e.entity_type))
            .collect::<Vec<_>>()
    );
}
1673
1674 fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1677 ExtractedEntity {
1678 name: name.to_string(),
1679 entity_type,
1680 }
1681 }
1682
/// When one list holds "Sonne" and the other "Sonnet" (same type), only the
/// longer name survives the merge.
#[test]
fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
    let truncated_name = vec![entity("Sonne", EntityType::Concept)];
    let full_name = vec![entity("Sonnet", EntityType::Concept)];

    let result = merge_and_deduplicate(truncated_name, full_name);

    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "Sonnet");
}
1692
/// "Open" and "OpenAI" in the same list (same type) collapse to "OpenAI".
#[test]
fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
    let candidates = vec![
        entity("Open", EntityType::Organization),
        entity("OpenAI", EntityType::Organization),
    ];

    let result = merge_and_deduplicate(candidates, Vec::new());

    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "OpenAI");
}
1704
/// Unrelated names ("Alice", "Bob") are both kept by the merge.
#[test]
fn merge_keeps_both_when_no_containment_v1025() {
    let people = vec![
        entity("Alice", EntityType::Person),
        entity("Bob", EntityType::Person),
    ];

    let result = merge_and_deduplicate(people, Vec::new());

    assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
}
1715
/// The same name under two different entity types yields two entities.
#[test]
fn merge_respects_entity_type_boundary_v1025() {
    let same_name_two_types = vec![
        entity("Apple", EntityType::Organization),
        entity("Apple", EntityType::Concept),
    ];

    let result = merge_and_deduplicate(same_name_two_types, Vec::new());

    assert_eq!(
        result.len(),
        2,
        "expected 2 entities (different types), got: {result:?}"
    );
}
1730
/// "OpenAI" and "openai" with the same type collapse into one entity.
#[test]
fn merge_case_insensitive_dedup_v1025() {
    let case_variants = vec![
        entity("OpenAI", EntityType::Organization),
        entity("openai", EntityType::Organization),
    ];

    let result = merge_and_deduplicate(case_variants, Vec::new());

    assert_eq!(
        result.len(),
        1,
        "expected 1 entity after case-insensitive dedup, got: {result:?}"
    );
}
1745
/// Extraction over an 80 000-byte body ("x " repeated 40 000 times) must
/// succeed and take less than 30 whole seconds.
#[test]
fn extract_graph_auto_handles_large_body_under_30s() {
    let paths = make_paths();
    let body = "x ".repeat(40_000);

    let timer = std::time::Instant::now();
    let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
        .expect("extraction must not error");
    let elapsed = timer.elapsed();

    assert!(
        elapsed.as_secs() < 30,
        "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
        elapsed.as_secs()
    );
    // Only success and timing matter here; the entities themselves are not checked.
    let _ = result.entities;
}
1766
/// None of the listed upper-case stopwords may appear among the extracted
/// names; names are upper-cased before the comparison.
#[test]
fn pt_uppercase_stopwords_filtered_v1031() {
    let ents = apply_regex_prefilter(
        "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
         SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.",
    );
    let names: Vec<String> = ents.iter().map(|ent| ent.name.to_uppercase()).collect();

    let stoplist = [
        "ADAPTER",
        "PROJETO",
        "PASSIVA",
        "SOMENTE",
        "LEITURA",
        "REGRA",
        "OBRIGATORIA",
        "EXEMPLO",
        "DEFAULT",
    ];
    for stop in stoplist {
        let leaked = names.iter().any(|name| name.as_str() == stop);
        assert!(
            !leaked,
            "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
        );
    }
}
1792
/// Underscored identifiers are still extracted, including one with three
/// segments (`FLOWAIPER_API_KEY`).
#[test]
fn pt_underscored_identifier_preserved_v1031() {
    let found = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
    assert!(found.iter().any(|ent| ent.name == "FLOWAIPER_API_KEY"));
    assert!(found.iter().any(|ent| ent.name == "MAX_TIMEOUT"));
}
1802
/// Alice and Bob share a sentence while Carol appears in a different one,
/// so exactly one relationship (Alice/Bob, either direction) is expected
/// and nothing is truncated.
#[test]
fn build_relationships_by_sentence_only_links_co_occurring_entities() {
    let body = "Alice met Bob at the conference. Carol works alone in another room.";
    let entities: Vec<NewEntity> = ["Alice", "Bob", "Carol"]
        .iter()
        .map(|name| NewEntity {
            name: name.to_string(),
            entity_type: EntityType::Person,
            description: None,
        })
        .collect();

    let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);

    assert!(!truncated);
    assert_eq!(
        rels.len(),
        1,
        "only Alice/Bob should pair (same sentence); Carol is isolated"
    );

    let link = &rels[0];
    let pair = (link.source.as_str(), link.target.as_str());
    let accepted = [("Alice", "Bob"), ("Bob", "Alice")];
    assert!(accepted.contains(&pair), "unexpected pair {pair:?}");
}
1838
/// A single entity yields no relationships and no truncation flag.
#[test]
fn build_relationships_by_sentence_returns_empty_for_single_entity() {
    let lone = vec![NewEntity {
        name: "Alice".to_string(),
        entity_type: EntityType::Person,
        description: None,
    }];

    let (rels, truncated) =
        build_relationships_by_sentence_cooccurrence("Alice is here.", &lone);

    assert!(rels.is_empty());
    assert!(!truncated);
}
1851
/// Two entities that co-occur in two separate sentences still produce a
/// single relationship.
#[test]
fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
    let body = "Alice met Bob. Bob saw Alice again.";
    let entities: Vec<NewEntity> = ["Alice", "Bob"]
        .iter()
        .map(|name| NewEntity {
            name: name.to_string(),
            entity_type: EntityType::Person,
            description: None,
        })
        .collect();

    let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);

    assert_eq!(
        rels.len(),
        1,
        "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
    );
}
1874
/// Serializes the two tests below, which both mutate the process-wide
/// `SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS` variable.
///
/// The test harness runs tests on several threads by default and the
/// environment is shared by the whole process, so without this lock the
/// "default" test could observe the value written by the "override" test.
/// Only these two tests take the lock; any other test that reads the
/// variable indirectly is not serialized by it.
static EXTRACTION_MAX_TOKENS_ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

/// Holds the env lock for the duration of one test and clears the variable
/// both on acquisition and on drop, so a failed assertion cannot leak an
/// override into a later test.
struct ExtractionMaxTokensEnv {
    _lock: std::sync::MutexGuard<'static, ()>,
}

impl ExtractionMaxTokensEnv {
    const VAR: &'static str = "SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS";

    /// Takes the lock and starts the test from an unset variable.
    fn acquire() -> Self {
        // A poisoned lock only means another env test panicked while holding
        // it; the `()` payload cannot be left inconsistent, so keep going.
        let lock = EXTRACTION_MAX_TOKENS_ENV_LOCK
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        std::env::remove_var(Self::VAR);
        Self { _lock: lock }
    }

    /// Overrides the variable while the lock is held.
    fn set(&self, value: &str) {
        std::env::set_var(Self::VAR, value);
    }
}

impl Drop for ExtractionMaxTokensEnv {
    fn drop(&mut self) {
        // Runs before `_lock` is released, so the cleanup is still serialized.
        std::env::remove_var(Self::VAR);
    }
}

/// With the variable unset, the extraction token budget is 5 000.
#[test]
fn extraction_max_tokens_default_is_5000() {
    let _env = ExtractionMaxTokensEnv::acquire();
    assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
}

/// Out-of-range overrides fall back to the default; an in-range override is
/// returned as given.
#[test]
fn extraction_max_tokens_env_override_clamped() {
    let env = ExtractionMaxTokensEnv::acquire();

    env.set("200");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        5_000,
        "value below 512 must fall back to default"
    );

    env.set("200000");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        5_000,
        "value above 100_000 must fall back to default"
    );

    env.set("8000");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        8_000,
        "valid value must be honoured"
    );
    // The guard's `Drop` removes the variable, including on assertion failure.
}
1906
/// Every supported variant name parses to its variant; upper-case spellings
/// are accepted as well.
#[test]
fn gliner_variant_from_str_valid() {
    let cases = [
        ("fp32", GlinerVariant::Fp32),
        ("fp16", GlinerVariant::Fp16),
        ("int8", GlinerVariant::Int8),
        ("q4", GlinerVariant::Q4),
        ("q4f16", GlinerVariant::Q4f16),
        ("FP32", GlinerVariant::Fp32),
        ("INT8", GlinerVariant::Int8),
    ];
    for (text, expected) in cases {
        assert_eq!(text.parse::<GlinerVariant>().unwrap(), expected);
    }
}
1936
/// Unknown names and the empty string are rejected by the parser.
#[test]
fn gliner_variant_from_str_invalid() {
    let rejected = |text: &str| text.parse::<GlinerVariant>().is_err();
    assert!(rejected("invalid"));
    assert!(rejected("fp64"));
    assert!(rejected(""));
}
1943
/// Each variant maps to its expected ONNX file name.
#[test]
fn gliner_variant_filename_mapping() {
    let cases = [
        (GlinerVariant::Fp32, "model.onnx"),
        (GlinerVariant::Fp16, "model_fp16.onnx"),
        (GlinerVariant::Int8, "model_quantized.onnx"),
        (GlinerVariant::Q4, "model_q4.onnx"),
        (GlinerVariant::Q4f16, "model_q4f16.onnx"),
    ];
    for (variant, filename) in cases {
        assert_eq!(variant.as_filename(), filename);
    }
}
1952
/// `Display` renders each variant as its lower-case short name.
#[test]
fn gliner_variant_display() {
    let cases = [
        (GlinerVariant::Fp32, "fp32"),
        (GlinerVariant::Fp16, "fp16"),
        (GlinerVariant::Int8, "int8"),
        (GlinerVariant::Q4, "q4"),
        (GlinerVariant::Q4f16, "q4f16"),
    ];
    for (variant, rendered) in cases {
        assert_eq!(variant.to_string(), rendered);
    }
}
1961
/// Spot-checks the human-readable size strings for two variants.
#[test]
fn gliner_variant_display_size() {
    let cases = [
        (GlinerVariant::Fp32, "1.1 GB"),
        (GlinerVariant::Int8, "349 MB"),
    ];
    for (variant, size) in cases {
        assert_eq!(variant.display_size(), size);
    }
}
1967
/// Each of the thirteen entity types checked below must be the target of
/// at least one GLiNER label, and the label table must have exactly 13
/// entries.
#[test]
fn gliner_entity_labels_covers_all_types() {
    let covered: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|&(_, ty)| ty).collect();

    assert!(covered.contains(&EntityType::Person));
    assert!(covered.contains(&EntityType::Organization));
    assert!(covered.contains(&EntityType::Location));
    assert!(covered.contains(&EntityType::Date));
    assert!(covered.contains(&EntityType::Project));
    assert!(covered.contains(&EntityType::Tool));
    assert!(covered.contains(&EntityType::File));
    assert!(covered.contains(&EntityType::Concept));
    assert!(covered.contains(&EntityType::Decision));
    assert!(covered.contains(&EntityType::Incident));
    assert!(covered.contains(&EntityType::Dashboard));
    assert!(covered.contains(&EntityType::IssueTracker));
    assert!(covered.contains(&EntityType::Memory));

    assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
}
1986
/// No label string may occur twice in the GLiNER label table.
#[test]
fn gliner_entity_labels_no_duplicates() {
    let mut seen = std::collections::HashSet::new();
    for (label, _) in GLINER_ENTITY_LABELS.iter() {
        // `insert` returns false when the label was already present.
        let is_new = seen.insert(*label);
        assert!(is_new, "duplicate label: {label}");
    }
}
1994
/// `extract_graph_auto` must succeed, surface the e-mail address, and
/// report either the `regex-only` method or one starting with `gliner-`.
#[test]
fn extract_graph_auto_regex_only_fallback() {
    let paths = make_paths();
    let result = extract_graph_auto(
        "Contact someone@test.com about OPENAI project",
        &paths,
        GlinerVariant::Fp32,
    );
    assert!(result.is_ok());
    let res = result.unwrap();

    let has_email = res.entities.iter().any(|ent| ent.name == "someone@test.com");
    assert!(has_email);

    let known_method =
        res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-");
    assert!(
        known_method,
        "unexpected extraction_method: {}",
        res.extraction_method
    );
}
2017
/// Rendering a variant with `Display` and parsing the result gives back
/// the same variant.
#[test]
fn gliner_variant_roundtrip() {
    let all_variants = [
        GlinerVariant::Fp32,
        GlinerVariant::Fp16,
        GlinerVariant::Int8,
        GlinerVariant::Q4,
        GlinerVariant::Q4f16,
    ];
    for variant in all_variants {
        let rendered = variant.to_string();
        let parsed: GlinerVariant = rendered.parse().unwrap();
        assert_eq!(variant, parsed);
    }
}
2032}