1use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use anyhow::{Context, Result};
10use ort::session::{builder::GraphOptimizationLevel, Session};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use unicode_normalization::UnicodeNormalization;
14
15use crate::entity_type::EntityType;
16use crate::paths::AppPaths;
17use crate::storage::entities::{NewEntity, NewRelationship};
18
/// Hard cap on the number of entities kept per memory after merging.
const MAX_ENTS: usize = 30;
/// Max "mentions" edges emitted per source entity (used only by the
/// test-only index-pairing helper `build_relationships`).
#[cfg(test)]
const TOP_K_RELATIONS: usize = 5;
/// Relation label assigned to every automatically inferred edge.
const DEFAULT_RELATION: &str = "mentions";
/// Entity names shorter than this many bytes are discarded.
const MIN_ENTITY_CHARS: usize = 2;
26
// Lazily-initialized, compiled-once regex matchers; see the accessor
// functions below for the patterns.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35
/// Mixed Portuguese/English prose words plus common tech acronyms that the
/// ALL-CAPS regex would otherwise surface as spurious entities.
///
/// The list is roughly alphabetical but NOT byte-sorted (accented entries
/// such as "CRÍTICO" break byte order), so lookups use linear `contains`.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACID",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADAPTER",
    "ADICIONADA",
    "ADICIONADAS",
    "ADICIONADO",
    "ADICIONADOS",
    "ADICIONAR",
    "AGENTS",
    "AINDA",
    "ALL",
    "ALTA",
    "ALWAYS",
    "APENAS",
    "API",
    "ARTEFATOS",
    "ATIVA",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CEO",
    "CHECKLIST",
    "CLARO",
    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRMARAM",
    "CONFIRME",
    "CONFIRMEI",
    "CONFIRMOU",
    "CONTRATO",
    "CRIE",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DDL",
    "DEFAULT",
    "DEFINIR",
    "DEPARTMENT",
    "DESC",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "DSL",
    "DTO",
    "EFEITO",
    "ENTRADA",
    "EOF",
    "EPERM",
    "ERROR",
    "ESCREVA",
    "ESCRITA",
    "ESRCH",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTADO",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXEMPLO",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FATO",
    "FIFO",
    "FIXED",
    "FIXME",
    "FLUXO",
    "FONTES",
    "FORBIDDEN",
    "FUNCIONA",
    "GNU",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LEITURA",
    "LLM",
    "MCP",
    "MESMO",
    "METADADOS",
    "MUST",
    "NDJSON",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATORIA",
    "OBRIGATÓRIO",
    "OBSERVEI",
    "PADRÃO",
    "PASSIVA",
    "PASSO",
    "PENDING",
    "PGID",
    "PID",
    "PLAN",
    "PODEMOS",
    "PONTEIROS",
    "PREFERIR",
    "PROIBIDO",
    "PROJETO",
    "RECUSE",
    "REGRA",
    "REGRAS",
    "REMOVIDAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SIGTERM",
    "SOMENTE",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "TUI",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
204
/// HTTP verbs, filtered so request examples ("GET /users") don't become entities.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
210
211fn is_filtered_all_caps(token: &str) -> bool {
212 let is_identifier = token.contains('_');
214 if is_identifier {
215 return false;
216 }
217 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
218}
219
220fn regex_email() -> &'static Regex {
221 REGEX_EMAIL.get_or_init(|| {
223 Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224 .expect("compile-time validated email regex literal")
225 })
226}
227
228fn regex_url() -> &'static Regex {
229 REGEX_URL.get_or_init(|| {
231 Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232 .expect("compile-time validated URL regex literal")
233 })
234}
235
236fn regex_uuid() -> &'static Regex {
237 REGEX_UUID.get_or_init(|| {
239 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240 .expect("compile-time validated UUID regex literal")
241 })
242}
243
244fn regex_all_caps() -> &'static Regex {
245 REGEX_ALL_CAPS.get_or_init(|| {
246 Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247 .expect("compile-time validated all-caps regex literal")
248 })
249}
250
251fn regex_section_marker() -> &'static Regex {
252 REGEX_SECTION_MARKER.get_or_init(|| {
253 Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260 .expect("compile-time validated section marker regex literal")
261 })
262}
263
264fn regex_brand_camel() -> &'static Regex {
265 REGEX_BRAND_CAMEL.get_or_init(|| {
266 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269 .expect("compile-time validated CamelCase brand regex literal")
270 })
271}
272
/// An entity surfaced by the regex prefilter or GLiNER NER, before
/// conversion into a storage row.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    pub name: String,
    pub entity_type: EntityType,
}
278
/// A URL found in a memory body, with the byte offset of its first occurrence.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    pub url: String,
    // Byte offset of the match start within the original body.
    pub offset: usize,
}
286
/// Complete output of an extraction pass over one memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    pub entities: Vec<NewEntity>,
    pub relationships: Vec<NewRelationship>,
    // True when the relationship cap (max_relationships_per_memory) was hit.
    pub relationships_truncated: bool,
    // Provenance tag, e.g. "gliner-int8+regex" or "regex-only".
    pub extraction_method: String,
    pub urls: Vec<ExtractedUrl>,
}
300
/// Strategy interface for turning a memory body into entities/relationships/URLs.
pub trait Extractor: Send + Sync {
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
304
/// Quantization variants of the GLiNER ONNX model; each maps to a distinct
/// weights file (see `as_filename`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GlinerVariant {
    Fp32,
    Fp16,
    Int8,
    Q4,
    Q4f16,
}
314
315impl GlinerVariant {
316 pub fn as_filename(self) -> &'static str {
318 match self {
319 Self::Fp32 => "model.onnx",
320 Self::Fp16 => "model_fp16.onnx",
321 Self::Int8 => "model_quantized.onnx",
322 Self::Q4 => "model_q4.onnx",
323 Self::Q4f16 => "model_q4f16.onnx",
324 }
325 }
326
327 pub fn display_size(self) -> &'static str {
329 match self {
330 Self::Fp32 => "1.1 GB",
331 Self::Fp16 => "580 MB",
332 Self::Int8 => "349 MB",
333 Self::Q4 => "894 MB",
334 Self::Q4f16 => "472 MB",
335 }
336 }
337}
338
339impl std::fmt::Display for GlinerVariant {
340 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341 match self {
342 Self::Fp32 => f.write_str("fp32"),
343 Self::Fp16 => f.write_str("fp16"),
344 Self::Int8 => f.write_str("int8"),
345 Self::Q4 => f.write_str("q4"),
346 Self::Q4f16 => f.write_str("q4f16"),
347 }
348 }
349}
350
351impl std::str::FromStr for GlinerVariant {
352 type Err = anyhow::Error;
353 fn from_str(s: &str) -> Result<Self> {
354 match s.to_lowercase().as_str() {
355 "fp32" => Ok(Self::Fp32),
356 "fp16" => Ok(Self::Fp16),
357 "int8" => Ok(Self::Int8),
358 "q4" => Ok(Self::Q4),
359 "q4f16" => Ok(Self::Q4f16),
360 other => {
361 anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362 }
363 }
364 }
365}
366
/// Maximum candidate entity span width, in words.
const GLINER_MAX_WIDTH: usize = 12;
/// Model sequence-length budget used to cap the input word count.
const GLINER_MAX_SEQ_LEN: usize = 384;
/// Special token preceding each entity label in the GLiNER prompt.
const GLINER_ENT_TOKEN: &str = "<<ENT>>";
/// Special token separating the label list from the text words.
const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371
/// Label prompt strings fed to GLiNER, paired with the entity type each
/// label maps back to. Order defines the model's class indices.
const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
    ("person", EntityType::Person),
    ("organization", EntityType::Organization),
    ("location", EntityType::Location),
    ("date", EntityType::Date),
    ("project", EntityType::Project),
    ("tool", EntityType::Tool),
    ("file", EntityType::File),
    ("concept", EntityType::Concept),
    ("decision", EntityType::Decision),
    ("incident", EntityType::Incident),
    ("dashboard", EntityType::Dashboard),
    ("issue tracker", EntityType::IssueTracker),
    ("memory", EntityType::Memory),
];
387
/// A loaded GLiNER ONNX session plus its tokenizer.
struct GlinerModel {
    // Mutex-wrapped because inference runs through an exclusive guard and
    // the model is shared via a process-wide static (GLINER_MODEL).
    session: std::sync::Mutex<Session>,
    tokenizer: tokenizers::Tokenizer,
    #[allow(dead_code)]
    variant: GlinerVariant,
}
394
impl GlinerModel {
    /// Loads the GLiNER ONNX session (with full graph optimization) and its
    /// HuggingFace tokenizer from `model_dir`.
    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
        let model_path = model_dir.join(variant.as_filename());
        let tokenizer_path = model_dir.join("tokenizer.json");

        let session = Session::builder()
            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
            .with_optimization_level(GraphOptimizationLevel::Level3)
            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
            .commit_from_file(&model_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;

        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;

        Ok(Self {
            session: std::sync::Mutex::new(session),
            tokenizer,
            variant,
        })
    }

    /// Runs GLiNER span-based NER over `body` and returns up to MAX_ENTS
    /// non-overlapping entities whose sigmoid score is >= `threshold`.
    ///
    /// The prompt follows the GLiNER format:
    /// `<<ENT>> label1 <<ENT>> label2 ... <<SEP>> word1 word2 ...`
    fn predict(
        &self,
        body: &str,
        entity_labels: &[(&str, EntityType)],
        threshold: f32,
    ) -> Result<Vec<ExtractedEntity>> {
        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
        let words: Vec<&str> = body.split_whitespace().collect();
        if words.is_empty() {
            return Ok(Vec::new());
        }

        // Each label contributes "<<ENT>> label" (2 tokens), plus one <<SEP>>.
        let label_token_count = label_names.len() * 2 + 1;
        // The extra +2 leaves room for the leading/trailing special tokens.
        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
        let words = if words.len() > max_words {
            tracing::warn!(
                original_words = words.len(),
                capped_words = max_words,
                "GLiNER input truncated to fit model sequence length"
            );
            &words[..max_words]
        } else {
            &words[..]
        };
        let num_words = words.len();

        // Prompt = label section, <<SEP>>, then the text words.
        let mut prompt_tokens: Vec<String> =
            Vec::with_capacity(label_names.len() * 2 + 1 + num_words);
        for label in &label_names {
            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
            prompt_tokens.push((*label).to_string());
        }
        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
        for word in words {
            prompt_tokens.push((*word).to_string());
        }

        let mut all_ids: Vec<i64> = Vec::new();
        let mut all_attention: Vec<i64> = Vec::new();
        let mut all_word_mask: Vec<i64> = Vec::new();

        // Leading special token id 1 (presumably this tokenizer's BOS/<s> —
        // TODO confirm against tokenizer.json).
        all_ids.push(1);
        all_attention.push(1);
        all_word_mask.push(0);

        // Positions at or beyond this index in `prompt_tokens` are text words.
        let text_offset = label_names.len() * 2 + 1;
        let mut word_id: i64 = 0;

        // Encode token-by-token so the first subtoken of each text word can be
        // tagged with its 1-based word id in words_mask (0 = not a word start).
        for (pos, token_str) in prompt_tokens.iter().enumerate() {
            let encoding = self
                .tokenizer
                .encode(token_str.as_str(), false)
                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
            let ids = encoding.get_ids();
            let is_text_token = pos >= text_offset;

            for (sub_idx, &id) in ids.iter().enumerate() {
                all_ids.push(id as i64);
                all_attention.push(1);
                if is_text_token && sub_idx == 0 {
                    word_id += 1;
                    all_word_mask.push(word_id);
                } else {
                    all_word_mask.push(0);
                }
            }
        }

        // Trailing special token id 2 (presumably EOS/</s> — TODO confirm).
        all_ids.push(2);
        all_attention.push(1);
        all_word_mask.push(0);

        let seq_len = all_ids.len();

        // All tensors use a fixed batch size of 1.
        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
        let t_words_mask =
            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
        let t_text_lengths =
            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;

        // Candidate spans: (start, start+width) for every width < GLINER_MAX_WIDTH.
        // Spans running past the text stay zeroed and are masked out.
        let num_spans = num_words * GLINER_MAX_WIDTH;
        let mut span_idx_data = vec![0i64; num_spans * 2];
        let mut span_mask_data = vec![false; num_spans];

        for start in 0..num_words {
            let remaining = num_words - start;
            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
            for width in 0..actual_max_width {
                let dim = start * GLINER_MAX_WIDTH + width;
                span_idx_data[dim * 2] = start as i64;
                span_idx_data[dim * 2 + 1] = (start + width) as i64;
                span_mask_data[dim] = true;
            }
        }

        let t_span_idx =
            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
        let t_span_mask =
            ort::value::Tensor::<bool>::from_array(([1usize, num_spans], span_mask_data))
                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;

        // run() needs exclusive access; the model is shared through a static.
        let mut session_guard = self
            .session
            .lock()
            .map_err(|_| anyhow::anyhow!("GLiNER session mutex poisoned"))?;
        let outputs = session_guard
            .run(ort::inputs![
                "input_ids" => t_input_ids,
                "attention_mask" => t_attention,
                "words_mask" => t_words_mask,
                "text_lengths" => t_text_lengths,
                "span_idx" => t_span_idx,
                "span_mask" => t_span_mask
            ])
            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;

        // Logits are indexed below as [batch=1, word_start, width, class];
        // dims 2 and 3 are read back from the shape with fallbacks.
        let (logits_shape, logits_data) = outputs["logits"]
            .try_extract_tensor::<f32>()
            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;

        let num_classes = label_names.len();
        let max_width = logits_shape
            .get(2)
            .copied()
            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;

        // (start_word, end_word, class_idx, sigmoid score) for spans above threshold.
        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();

        for start in 0..num_words {
            for width in 0..max_width {
                let end = start + width;
                if end >= num_words {
                    break;
                }
                for class_idx in 0..nc.min(num_classes) {
                    // Flattened index into the batch-1 logits buffer.
                    let flat = start * (max_width * nc) + width * nc + class_idx;
                    if flat >= logits_data.len() {
                        break;
                    }
                    let raw = logits_data[flat];
                    // Sigmoid: GLiNER scores each (span, label) pair independently.
                    let score = 1.0 / (1.0 + (-raw).exp());
                    if score >= threshold {
                        candidates.push((start, end, class_idx, score));
                    }
                }
            }
        }

        // Greedy non-overlap selection: highest score first, each word used once.
        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));

        let mut used = vec![false; num_words];
        let mut entities: Vec<ExtractedEntity> = Vec::new();

        for (start, end, class_idx, _score) in &candidates {
            let overlap = (*start..=*end).any(|i| used[i]);
            if overlap {
                continue;
            }
            for flag in used.iter_mut().take(*end + 1).skip(*start) {
                *flag = true;
            }
            let text = words[*start..=*end].join(" ");
            if text.len() < MIN_ENTITY_CHARS {
                continue;
            }
            let entity_type = entity_labels[*class_idx].1;
            entities.push(ExtractedEntity {
                name: text,
                entity_type,
            });
            if entities.len() >= MAX_ENTS {
                break;
            }
        }

        Ok(entities)
    }
}
617
/// Process-wide GLiNER model cache; `None` records a failed load so it is
/// never retried (OnceLock initializes at most once).
static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
619
620fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
621 paths.models.join(format!("gliner-multi-v2.1/{variant}"))
622}
623
/// Ensures the ONNX weights and tokenizer for `variant` exist in the local
/// cache, downloading any missing file from HF Hub. Returns the cache dir.
fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
    let dir = gliner_model_dir(paths, variant);
    std::fs::create_dir_all(&dir)
        .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;

    let model_file = dir.join(variant.as_filename());
    let tokenizer_file = dir.join("tokenizer.json");

    // Fast path: both files already cached.
    if model_file.exists() && tokenizer_file.exists() {
        return Ok(dir);
    }

    let repo = crate::constants::gliner_model_repo();
    tracing::info!(
        "Downloading GLiNER model ({variant}, ~{})...",
        variant.display_size()
    );
    // Progress message in both locales (English, Portuguese).
    crate::output::emit_progress_i18n(
        &format!(
            "Downloading GLiNER model ({variant}, ~{})...",
            variant.display_size()
        ),
        &format!(
            "Baixando modelo GLiNER ({variant}, ~{})...",
            variant.display_size()
        ),
    );

    let api = huggingface_hub::api::sync::Api::new().context("creating HF Hub client")?;
    let hf_repo = api.model(repo);

    // Each file is fetched into the HF cache first, then copied into our dir;
    // existing files are skipped so a partial cache is completed, not redone.
    let remote_model = format!("onnx/{}", variant.as_filename());
    if !model_file.exists() {
        let src = hf_repo
            .get(&remote_model)
            .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
        std::fs::copy(&src, &model_file)
            .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
    }

    if !tokenizer_file.exists() {
        let src = hf_repo
            .get("tokenizer.json")
            .context("downloading tokenizer.json from HF Hub")?;
        std::fs::copy(&src, &tokenizer_file).context("copying tokenizer.json to cache")?;
    }

    Ok(dir)
}
673
674fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
675 let dir = ensure_gliner_model_files(paths, variant)?;
676 GlinerModel::load(&dir, variant)
677}
678
679fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
680 GLINER_MODEL
681 .get_or_init(|| match load_gliner_model(paths, variant) {
682 Ok(m) => Some(m),
683 Err(e) => {
684 tracing::warn!("GLiNER model unavailable (graceful degradation): {e:#}");
685 None
686 }
687 })
688 .as_ref()
689}
690
691fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
692 let mut entities = Vec::with_capacity(16);
693 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
694
695 let add = |entities: &mut Vec<ExtractedEntity>,
696 seen: &mut std::collections::HashSet<String>,
697 name: &str,
698 entity_type: EntityType| {
699 let name = name.trim().to_string();
700 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
701 entities.push(ExtractedEntity { name, entity_type });
702 }
703 };
704
705 let cleaned = regex_section_marker().replace_all(body, " ");
708 let cleaned = cleaned.as_ref();
709
710 for m in regex_email().find_iter(cleaned) {
711 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
713 }
714 for m in regex_uuid().find_iter(cleaned) {
715 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
716 }
717 for m in regex_all_caps().find_iter(cleaned) {
718 let candidate = m.as_str();
719 if !is_filtered_all_caps(candidate) {
721 add(&mut entities, &mut seen, candidate, EntityType::Concept);
722 }
723 }
724 for m in regex_brand_camel().find_iter(cleaned) {
727 let name = m.as_str();
728 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
730 add(&mut entities, &mut seen, name, EntityType::Organization);
731 }
732 }
733
734 entities
735}
736
737pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
741 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
742 let mut result = Vec::with_capacity(4);
743 for m in regex_url().find_iter(body) {
744 let raw = m.as_str();
745 let cleaned = raw
746 .trim_end_matches('`')
747 .trim_end_matches(',')
748 .trim_end_matches('.')
749 .trim_end_matches(';')
750 .trim_end_matches(')')
751 .trim_end_matches(']')
752 .trim_end_matches('}');
753 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
754 result.push(ExtractedUrl {
755 url: cleaned.to_string(),
756 offset: m.start(),
757 });
758 }
759 }
760 result
761}
762
#[cfg(test)]
/// Index-based pairing helper (test-only): links each entity with up to
/// TOP_K_RELATIONS entities that follow it, capped globally at
/// `max_relationships_per_memory`. Returns the edges and whether the cap hit.
fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
    if entities.len() < 2 {
        return (Vec::new(), false);
    }

    let max_rels = crate::constants::max_relationships_per_memory();
    let n = entities.len().min(MAX_ENTS);
    let mut out: Vec<NewRelationship> = Vec::new();
    let mut seen_pairs: std::collections::HashSet<(usize, usize)> =
        std::collections::HashSet::new();
    let mut hit_cap = false;

    'outer: for src in 0..n {
        if out.len() >= max_rels {
            hit_cap = true;
            break;
        }

        let mut emitted_for_src = 0usize;
        for dst in (src + 1)..n {
            if emitted_for_src >= TOP_K_RELATIONS {
                break;
            }
            if out.len() >= max_rels {
                hit_cap = true;
                break 'outer;
            }

            let pair = (src.min(dst), src.max(dst));
            if !seen_pairs.insert(pair) {
                continue;
            }

            out.push(NewRelationship {
                source: entities[src].name.clone(),
                target: entities[dst].name.clone(),
                relation: DEFAULT_RELATION.to_string(),
                strength: 0.5,
                description: None,
            });
            emitted_for_src += 1;
        }
    }

    if hit_cap {
        tracing::warn!(
            "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
            n.saturating_sub(1)
        );
    }

    (out, hit_cap)
}
829
/// Builds "mentions" edges between entities whose lowercased names co-occur
/// in the same sentence of `body`. Sentences are split on '.', '!', '?' and
/// newlines; pairs are deduplicated globally by entity index; output is
/// capped at `max_relationships_per_memory`, returning early with the
/// truncation flag set when the cap is reached.
fn build_relationships_by_sentence_cooccurrence(
    body: &str,
    entities: &[NewEntity],
) -> (Vec<NewRelationship>, bool) {
    if entities.len() < 2 {
        return (Vec::new(), false);
    }

    let max_rels = crate::constants::max_relationships_per_memory();
    // Only the first MAX_ENTS entities participate; names lowercased once.
    let lower_names: Vec<(usize, String)> = entities
        .iter()
        .take(MAX_ENTS)
        .enumerate()
        .map(|(i, e)| (i, e.name.to_lowercase()))
        .collect();

    let mut rels: Vec<NewRelationship> = Vec::new();
    let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
    let mut hit_cap = false;

    for sentence in body.split(['.', '!', '?', '\n']) {
        if sentence.trim().is_empty() {
            continue;
        }
        let lower_sentence = sentence.to_lowercase();
        // NOTE(review): substring containment, so a short name can match
        // inside a longer word — confirm this is acceptable here.
        let present: Vec<usize> = lower_names
            .iter()
            .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
            .map(|(i, _)| *i)
            .collect();

        if present.len() < 2 {
            continue;
        }

        // Emit every unordered pair of co-occurring entities, once globally.
        for i in 0..present.len() {
            for j in (i + 1)..present.len() {
                if rels.len() >= max_rels {
                    hit_cap = true;
                    tracing::warn!(
                        "relationships truncated to {max_rels} during sentence-level pairing"
                    );
                    return (rels, hit_cap);
                }
                let ei = present[i];
                let ej = present[j];
                let key = (ei.min(ej), ei.max(ej));
                if seen.insert(key) {
                    rels.push(NewRelationship {
                        source: entities[ei].name.clone(),
                        target: entities[ej].name.clone(),
                        relation: DEFAULT_RELATION.to_string(),
                        strength: 0.5,
                        description: None,
                    });
                }
            }
        }
    }

    (rels, hit_cap)
}
902
903fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
910 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
911 let suffix_re = SUFFIX_RE.get_or_init(|| {
914 Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
915 .expect("compile-time validated numeric suffix regex literal")
916 });
917
918 entities
919 .into_iter()
920 .map(|ent| {
921 if let Some(pos) = body.find(&ent.name) {
923 let after_pos = pos + ent.name.len();
924 if after_pos < body.len() {
925 let after = &body[after_pos..];
926 if let Some(m) = suffix_re.find(after) {
927 let suffix = m.as_str();
928 if suffix.len() <= 7 {
931 let mut extended = String::with_capacity(ent.name.len() + suffix.len());
932 extended.push_str(&ent.name);
933 extended.push_str(suffix);
934 return ExtractedEntity {
935 name: extended,
936 entity_type: ent.entity_type,
937 };
938 }
939 }
940 }
941 }
942 ent
943 })
944 .collect()
945}
946
947fn augment_versioned_model_names(
967 entities: Vec<ExtractedEntity>,
968 body: &str,
969) -> Vec<ExtractedEntity> {
970 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
971 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
978 Regex::new(
979 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
980 )
981 .expect("compile-time validated versioned model regex literal")
982 });
983
984 let mut existing_lc: std::collections::HashSet<String> =
985 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
986 let mut result = entities;
987
988 for caps in model_re.captures_iter(body) {
989 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
990 if full_match.is_empty() || full_match.len() > 24 {
993 continue;
994 }
995 let normalized_lc = full_match.to_lowercase();
996 if existing_lc.contains(&normalized_lc) {
997 continue;
998 }
999 if result.len() >= MAX_ENTS {
1002 break;
1003 }
1004 existing_lc.insert(normalized_lc);
1005 result.push(ExtractedEntity {
1006 name: full_match.to_string(),
1007 entity_type: EntityType::Concept,
1008 });
1009 }
1010
1011 result
1012}
1013
1014fn merge_and_deduplicate(
1015 regex_ents: Vec<ExtractedEntity>,
1016 ner_ents: Vec<ExtractedEntity>,
1017) -> Vec<ExtractedEntity> {
1018 let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
1033 let mut result: Vec<ExtractedEntity> = Vec::new();
1034 let mut truncated = false;
1035
1036 let total_input = regex_ents.len() + ner_ents.len();
1037 for ent in regex_ents.into_iter().chain(ner_ents) {
1038 let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
1039 let key = {
1043 let et = ent.entity_type.as_str();
1044 let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
1045 k.push_str(et);
1046 k.push('\0');
1047 k.push_str(&name_lc);
1048 k
1049 };
1050
1051 let type_prefix = {
1056 let et = ent.entity_type.as_str();
1057 let mut p = String::with_capacity(et.len() + 1);
1058 p.push_str(et);
1059 p.push('\0');
1060 p
1061 };
1062 let mut collision_idx: Option<usize> = None;
1063 for (existing_key, idx) in &by_lc {
1064 if !existing_key.starts_with(&type_prefix) {
1066 continue;
1067 }
1068 let existing_name_lc = &existing_key[type_prefix.len()..];
1069 if existing_name_lc == name_lc
1070 || existing_name_lc.contains(name_lc.as_str())
1071 || name_lc.contains(existing_name_lc)
1072 {
1073 collision_idx = Some(*idx);
1074 break;
1075 }
1076 }
1077 match collision_idx {
1078 Some(idx) => {
1079 if ent.name.len() > result[idx].name.len() {
1082 let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
1083 let old_key = {
1084 let et = result[idx].entity_type.as_str();
1085 let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
1086 k.push_str(et);
1087 k.push('\0');
1088 k.push_str(&old_name_lc);
1089 k
1090 };
1091 by_lc.remove(&old_key);
1092 result[idx] = ent;
1093 by_lc.insert(key, idx);
1094 }
1095 }
1096 None => {
1097 by_lc.insert(key, result.len());
1098 result.push(ent);
1099 }
1100 }
1101 if result.len() >= MAX_ENTS {
1102 truncated = true;
1103 break;
1104 }
1105 }
1106
1107 if truncated {
1109 tracing::warn!(
1110 "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
1111 );
1112 }
1113
1114 result
1115}
1116
1117fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1118 extracted
1119 .into_iter()
1120 .map(|e| NewEntity {
1121 name: e.name,
1122 entity_type: e.entity_type,
1123 description: None,
1124 })
1125 .collect()
1126}
1127
1128pub fn extract_graph_auto(
1129 body: &str,
1130 paths: &AppPaths,
1131 variant: GlinerVariant,
1132) -> Result<ExtractionResult> {
1133 let regex_entities = apply_regex_prefilter(body);
1134 let threshold = crate::constants::gliner_confidence_threshold();
1135
1136 let mut gliner_used = false;
1137 let ner_entities = match get_or_init_gliner(paths, variant) {
1138 Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1139 Ok(ents) => {
1140 gliner_used = true;
1141 ents
1142 }
1143 Err(e) => {
1144 tracing::warn!("GLiNER NER failed, falling back to regex-only extraction: {e:#}");
1145 Vec::new()
1146 }
1147 },
1148 None => Vec::new(),
1149 };
1150
1151 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1152 let extended = extend_with_numeric_suffix(merged, body);
1153 let with_models = augment_versioned_model_names(extended, body);
1154 let with_models: Vec<ExtractedEntity> = with_models
1155 .into_iter()
1156 .filter(|e| !regex_section_marker().is_match(&e.name))
1157 .collect();
1158 let entities = to_new_entities(with_models);
1159 let (relationships, relationships_truncated) =
1160 build_relationships_by_sentence_cooccurrence(body, &entities);
1161
1162 let extraction_method = if gliner_used {
1163 format!("gliner-{variant}+regex")
1164 } else {
1165 "regex-only".to_string()
1166 };
1167
1168 let urls = extract_urls(body);
1169
1170 Ok(ExtractionResult {
1171 entities,
1172 relationships,
1173 relationships_truncated,
1174 extraction_method,
1175 urls,
1176 })
1177}
1178
/// Extractor that relies solely on the regex prefilter (no ML model).
pub struct RegexExtractor;
1180
1181impl Extractor for RegexExtractor {
1182 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1183 let regex_entities = apply_regex_prefilter(body);
1184 let entities = to_new_entities(regex_entities);
1185 let (relationships, relationships_truncated) =
1186 build_relationships_by_sentence_cooccurrence(body, &entities);
1187 let urls = extract_urls(body);
1188 Ok(ExtractionResult {
1189 entities,
1190 relationships,
1191 relationships_truncated,
1192 extraction_method: "regex-only".to_string(),
1193 urls,
1194 })
1195 }
1196}
1197
1198#[cfg(test)]
1199mod tests {
1200 use super::*;
1201 use crate::entity_type::EntityType;
1202
1203 fn make_paths() -> AppPaths {
1204 use std::path::PathBuf;
1205 AppPaths {
1206 db: PathBuf::from("/tmp/test.sqlite"),
1207 models: PathBuf::from("/tmp/test_models"),
1208 }
1209 }
1210
1211 #[test]
1212 fn regex_email_captures_address() {
1213 let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1214 assert!(ents
1216 .iter()
1217 .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1218 }
1219
1220 #[test]
1221 fn regex_all_caps_filters_pt_rule_word() {
1222 let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1224 assert!(
1225 !ents.iter().any(|e| e.name == "NUNCA"),
1226 "NUNCA must be filtered as a stopword"
1227 );
1228 assert!(
1229 !ents.iter().any(|e| e.name == "PROIBIDO"),
1230 "PROIBIDO must be filtered"
1231 );
1232 assert!(
1233 !ents.iter().any(|e| e.name == "DEVE"),
1234 "DEVE must be filtered"
1235 );
1236 }
1237
1238 #[test]
1239 fn regex_all_caps_accepts_underscored_constant() {
1240 let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1242 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1243 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1244 }
1245
1246 #[test]
1247 fn regex_all_caps_accepts_domain_acronym() {
1248 let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1250 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1251 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1252 }
1253
1254 #[test]
1255 fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1256 let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1258 assert!(
1259 !ents.iter().any(|e| e.name.starts_with("https://")),
1260 "URLs must not appear as entities after the P0-2 split"
1261 );
1262 }
1263
1264 #[test]
1265 fn extract_urls_captures_https() {
1266 let urls = extract_urls("see https://docs.rs/crate for details");
1267 assert_eq!(urls.len(), 1);
1268 assert_eq!(urls[0].url, "https://docs.rs/crate");
1269 assert!(urls[0].offset > 0);
1270 }
1271
1272 #[test]
1273 fn extract_urls_trim_sufixo_pontuacao() {
1274 let urls = extract_urls("link: https://example.com/path. fim");
1275 assert!(!urls.is_empty());
1276 assert!(
1277 !urls[0].url.ends_with('.'),
1278 "sufixo ponto deve ser removido"
1279 );
1280 }
1281
1282 #[test]
1283 fn extract_urls_dedupes_repeated() {
1284 let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1285 let urls = extract_urls(body);
1286 assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1287 }
1288
1289 #[test]
1290 fn regex_uuid_captura_identificador() {
1291 let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1292 assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
1293 }
1294
1295 #[test]
1296 fn regex_all_caps_captura_constante() {
1297 let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1298 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1299 assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1300 }
1301
1302 #[test]
1303 fn regex_all_caps_ignores_short_words() {
1304 let ents = apply_regex_prefilter("use AI em seu projeto");
1305 assert!(
1306 !ents.iter().any(|e| e.name == "AI"),
1307 "AI tem apenas 2 chars, deve ser ignorado"
1308 );
1309 }
1310
#[test]
fn build_relationships_respeitam_max_rels() {
    // 20 entities generate many candidate pairs; the configured cap must
    // bound the output, and hitting the cap must set the truncated flag.
    let entities: Vec<NewEntity> = (0..20)
        .map(|idx| NewEntity {
            name: format!("entidade_{idx}"),
            entity_type: EntityType::Concept,
            description: None,
        })
        .collect();
    let (rels, truncated) = build_relationships(&entities);
    let max_rels = crate::constants::max_relationships_per_memory();
    assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
    if rels.len() == max_rels {
        assert!(truncated, "truncated deve ser true quando atingiu o cap");
    }
}
1327
#[test]
fn build_relationships_without_duplicates() {
    // Every emitted (source, target) pair must be unique.
    let entities: Vec<NewEntity> = (0..5)
        .map(|idx| NewEntity {
            name: format!("ent_{idx}"),
            entity_type: EntityType::Concept,
            description: None,
        })
        .collect();
    let (rels, _truncated) = build_relationships(&entities);
    let mut seen: std::collections::HashSet<(String, String)> =
        std::collections::HashSet::new();
    for rel in &rels {
        let pair = (rel.source.clone(), rel.target.clone());
        assert!(seen.insert(pair), "par duplicado encontrado");
    }
}
1345
#[test]
fn merge_dedupes_by_lowercase_name() {
    // Case differences alone must not create two entities of the same type.
    let upper = vec![entity("Rust", EntityType::Concept)];
    let lower = vec![entity("rust", EntityType::Concept)];
    let merged = merge_and_deduplicate(upper, lower);
    assert_eq!(
        merged.len(),
        1,
        "rust and Rust with the same type are the same entity"
    );
}
1365
#[test]
fn regex_extractor_implements_trait() {
    // RegexExtractor is callable through the extractor trait and finds at
    // least one entity in a body containing an email and a constant.
    let extracted = RegexExtractor
        .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
        .unwrap();
    assert!(!extracted.entities.is_empty());
}
1374
#[test]
fn extract_returns_ok_without_model() {
    // With no ONNX model available, extraction must still succeed via the
    // regex path and keep the email entity.
    let paths = make_paths();
    let body = "contato: teste@exemplo.com com MAX_RETRY=3";
    let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
    let has_email = result
        .entities
        .iter()
        .any(|e| e.name.contains("teste@exemplo.com"));
    assert!(has_email);
}
1386
#[test]
fn stopwords_filter_v1024_terms() {
    // None of the uppercase stopwords added in v1.0.24 may survive the
    // prefilter as entities.
    let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
                DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
    let ents = apply_regex_prefilter(body);
    let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
    let blocked = [
        "ACEITE", "ACK", "ACL", "BORDA", "CHECKLIST", "COMPLETED", "CONFIRME",
        "DEVEMOS", "DONE", "FIXED", "NEGUE", "PENDING", "PLAN", "PODEMOS",
        "RECUSE", "TOKEN", "VAMOS",
    ];
    for word in &blocked {
        assert!(
            !names.contains(word),
            "v1.0.24 stopword {word} should be filtered but was found in entities"
        );
    }
}
1420
#[test]
fn dedup_normalizes_unicode_combining_marks() {
    // "Café" precomposed (NFC, U+00E9) vs decomposed (NFD, e + U+0301) must
    // merge into a single entity after normalization.
    let precomposed = vec![entity("Caf\u{e9}", EntityType::Concept)];
    let decomposed = vec![entity("Cafe\u{301}", EntityType::Concept)];
    let merged = merge_and_deduplicate(precomposed, decomposed);
    assert_eq!(
        merged.len(),
        1,
        "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
    );
}
1443
#[test]
fn extraction_method_regex_only_unchanged() {
    // The regex-only extractor must label its results accordingly.
    let extracted = RegexExtractor.extract("contact: dev@acme.io").unwrap();
    assert_eq!(
        extracted.extraction_method, "regex-only",
        "RegexExtractor must return regex-only"
    );
}
1454
#[test]
fn extend_suffix_pure_numeric_unchanged() {
    // "GPT" followed by "-5" in the body extends the entity to "GPT-5".
    let seed = vec![entity("GPT", EntityType::Concept)];
    let extended = extend_with_numeric_suffix(seed, "using GPT-5 in the project");
    assert_eq!(
        extended[0].name, "GPT-5",
        "purely numeric suffix must be extended"
    );
}
1470
#[test]
fn extend_suffix_alphanumeric_letter_after_digit() {
    // A digit-then-letter suffix such as "4o" still counts as a version tag.
    let seed = vec![entity("GPT", EntityType::Concept)];
    let extended = extend_with_numeric_suffix(seed, "using GPT-4o for advanced tasks");
    assert_eq!(extended[0].name, "GPT-4o", "suffix '4o' must be accepted");
}
1481
#[test]
fn extend_suffix_alphanumeric_b_suffix() {
    // Parameter-count style suffixes like "5b" must be accepted.
    let seed = vec![entity("Llama", EntityType::Concept)];
    let extended = extend_with_numeric_suffix(seed, "Llama-5b open-weight model");
    assert_eq!(extended[0].name, "Llama-5b", "suffix '5b' must be accepted");
}
1492
#[test]
fn extend_suffix_alphanumeric_x_suffix() {
    // Mixture-style suffixes like "8x" must be accepted.
    let seed = vec![entity("Mistral", EntityType::Concept)];
    let extended = extend_with_numeric_suffix(seed, "testing Mistral-8x in production");
    assert_eq!(extended[0].name, "Mistral-8x", "suffix '8x' must be accepted");
}
1503
#[test]
fn augment_versioned_gpt4o() {
    // Starting from no entities, the versioned-model pass must find GPT-4o.
    let augmented = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
    assert!(
        augmented.iter().any(|e| e.name == "GPT-4o"),
        "GPT-4o must be captured by augment, found: {:?}",
        augmented.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1516
#[test]
fn augment_versioned_claude_4_sonnet() {
    // Multi-word versioned names ("Claude 4 Sonnet") must be captured whole.
    let augmented =
        augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
    assert!(
        augmented.iter().any(|e| e.name == "Claude 4 Sonnet"),
        "Claude 4 Sonnet must be captured, found: {:?}",
        augmented.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1528
#[test]
fn augment_versioned_llama_3_pro() {
    // Name + version + tier ("Llama 3 Pro") must be captured as one entity.
    let augmented =
        augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
    assert!(
        augmented.iter().any(|e| e.name == "Llama 3 Pro"),
        "Llama 3 Pro deve ser capturado, achados: {:?}",
        augmented.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1540
#[test]
fn augment_versioned_mixtral_8x7b() {
    // Compound version tokens like "8x7B" must be captured with the name.
    let augmented =
        augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
    assert!(
        augmented.iter().any(|e| e.name == "Mixtral 8x7B"),
        "Mixtral 8x7B deve ser capturado, achados: {:?}",
        augmented.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1552
#[test]
fn augment_versioned_does_not_duplicate_existing() {
    // An already-present versioned name must not be added a second time.
    let existing = vec![entity("Claude 4", EntityType::Concept)];
    let augmented = augment_versioned_model_names(existing, "using Claude 4 in the project");
    let occurrences = augmented.iter().filter(|e| e.name == "Claude 4").count();
    assert_eq!(occurrences, 1, "Claude 4 must not be duplicated");
}
1564
#[test]
fn stopwords_filter_url_jwt_api_v1025() {
    // Common tech acronyms added to the v1.0.25 stoplist must be filtered.
    let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
    let ents = apply_regex_prefilter(body);
    let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
    let stoplist = ["URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI"];
    for blocked in &stoplist {
        assert!(
            !names.contains(blocked),
            "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
        );
    }
}
1582
#[test]
fn section_markers_etapa_fase_filtered_v1025() {
    // Portuguese section headers ("Etapa N", "Fase N") are structure, not
    // entities, and must be stripped by the prefilter.
    let ents =
        apply_regex_prefilter("Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.");
    let leaked = ents
        .iter()
        .any(|e| e.name.contains("Etapa") || e.name.contains("Fase"));
    assert!(
        !leaked,
        "section markers must be stripped; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1600
#[test]
fn section_markers_passo_secao_filtered_v1025() {
    // "Passo N" / "Seção N" section markers must also be stripped.
    let ents = apply_regex_prefilter("Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.");
    let leaked = ents
        .iter()
        .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o"));
    assert!(
        !leaked,
        "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
}
1615
#[test]
fn brand_camelcase_extracted_as_organization_v1025() {
    // CamelCase brand names are picked up by the brand regex and typed as
    // EntityType::Organization (V008).
    let ents = apply_regex_prefilter("OpenAI launched GPT-4 and PostgreSQL added pgvector.");
    let found = ents.iter().find(|e| e.name == "OpenAI");
    assert!(
        found.is_some(),
        "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
        ents.iter().map(|e| &e.name).collect::<Vec<_>>()
    );
    assert_eq!(
        found.unwrap().entity_type,
        EntityType::Organization,
        "brand CamelCase must map to organization (V008)"
    );
}
1635
#[test]
fn brand_postgresql_extracted_as_organization_v1025() {
    // Brands with interior capitals ("PostgreSQL") must be captured and
    // typed as organization.
    let ents = apply_regex_prefilter("migrating from MySQL to PostgreSQL for better performance.");
    let matched = ents
        .iter()
        .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization);
    assert!(
        matched,
        "PostgreSQL must be extracted as organization; entities: {:?}",
        ents.iter()
            .map(|e| (&e.name, &e.entity_type))
            .collect::<Vec<_>>()
    );
}
1649
1650 fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1653 ExtractedEntity {
1654 name: name.to_string(),
1655 entity_type,
1656 }
1657 }
1658
#[test]
fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
    // When one name is a prefix of the other, the merge keeps the longest.
    let result = merge_and_deduplicate(
        vec![entity("Sonne", EntityType::Concept)],
        vec![entity("Sonnet", EntityType::Concept)],
    );
    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "Sonnet");
}
1668
#[test]
fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
    // Containment dedup within a single input list also keeps the longest.
    let result = merge_and_deduplicate(
        vec![
            entity("Open", EntityType::Organization),
            entity("OpenAI", EntityType::Organization),
        ],
        vec![],
    );
    assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
    assert_eq!(result[0].name, "OpenAI");
}
1680
#[test]
fn merge_keeps_both_when_no_containment_v1025() {
    // Unrelated names must both survive the merge.
    let result = merge_and_deduplicate(
        vec![
            entity("Alice", EntityType::Person),
            entity("Bob", EntityType::Person),
        ],
        vec![],
    );
    assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
}
1691
#[test]
fn merge_respects_entity_type_boundary_v1025() {
    // Identical names carrying different types stay distinct entities.
    let result = merge_and_deduplicate(
        vec![
            entity("Apple", EntityType::Organization),
            entity("Apple", EntityType::Concept),
        ],
        vec![],
    );
    assert_eq!(
        result.len(),
        2,
        "expected 2 entities (different types), got: {result:?}"
    );
}
1706
#[test]
fn merge_case_insensitive_dedup_v1025() {
    // Same name, same type, different casing: one entity after the merge.
    let result = merge_and_deduplicate(
        vec![
            entity("OpenAI", EntityType::Organization),
            entity("openai", EntityType::Organization),
        ],
        vec![],
    );
    assert_eq!(
        result.len(),
        1,
        "expected 1 entity after case-insensitive dedup, got: {result:?}"
    );
}
1721
#[test]
fn extract_graph_auto_handles_large_body_under_30s() {
    // 80 KB of filler ("x " x 40_000) must complete well inside the 30 s
    // budget — the extraction input cap is expected to keep this fast.
    let body = "x ".repeat(40_000);
    let paths = make_paths();
    let started = std::time::Instant::now();
    let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
        .expect("extraction must not error");
    let elapsed = started.elapsed();
    assert!(
        elapsed.as_secs() < 30,
        "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
        elapsed.as_secs()
    );
    let _ = result.entities;
}
1742
#[test]
fn pt_uppercase_stopwords_filtered_v1031() {
    // v1.0.31 (A11): Portuguese uppercase prose words must be stoplisted so
    // they never leak out of the ALL-CAPS regex as entities.
    let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
        SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
    let ents = apply_regex_prefilter(body);
    // Uppercase the extracted names once up front so each stopword check is
    // case-insensitive without re-normalizing per comparison.
    let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
    for stop in &[
        "ADAPTER",
        "PROJETO",
        "PASSIVA",
        "SOMENTE",
        "LEITURA",
        "REGRA",
        "OBRIGATORIA",
        "EXEMPLO",
        "DEFAULT",
    ] {
        // `iter().any` compares &String against the &str stopword directly
        // instead of allocating a fresh String per check (clippy::cmp_owned).
        assert!(
            !names.iter().any(|n| n == stop),
            "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
        );
    }
}
1768
#[test]
fn pt_underscored_identifier_preserved_v1031() {
    // Underscored identifiers must survive entity extraction intact.
    let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
    let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
    for expected in ["FLOWAIPER_API_KEY", "MAX_TIMEOUT"] {
        assert!(names.contains(&expected));
    }
}
1778
#[test]
fn build_relationships_by_sentence_only_links_co_occurring_entities() {
    // Only entities sharing a sentence get linked: Alice/Bob co-occur in the
    // first sentence; Carol sits alone in the second.
    let body = "Alice met Bob at the conference. Carol works alone in another room.";
    let person = |name: &str| NewEntity {
        name: name.to_string(),
        entity_type: EntityType::Person,
        description: None,
    };
    let entities = vec![person("Alice"), person("Bob"), person("Carol")];
    let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
    assert!(!truncated);
    assert_eq!(
        rels.len(),
        1,
        "only Alice/Bob should pair (same sentence); Carol is isolated"
    );
    let pair = (rels[0].source.as_str(), rels[0].target.as_str());
    assert!(
        matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
        "unexpected pair {pair:?}"
    );
}
1814
#[test]
fn build_relationships_by_sentence_returns_empty_for_single_entity() {
    // A lone entity can never form a pair, so no relationships come back.
    let entities = vec![NewEntity {
        name: "Alice".to_string(),
        entity_type: EntityType::Person,
        description: None,
    }];
    let (rels, truncated) =
        build_relationships_by_sentence_cooccurrence("Alice is here.", &entities);
    assert!(rels.is_empty());
    assert!(!truncated);
}
1827
#[test]
fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
    // A pair co-occurring in two sentences is still a single relationship.
    let person = |name: &str| NewEntity {
        name: name.to_string(),
        entity_type: EntityType::Person,
        description: None,
    };
    let entities = vec![person("Alice"), person("Bob")];
    let (rels, _) = build_relationships_by_sentence_cooccurrence(
        "Alice met Bob. Bob saw Alice again.",
        &entities,
    );
    assert_eq!(
        rels.len(),
        1,
        "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
    );
}
1850
#[test]
fn extraction_max_tokens_default_is_5000() {
    // NOTE(review): mutates process-wide env state — assumes tests touching
    // this variable are not run concurrently; confirm if this proves flaky.
    std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
    assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
}
1856
#[test]
fn extraction_max_tokens_env_override_clamped() {
    // Values outside [512, 100_000] are rejected in favour of the default;
    // in-range values are honoured.
    const VAR: &str = "SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS";

    // RAII guard: the original cleanup ran as the last statement, so any
    // failing assertion above it leaked the override into every later test
    // in this process. Dropping the guard removes the var even on panic.
    struct EnvCleanup;
    impl Drop for EnvCleanup {
        fn drop(&mut self) {
            std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
        }
    }
    let _cleanup = EnvCleanup;

    // Below the 512 floor: rejected.
    std::env::set_var(VAR, "200");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        5_000,
        "value below 512 must fall back to default"
    );

    // Above the 100_000 ceiling: rejected.
    std::env::set_var(VAR, "200000");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        5_000,
        "value above 100_000 must fall back to default"
    );

    // In range: used as-is.
    std::env::set_var(VAR, "8000");
    assert_eq!(
        crate::constants::extraction_max_tokens(),
        8_000,
        "valid value must be honoured"
    );
}
1882
#[test]
fn gliner_variant_from_str_valid() {
    // Every known variant string parses, and parsing is case-insensitive.
    let cases = [
        ("fp32", GlinerVariant::Fp32),
        ("fp16", GlinerVariant::Fp16),
        ("int8", GlinerVariant::Int8),
        ("q4", GlinerVariant::Q4),
        ("q4f16", GlinerVariant::Q4f16),
        ("FP32", GlinerVariant::Fp32),
        ("INT8", GlinerVariant::Int8),
    ];
    for (input, expected) in cases {
        assert_eq!(input.parse::<GlinerVariant>().unwrap(), expected);
    }
}
1912
#[test]
fn gliner_variant_from_str_invalid() {
    // Unknown and empty strings must fail to parse.
    for bad in ["invalid", "fp64", ""] {
        assert!(bad.parse::<GlinerVariant>().is_err());
    }
}
1919
#[test]
fn gliner_variant_filename_mapping() {
    // Each variant maps to its expected ONNX artifact filename.
    let cases = [
        (GlinerVariant::Fp32, "model.onnx"),
        (GlinerVariant::Fp16, "model_fp16.onnx"),
        (GlinerVariant::Int8, "model_quantized.onnx"),
        (GlinerVariant::Q4, "model_q4.onnx"),
        (GlinerVariant::Q4f16, "model_q4f16.onnx"),
    ];
    for (variant, filename) in cases {
        assert_eq!(variant.as_filename(), filename);
    }
}
1928
#[test]
fn gliner_variant_display() {
    // Display renders the lowercase spelling used by FromStr.
    let cases = [
        (GlinerVariant::Fp32, "fp32"),
        (GlinerVariant::Fp16, "fp16"),
        (GlinerVariant::Int8, "int8"),
        (GlinerVariant::Q4, "q4"),
        (GlinerVariant::Q4f16, "q4f16"),
    ];
    for (variant, rendered) in cases {
        assert_eq!(format!("{variant}"), rendered);
    }
}
1937
#[test]
fn gliner_variant_display_size() {
    // Spot-check the human-readable download sizes for two variants.
    let cases = [
        (GlinerVariant::Fp32, "1.1 GB"),
        (GlinerVariant::Int8, "349 MB"),
    ];
    for (variant, size) in cases {
        assert_eq!(variant.display_size(), size);
    }
}
1943
#[test]
fn gliner_entity_labels_covers_all_types() {
    // Every EntityType variant must be reachable from a GLiNER label, and
    // the label table must hold exactly 13 entries.
    let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
    let required = [
        EntityType::Person,
        EntityType::Organization,
        EntityType::Location,
        EntityType::Date,
        EntityType::Project,
        EntityType::Tool,
        EntityType::File,
        EntityType::Concept,
        EntityType::Decision,
        EntityType::Incident,
        EntityType::Dashboard,
        EntityType::IssueTracker,
        EntityType::Memory,
    ];
    for ty in &required {
        assert!(label_types.contains(ty));
    }
    assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
}
1962
#[test]
fn gliner_entity_labels_no_duplicates() {
    // `HashSet::insert` returns false on a repeat, flagging any duplicate
    // label string in the table.
    let mut unique = std::collections::HashSet::new();
    for (label, _) in GLINER_ENTITY_LABELS {
        assert!(unique.insert(*label), "duplicate label: {label}");
    }
}
1970
#[test]
fn extract_graph_auto_regex_only_fallback() {
    // Extraction must succeed even when the fp32 model is unavailable; the
    // method string is regex-only, or gliner-* when a model is present.
    let result = extract_graph_auto(
        "Contact someone@test.com about OPENAI project",
        &make_paths(),
        GlinerVariant::Fp32,
    );
    assert!(result.is_ok());
    let res = result.unwrap();
    assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
    let method_ok =
        res.extraction_method == "regex-only" || res.extraction_method.starts_with("gliner-");
    assert!(
        method_ok,
        "unexpected extraction_method: {}",
        res.extraction_method
    );
}
1993
#[test]
fn gliner_variant_roundtrip() {
    // Display followed by FromStr must return the original variant.
    let variants = [
        GlinerVariant::Fp32,
        GlinerVariant::Fp16,
        GlinerVariant::Int8,
        GlinerVariant::Q4,
        GlinerVariant::Q4f16,
    ];
    for variant in &variants {
        let rendered = format!("{variant}");
        let parsed: GlinerVariant = rendered.parse().unwrap();
        assert_eq!(*variant, parsed);
    }
}
2008}