1use std::path::{Path, PathBuf};
7use std::sync::OnceLock;
8
9use anyhow::{Context, Result};
10use ort::session::{builder::GraphOptimizationLevel, Session};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use unicode_normalization::UnicodeNormalization;
14
15use crate::entity_type::EntityType;
16use crate::paths::AppPaths;
17use crate::storage::entities::{NewEntity, NewRelationship};
18
/// Hard cap on entities kept per memory; merging/augmentation truncate beyond this.
const MAX_ENTS: usize = 30;
/// Max relationships emitted per source entity in the test-only pairwise builder.
#[cfg(test)]
const TOP_K_RELATIONS: usize = 5;
/// Relation label used for all co-occurrence edges.
const DEFAULT_RELATION: &str = "mentions";
/// Candidate names shorter than this (in bytes) are discarded.
const MIN_ENTITY_CHARS: usize = 2;

// Process-wide regexes, compiled lazily on first use via the accessors below.
static REGEX_EMAIL: OnceLock<Regex> = OnceLock::new();
static REGEX_URL: OnceLock<Regex> = OnceLock::new();
static REGEX_UUID: OnceLock<Regex> = OnceLock::new();
static REGEX_ALL_CAPS: OnceLock<Regex> = OnceLock::new();
static REGEX_SECTION_MARKER: OnceLock<Regex> = OnceLock::new();
static REGEX_BRAND_CAMEL: OnceLock<Regex> = OnceLock::new();
35
/// All-caps tokens that must NOT be treated as entities: mixed English and
/// Portuguese rule words ("NUNCA", "SEMPRE"), common tech acronyms that carry
/// no entity value in this domain ("API", "JSON"), and status/markup words
/// ("TODO", "DONE"). Checked by `is_filtered_all_caps`.
const ALL_CAPS_STOPWORDS: &[&str] = &[
    "ACEITE",
    "ACID",
    "ACK",
    "ACL",
    "ACRESCENTADO",
    "ADAPTER",
    "ADICIONADA",
    "ADICIONADAS",
    "ADICIONADO",
    "ADICIONADOS",
    "ADICIONAR",
    "AGENTS",
    "AINDA",
    "ALL",
    "ALTA",
    "ALWAYS",
    "APENAS",
    "API",
    "ARTEFATOS",
    "ATIVA",
    "ATIVO",
    "BAIXA",
    "BANCO",
    "BLOQUEAR",
    "BORDA",
    "BUG",
    "CAPÍTULO",
    "CASO",
    "CEO",
    "CHECKLIST",
    "CLARO",
    "CLAUDE_STREAM_IDLE_TIMEOUT_MS",
    "CLI",
    "COMPLETED",
    "CONFIRMADO",
    "CONFIRMARAM",
    "CONFIRME",
    "CONFIRMEI",
    "CONFIRMOU",
    "CONTRATO",
    "CRIE",
    "CRÍTICO",
    "CRITICAL",
    "CSV",
    "DDL",
    "DEFAULT",
    "DEFINIR",
    "DEPARTMENT",
    "DESC",
    "DEVE",
    "DEVEMOS",
    "DISCO",
    "DONE",
    "DSL",
    "DTO",
    "EFEITO",
    "ENTRADA",
    "EOF",
    "EPERM",
    "ERROR",
    "ESCREVA",
    "ESCRITA",
    "ESRCH",
    "ESSA",
    "ESSE",
    "ESSENCIAL",
    "ESTA",
    "ESTADO",
    "ESTE",
    "ETAPA",
    "EVITAR",
    "EXEMPLO",
    "EXPANDIR",
    "EXPOR",
    "FALHA",
    "FASE",
    "FATO",
    "FIFO",
    "FIXED",
    "FIXME",
    "FLUXO",
    "FONTES",
    "FORBIDDEN",
    "FUNCIONA",
    "GNU",
    "HACK",
    "HEARTBEAT",
    "HTTP",
    "HTTPS",
    "INATIVO",
    "JAMAIS",
    "JSON",
    "JWT",
    "LEITURA",
    "LLM",
    "MCP",
    "MESMO",
    "METADADOS",
    "MUST",
    "NDJSON",
    "NEGUE",
    "NEVER",
    "NOTE",
    "NUNCA",
    "OBRIGATORIA",
    "OBRIGATÓRIO",
    "OBSERVEI",
    "PADRÃO",
    "PASSIVA",
    "PASSO",
    "PENDING",
    "PGID",
    "PID",
    "PLAN",
    "PODEMOS",
    "PONTEIROS",
    "PREFERIR",
    "PROIBIDO",
    "PROJETO",
    "RECUSE",
    "REGRA",
    "REGRAS",
    "REMOVIDAS",
    "REQUIRED",
    "REQUISITO",
    "REST",
    "SEÇÃO",
    "SEMPRE",
    "SHALL",
    "SHOULD",
    "SIGTERM",
    "SOMENTE",
    "SOUL",
    "TODAS",
    "TODO",
    "TODOS",
    "TOKEN",
    "TOOLS",
    "TSV",
    "TUI",
    "UI",
    "URL",
    "USAR",
    "VALIDAR",
    "VAMOS",
    "VOCÊ",
    "WARNING",
    "XML",
    "YAML",
];
204
/// HTTP verbs, filtered from all-caps candidates because they appear in API
/// prose ("GET /users") rather than naming an entity.
const HTTP_METHODS: &[&str] = &[
    "GET", "POST", "PUT", "DELETE", "PATCH", "HEAD", "OPTIONS", "CONNECT", "TRACE",
];
210
211fn is_filtered_all_caps(token: &str) -> bool {
212 let is_identifier = token.contains('_');
214 if is_identifier {
215 return false;
216 }
217 ALL_CAPS_STOPWORDS.contains(&token) || HTTP_METHODS.contains(&token)
218}
219
220fn regex_email() -> &'static Regex {
221 REGEX_EMAIL.get_or_init(|| {
223 Regex::new(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}")
224 .expect("compile-time validated email regex literal")
225 })
226}
227
228fn regex_url() -> &'static Regex {
229 REGEX_URL.get_or_init(|| {
231 Regex::new(r#"https?://[^\s\)\]\}"'<>]+"#)
232 .expect("compile-time validated URL regex literal")
233 })
234}
235
236fn regex_uuid() -> &'static Regex {
237 REGEX_UUID.get_or_init(|| {
239 Regex::new(r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}")
240 .expect("compile-time validated UUID regex literal")
241 })
242}
243
244fn regex_all_caps() -> &'static Regex {
245 REGEX_ALL_CAPS.get_or_init(|| {
246 Regex::new(r"\b[A-Z][A-Z0-9_]{2,}\b")
247 .expect("compile-time validated all-caps regex literal")
248 })
249}
250
251fn regex_section_marker() -> &'static Regex {
252 REGEX_SECTION_MARKER.get_or_init(|| {
253 Regex::new("\\b(?:Etapa|Fase|Passo|Camada|Se\u{00e7}\u{00e3}o|Cap\u{00ed}tulo)\\s+\\d+\\b")
260 .expect("compile-time validated section marker regex literal")
261 })
262}
263
264fn regex_brand_camel() -> &'static Regex {
265 REGEX_BRAND_CAMEL.get_or_init(|| {
266 Regex::new(r"\b[A-Z][a-z]+[A-Z][A-Za-z]+\b")
269 .expect("compile-time validated CamelCase brand regex literal")
270 })
271}
272
/// One entity candidate produced by the regex or NER stage, before conversion
/// into a storage row.
#[derive(Debug, Clone, PartialEq)]
pub struct ExtractedEntity {
    /// Surface form as found in the text (possibly extended with a suffix later).
    pub name: String,
    /// Classified type (person, organization, concept, ...).
    pub entity_type: EntityType,
}
278
/// A URL found in a memory body, tracked separately from entities.
#[derive(Debug, Clone)]
pub struct ExtractedUrl {
    /// URL text with common trailing punctuation stripped.
    pub url: String,
    /// Byte offset of the (untrimmed) match start within the original body.
    pub offset: usize,
}
286
/// Output of a full extraction pass over one memory body.
#[derive(Debug, Clone)]
pub struct ExtractionResult {
    /// Deduplicated entities, capped at `MAX_ENTS`.
    pub entities: Vec<NewEntity>,
    /// Co-occurrence relationships between the extracted entities.
    pub relationships: Vec<NewRelationship>,
    /// True when the relationship list hit the configured cap.
    pub relationships_truncated: bool,
    /// Pipeline identifier, e.g. "gliner-int8+regex" or "regex-only".
    pub extraction_method: String,
    /// URLs found in the body (not modeled as entities).
    pub urls: Vec<ExtractedUrl>,
}
300
/// Strategy interface for extraction backends (model-based or regex-only).
pub trait Extractor: Send + Sync {
    /// Extracts entities, relationships, and URLs from `body`.
    fn extract(&self, body: &str) -> Result<ExtractionResult>;
}
304
/// Quantization variant of the GLiNER ONNX model; trades download size and
/// memory for accuracy.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum GlinerVariant {
    Fp32,
    Fp16,
    Int8,
    Q4,
    Q4f16,
}
314
315impl GlinerVariant {
316 pub fn as_filename(self) -> &'static str {
318 match self {
319 Self::Fp32 => "model.onnx",
320 Self::Fp16 => "model_fp16.onnx",
321 Self::Int8 => "model_quantized.onnx",
322 Self::Q4 => "model_q4.onnx",
323 Self::Q4f16 => "model_q4f16.onnx",
324 }
325 }
326
327 pub fn display_size(self) -> &'static str {
329 match self {
330 Self::Fp32 => "1.1 GB",
331 Self::Fp16 => "580 MB",
332 Self::Int8 => "349 MB",
333 Self::Q4 => "894 MB",
334 Self::Q4f16 => "472 MB",
335 }
336 }
337}
338
339impl std::fmt::Display for GlinerVariant {
340 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
341 match self {
342 Self::Fp32 => f.write_str("fp32"),
343 Self::Fp16 => f.write_str("fp16"),
344 Self::Int8 => f.write_str("int8"),
345 Self::Q4 => f.write_str("q4"),
346 Self::Q4f16 => f.write_str("q4f16"),
347 }
348 }
349}
350
351impl std::str::FromStr for GlinerVariant {
352 type Err = anyhow::Error;
353 fn from_str(s: &str) -> Result<Self> {
354 match s.to_lowercase().as_str() {
355 "fp32" => Ok(Self::Fp32),
356 "fp16" => Ok(Self::Fp16),
357 "int8" => Ok(Self::Int8),
358 "q4" => Ok(Self::Q4),
359 "q4f16" => Ok(Self::Q4f16),
360 other => {
361 anyhow::bail!("unknown GLiNER variant: {other}. Valid: fp32, fp16, int8, q4, q4f16")
362 }
363 }
364 }
365}
366
/// Maximum span width (in words) scored per start position.
const GLINER_MAX_WIDTH: usize = 12;
/// Model sequence-length budget; input words are capped to fit under it.
const GLINER_MAX_SEQ_LEN: usize = 384;
/// Special token that precedes each entity label in the prompt.
const GLINER_ENT_TOKEN: &str = "<<ENT>>";
/// Special token separating the labels from the text words.
const GLINER_SEP_TOKEN: &str = "<<SEP>>";
371
/// Label strings fed to GLiNER, paired with the internal type each maps to.
/// Order matters: the model's class index corresponds to position here.
const GLINER_ENTITY_LABELS: &[(&str, EntityType)] = &[
    ("person", EntityType::Person),
    ("organization", EntityType::Organization),
    ("location", EntityType::Location),
    ("date", EntityType::Date),
    ("project", EntityType::Project),
    ("tool", EntityType::Tool),
    ("file", EntityType::File),
    ("concept", EntityType::Concept),
    ("decision", EntityType::Decision),
    ("incident", EntityType::Incident),
    ("dashboard", EntityType::Dashboard),
    ("issue tracker", EntityType::IssueTracker),
    ("memory", EntityType::Memory),
];
387
/// A loaded GLiNER ONNX session together with its tokenizer.
struct GlinerModel {
    // Mutex because `Session::run` takes `&mut self` while the model is shared.
    session: std::sync::Mutex<Session>,
    tokenizer: tokenizers::Tokenizer,
    #[allow(dead_code)]
    variant: GlinerVariant,
}
394
impl GlinerModel {
    /// Loads the ONNX session and tokenizer for `variant` from `model_dir`.
    ///
    /// # Errors
    /// Fails when the session builder, the ONNX model file, or
    /// `tokenizer.json` cannot be loaded.
    fn load(model_dir: &Path, variant: GlinerVariant) -> Result<Self> {
        let model_path = model_dir.join(variant.as_filename());
        let tokenizer_path = model_dir.join("tokenizer.json");

        let session = Session::builder()
            .map_err(|e| anyhow::anyhow!("creating GLiNER session builder: {e}"))?
            .with_optimization_level(GraphOptimizationLevel::Level3)
            .map_err(|e| anyhow::anyhow!("setting optimization level: {e}"))?
            .commit_from_file(&model_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER ONNX model from {model_path:?}: {e}"))?;

        let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
            .map_err(|e| anyhow::anyhow!("loading GLiNER tokenizer: {e}"))?;

        Ok(Self {
            session: std::sync::Mutex::new(session),
            tokenizer,
            variant,
        })
    }

    /// Runs span-based NER over `body` and returns non-overlapping entities
    /// whose sigmoid score reaches `threshold`.
    ///
    /// Prompt layout (GLiNER convention): `<<ENT>> label` for each entry in
    /// `entity_labels`, then `<<SEP>>`, then the whitespace-split words of
    /// `body` (truncated to fit `GLINER_MAX_SEQ_LEN`).
    fn predict(
        &self,
        body: &str,
        entity_labels: &[(&str, EntityType)],
        threshold: f32,
    ) -> Result<Vec<ExtractedEntity>> {
        let label_names: Vec<&str> = entity_labels.iter().map(|(name, _)| *name).collect();
        let words: Vec<&str> = body.split_whitespace().collect();
        if words.is_empty() {
            return Ok(Vec::new());
        }

        // Budget: 2 prompt tokens per label (<<ENT>> + label) plus one <<SEP>>;
        // reserve 2 more for the start/end specials pushed below.
        let label_token_count = label_names.len() * 2 + 1;
        let max_words = GLINER_MAX_SEQ_LEN.saturating_sub(label_token_count + 2);
        let words = if words.len() > max_words {
            tracing::warn!(
                original_words = words.len(),
                capped_words = max_words,
                "GLiNER input truncated to fit model sequence length"
            );
            &words[..max_words]
        } else {
            &words[..]
        };
        let num_words = words.len();

        // Build the full prompt: [<<ENT>>, label]* , <<SEP>>, word*
        let mut prompt_tokens: Vec<String> =
            Vec::with_capacity(label_names.len() * 2 + 1 + num_words);
        for label in &label_names {
            prompt_tokens.push(GLINER_ENT_TOKEN.to_string());
            prompt_tokens.push((*label).to_string());
        }
        prompt_tokens.push(GLINER_SEP_TOKEN.to_string());
        for word in words {
            prompt_tokens.push((*word).to_string());
        }

        let mut all_ids: Vec<i64> = Vec::new();
        let mut all_attention: Vec<i64> = Vec::new();
        let mut all_word_mask: Vec<i64> = Vec::new();

        // Sequence-start special token (id 1 — assumed BOS for this
        // tokenizer; TODO confirm against tokenizer.json).
        all_ids.push(1);
        all_attention.push(1);
        all_word_mask.push(0);

        // Prompt positions before this offset are labels/specials, not text.
        let text_offset = label_names.len() * 2 + 1;
        let mut word_id: i64 = 0;

        for (pos, token_str) in prompt_tokens.iter().enumerate() {
            let encoding = self
                .tokenizer
                .encode(token_str.as_str(), false)
                .map_err(|e| anyhow::anyhow!("GLiNER tokenizer encode error: {e}"))?;
            let ids = encoding.get_ids();
            let is_text_token = pos >= text_offset;

            for (sub_idx, &id) in ids.iter().enumerate() {
                all_ids.push(id as i64);
                all_attention.push(1);
                // Only the FIRST subtoken of each text word carries a 1-based
                // word id; labels and continuation subtokens are masked with 0.
                if is_text_token && sub_idx == 0 {
                    word_id += 1;
                    all_word_mask.push(word_id);
                } else {
                    all_word_mask.push(0);
                }
            }
        }

        // Sequence-end special token (id 2 — assumed EOS; TODO confirm).
        all_ids.push(2);
        all_attention.push(1);
        all_word_mask.push(0);

        let seq_len = all_ids.len();

        let t_input_ids = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_ids))
            .map_err(|e| anyhow::anyhow!("building input_ids tensor: {e}"))?;
        let t_attention = ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_attention))
            .map_err(|e| anyhow::anyhow!("building attention_mask tensor: {e}"))?;
        let t_words_mask =
            ort::value::Tensor::<i64>::from_array(([1usize, seq_len], all_word_mask))
                .map_err(|e| anyhow::anyhow!("building words_mask tensor: {e}"))?;
        let t_text_lengths =
            ort::value::Tensor::<i64>::from_array(([1usize, 1usize], vec![num_words as i64]))
                .map_err(|e| anyhow::anyhow!("building text_lengths tensor: {e}"))?;

        // Enumerate candidate spans: for each start, spans of width
        // 1..=GLINER_MAX_WIDTH words that stay inside the text; `span_idx`
        // stores inclusive (start, end) word indices, `span_mask` validity.
        let num_spans = num_words * GLINER_MAX_WIDTH;
        let mut span_idx_data = vec![0i64; num_spans * 2];
        let mut span_mask_data = vec![false; num_spans];

        for start in 0..num_words {
            let remaining = num_words - start;
            let actual_max_width = GLINER_MAX_WIDTH.min(remaining);
            for width in 0..actual_max_width {
                let dim = start * GLINER_MAX_WIDTH + width;
                span_idx_data[dim * 2] = start as i64;
                span_idx_data[dim * 2 + 1] = (start + width) as i64;
                span_mask_data[dim] = true;
            }
        }

        let span_mask_i64: Vec<i64> = span_mask_data.iter().map(|&b| b as i64).collect();
        let t_span_idx =
            ort::value::Tensor::<i64>::from_array(([1usize, num_spans, 2usize], span_idx_data))
                .map_err(|e| anyhow::anyhow!("building span_idx tensor: {e}"))?;
        let t_span_mask =
            ort::value::Tensor::<i64>::from_array(([1usize, num_spans], span_mask_i64))
                .map_err(|e| anyhow::anyhow!("building span_mask tensor: {e}"))?;

        // `Session::run` requires &mut; the mutex serializes concurrent callers.
        let mut session_guard = self
            .session
            .lock()
            .map_err(|_| anyhow::anyhow!("GLiNER session mutex poisoned"))?;
        let outputs = session_guard
            .run(ort::inputs![
                "input_ids" => t_input_ids,
                "attention_mask" => t_attention,
                "words_mask" => t_words_mask,
                "text_lengths" => t_text_lengths,
                "span_idx" => t_span_idx,
                "span_mask" => t_span_mask
            ])
            .map_err(|e| anyhow::anyhow!("GLiNER inference forward pass: {e}"))?;

        // Logits are read as [batch, start_word, width, class]; the shape
        // lookups below fall back to configured dims when absent — TODO
        // confirm against the exported model's output signature.
        let (logits_shape, logits_data) = outputs["logits"]
            .try_extract_tensor::<f32>()
            .map_err(|e| anyhow::anyhow!("extracting logits tensor: {e}"))?;

        let num_classes = label_names.len();
        let max_width = logits_shape
            .get(2)
            .copied()
            .unwrap_or(GLINER_MAX_WIDTH as i64) as usize;
        let nc = logits_shape.get(3).copied().unwrap_or(num_classes as i64) as usize;

        // Collect every (start, end, class) whose sigmoid score clears the threshold.
        let mut candidates: Vec<(usize, usize, usize, f32)> = Vec::new();

        for start in 0..num_words {
            for width in 0..max_width {
                let end = start + width;
                if end >= num_words {
                    break;
                }
                for class_idx in 0..nc.min(num_classes) {
                    // Row-major flattening of [start, width, class].
                    let flat = start * (max_width * nc) + width * nc + class_idx;
                    if flat >= logits_data.len() {
                        break;
                    }
                    let raw = logits_data[flat];
                    // Sigmoid: raw logit -> probability in (0, 1).
                    let score = 1.0 / (1.0 + (-raw).exp());
                    if score >= threshold {
                        candidates.push((start, end, class_idx, score));
                    }
                }
            }
        }

        // Greedy non-overlapping decoding: best score first; a candidate is
        // dropped if any of its words was already claimed.
        candidates.sort_by(|a, b| b.3.partial_cmp(&a.3).unwrap_or(std::cmp::Ordering::Equal));

        let mut used = vec![false; num_words];
        let mut entities: Vec<ExtractedEntity> = Vec::new();

        for (start, end, class_idx, _score) in &candidates {
            let overlap = (*start..=*end).any(|i| used[i]);
            if overlap {
                continue;
            }
            for flag in used.iter_mut().take(*end + 1).skip(*start) {
                *flag = true;
            }
            let text = words[*start..=*end].join(" ");
            if text.len() < MIN_ENTITY_CHARS {
                continue;
            }
            let entity_type = entity_labels[*class_idx].1;
            entities.push(ExtractedEntity {
                name: text,
                entity_type,
            });
            if entities.len() >= MAX_ENTS {
                break;
            }
        }

        Ok(entities)
    }
}
619
620static GLINER_MODEL: OnceLock<Option<GlinerModel>> = OnceLock::new();
621
622fn gliner_model_dir(paths: &AppPaths, variant: GlinerVariant) -> PathBuf {
623 paths.models.join(format!("gliner-multi-v2.1/{variant}"))
624}
625
/// Ensures the GLiNER ONNX file and tokenizer exist locally, downloading any
/// missing piece from the Hugging Face Hub into the per-variant cache dir.
/// Returns the directory that contains both files.
///
/// # Errors
/// Fails on directory creation, Hub client construction, download, or
/// file-copy errors.
fn ensure_gliner_model_files(paths: &AppPaths, variant: GlinerVariant) -> Result<PathBuf> {
    let dir = gliner_model_dir(paths, variant);
    std::fs::create_dir_all(&dir)
        .with_context(|| format!("creating GLiNER model directory: {dir:?}"))?;

    let model_file = dir.join(variant.as_filename());
    let tokenizer_file = dir.join("tokenizer.json");

    // Fast path: everything already cached.
    if model_file.exists() && tokenizer_file.exists() {
        return Ok(dir);
    }

    let repo = crate::constants::gliner_model_repo();
    tracing::info!(
        "Downloading GLiNER model ({variant}, ~{})...",
        variant.display_size()
    );
    // User-facing progress, emitted in English and Portuguese.
    crate::output::emit_progress_i18n(
        &format!(
            "Downloading GLiNER model ({variant}, ~{})...",
            variant.display_size()
        ),
        &format!(
            "Baixando modelo GLiNER ({variant}, ~{})...",
            variant.display_size()
        ),
    );

    let api = huggingface_hub::api::sync::Api::new().context("creating HF Hub client")?;
    let hf_repo = api.model(repo);

    // The repository stores ONNX files under an `onnx/` prefix.
    let remote_model = format!("onnx/{}", variant.as_filename());
    if !model_file.exists() {
        let src = hf_repo
            .get(&remote_model)
            .with_context(|| format!("downloading {remote_model} from HF Hub"))?;
        std::fs::copy(&src, &model_file)
            .with_context(|| format!("copying {} to cache", variant.as_filename()))?;
    }

    if !tokenizer_file.exists() {
        let src = hf_repo
            .get("tokenizer.json")
            .context("downloading tokenizer.json from HF Hub")?;
        std::fs::copy(&src, &tokenizer_file).context("copying tokenizer.json to cache")?;
    }

    Ok(dir)
}
675
676fn load_gliner_model(paths: &AppPaths, variant: GlinerVariant) -> Result<GlinerModel> {
677 let dir = ensure_gliner_model_files(paths, variant)?;
678 GlinerModel::load(&dir, variant)
679}
680
681fn get_or_init_gliner(paths: &AppPaths, variant: GlinerVariant) -> Option<&'static GlinerModel> {
682 GLINER_MODEL
683 .get_or_init(|| match load_gliner_model(paths, variant) {
684 Ok(m) => Some(m),
685 Err(e) => {
686 tracing::warn!("GLiNER model unavailable (graceful degradation): {e:#}");
687 None
688 }
689 })
690 .as_ref()
691}
692
693fn apply_regex_prefilter(body: &str) -> Vec<ExtractedEntity> {
694 let mut entities = Vec::with_capacity(16);
695 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
696
697 let add = |entities: &mut Vec<ExtractedEntity>,
698 seen: &mut std::collections::HashSet<String>,
699 name: &str,
700 entity_type: EntityType| {
701 let name = name.trim().to_string();
702 if name.len() >= MIN_ENTITY_CHARS && seen.insert(name.clone()) {
703 entities.push(ExtractedEntity { name, entity_type });
704 }
705 };
706
707 let cleaned = regex_section_marker().replace_all(body, " ");
710 let cleaned = cleaned.as_ref();
711
712 for m in regex_email().find_iter(cleaned) {
713 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
715 }
716 for m in regex_uuid().find_iter(cleaned) {
717 add(&mut entities, &mut seen, m.as_str(), EntityType::Concept);
718 }
719 for m in regex_all_caps().find_iter(cleaned) {
720 let candidate = m.as_str();
721 if !is_filtered_all_caps(candidate) {
723 add(&mut entities, &mut seen, candidate, EntityType::Concept);
724 }
725 }
726 for m in regex_brand_camel().find_iter(cleaned) {
729 let name = m.as_str();
730 if !ALL_CAPS_STOPWORDS.contains(&name.to_uppercase().as_str()) {
732 add(&mut entities, &mut seen, name, EntityType::Organization);
733 }
734 }
735
736 entities
737}
738
739pub fn extract_urls(body: &str) -> Vec<ExtractedUrl> {
743 let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
744 let mut result = Vec::with_capacity(4);
745 for m in regex_url().find_iter(body) {
746 let raw = m.as_str();
747 let cleaned = raw
748 .trim_end_matches('`')
749 .trim_end_matches(',')
750 .trim_end_matches('.')
751 .trim_end_matches(';')
752 .trim_end_matches(')')
753 .trim_end_matches(']')
754 .trim_end_matches('}');
755 if cleaned.len() >= 10 && seen.insert(cleaned.to_string()) {
756 result.push(ExtractedUrl {
757 url: cleaned.to_string(),
758 offset: m.start(),
759 });
760 }
761 }
762 result
763}
764
/// Test-only baseline relationship builder: links each entity to up to
/// `TOP_K_RELATIONS` later entities, pairwise in index order, stopping at the
/// configured global cap. Returns the edges plus whether the cap was hit.
#[cfg(test)]
fn build_relationships(entities: &[NewEntity]) -> (Vec<NewRelationship>, bool) {
    if entities.len() < 2 {
        return (Vec::new(), false);
    }

    let max_rels = crate::constants::max_relationships_per_memory();
    let n = entities.len().min(MAX_ENTS);
    let mut rels: Vec<NewRelationship> = Vec::new();

    let mut hit_cap = false;
    'outer: for i in 0..n {
        if rels.len() >= max_rels {
            hit_cap = true;
            break;
        }

        // Each source entity links to at most TOP_K_RELATIONS later entities.
        let mut for_entity = 0usize;
        for j in (i + 1)..n {
            if for_entity >= TOP_K_RELATIONS {
                break;
            }
            if rels.len() >= max_rels {
                hit_cap = true;
                break 'outer;
            }

            // Each (i, j) pair with i < j is visited exactly once, so no dedup
            // set is needed (the one previously kept here never rejected a pair).
            rels.push(NewRelationship {
                source: entities[i].name.clone(),
                target: entities[j].name.clone(),
                relation: DEFAULT_RELATION.to_string(),
                strength: 0.5,
                description: None,
            });
            for_entity += 1;
        }
    }

    if hit_cap {
        tracing::warn!(
            "relationships truncated to {max_rels} (with {n} entities, theoretical max was ~{}x combinations)",
            n.saturating_sub(1)
        );
    }

    (rels, hit_cap)
}
831
832fn build_relationships_by_sentence_cooccurrence(
843 body: &str,
844 entities: &[NewEntity],
845) -> (Vec<NewRelationship>, bool) {
846 if entities.len() < 2 {
847 return (Vec::new(), false);
848 }
849
850 let max_rels = crate::constants::max_relationships_per_memory();
851 let lower_names: Vec<(usize, String)> = entities
852 .iter()
853 .take(MAX_ENTS)
854 .enumerate()
855 .map(|(i, e)| (i, e.name.to_lowercase()))
856 .collect();
857
858 let mut rels: Vec<NewRelationship> = Vec::new();
859 let mut seen: std::collections::HashSet<(usize, usize)> = std::collections::HashSet::new();
860 let mut hit_cap = false;
861
862 for sentence in body.split(['.', '!', '?', '\n']) {
863 if sentence.trim().is_empty() {
864 continue;
865 }
866 let lower_sentence = sentence.to_lowercase();
867 let present: Vec<usize> = lower_names
868 .iter()
869 .filter(|(_, name)| !name.is_empty() && lower_sentence.contains(name.as_str()))
870 .map(|(i, _)| *i)
871 .collect();
872
873 if present.len() < 2 {
874 continue;
875 }
876
877 for i in 0..present.len() {
878 for j in (i + 1)..present.len() {
879 if rels.len() >= max_rels {
880 hit_cap = true;
881 tracing::warn!(
882 "relationships truncated to {max_rels} during sentence-level pairing"
883 );
884 return (rels, hit_cap);
885 }
886 let ei = present[i];
887 let ej = present[j];
888 let key = (ei.min(ej), ei.max(ej));
889 if seen.insert(key) {
890 rels.push(NewRelationship {
891 source: entities[ei].name.clone(),
892 target: entities[ej].name.clone(),
893 relation: DEFAULT_RELATION.to_string(),
894 strength: 0.5,
895 description: None,
896 });
897 }
898 }
899 }
900 }
901
902 (rels, hit_cap)
903}
904
905fn extend_with_numeric_suffix(entities: Vec<ExtractedEntity>, body: &str) -> Vec<ExtractedEntity> {
912 static SUFFIX_RE: OnceLock<Regex> = OnceLock::new();
913 let suffix_re = SUFFIX_RE.get_or_init(|| {
916 Regex::new(r"^([\-\s]+\d+(?:\.\d+)?[a-z]?)")
917 .expect("compile-time validated numeric suffix regex literal")
918 });
919
920 entities
921 .into_iter()
922 .map(|ent| {
923 if let Some(pos) = body.find(&ent.name) {
925 let after_pos = pos + ent.name.len();
926 if after_pos < body.len() {
927 let after = &body[after_pos..];
928 if let Some(m) = suffix_re.find(after) {
929 let suffix = m.as_str();
930 if suffix.len() <= 7 {
933 let mut extended = String::with_capacity(ent.name.len() + suffix.len());
934 extended.push_str(&ent.name);
935 extended.push_str(suffix);
936 return ExtractedEntity {
937 name: extended,
938 entity_type: ent.entity_type,
939 };
940 }
941 }
942 }
943 }
944 ent
945 })
946 .collect()
947}
948
949fn augment_versioned_model_names(
969 entities: Vec<ExtractedEntity>,
970 body: &str,
971) -> Vec<ExtractedEntity> {
972 static VERSIONED_MODEL_RE: OnceLock<Regex> = OnceLock::new();
973 let model_re = VERSIONED_MODEL_RE.get_or_init(|| {
980 Regex::new(
981 r"\b([A-Z][A-Za-z]{2,15})[\s\-]+(\d+(?:\.\d+)?(?:[a-z]|x\d+[A-Za-z]?)?)(?:\s+(?:Sonnet|Opus|Haiku|Turbo|Pro|Lite|Mini|Nano|Flash|Ultra))?\b",
982 )
983 .expect("compile-time validated versioned model regex literal")
984 });
985
986 let mut existing_lc: std::collections::HashSet<String> =
987 entities.iter().map(|ent| ent.name.to_lowercase()).collect();
988 let mut result = entities;
989
990 for caps in model_re.captures_iter(body) {
991 let full_match = caps.get(0).map(|m| m.as_str()).unwrap_or("");
992 if full_match.is_empty() || full_match.len() > 24 {
995 continue;
996 }
997 let normalized_lc = full_match.to_lowercase();
998 if existing_lc.contains(&normalized_lc) {
999 continue;
1000 }
1001 if result.len() >= MAX_ENTS {
1004 break;
1005 }
1006 existing_lc.insert(normalized_lc);
1007 result.push(ExtractedEntity {
1008 name: full_match.to_string(),
1009 entity_type: EntityType::Concept,
1010 });
1011 }
1012
1013 result
1014}
1015
/// Merges regex and NER candidates, deduplicating per entity type on
/// NFKC-lowercased names, where a "collision" is equality OR substring
/// containment in either direction ("Rust" vs "Rust 1.0"). On collision the
/// longer surface form wins. Output is capped at `MAX_ENTS`.
///
/// Keys in `by_lc` are "type\0name_lc" strings mapping to indices in `result`;
/// the collision scan is O(existing entries) per candidate, acceptable at
/// MAX_ENTS scale.
fn merge_and_deduplicate(
    regex_ents: Vec<ExtractedEntity>,
    ner_ents: Vec<ExtractedEntity>,
) -> Vec<ExtractedEntity> {
    let mut by_lc: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
    let mut result: Vec<ExtractedEntity> = Vec::new();
    let mut truncated = false;

    let total_input = regex_ents.len() + ner_ents.len();
    // Regex candidates are processed first, so they win index order on ties.
    for ent in regex_ents.into_iter().chain(ner_ents) {
        // NFKC-normalize (folds ligatures/width variants) then lowercase.
        let name_lc = ent.name.nfkc().collect::<String>().to_lowercase();
        // Composite key "type\0name": dedup is scoped per entity type.
        let key = {
            let et = ent.entity_type.as_str();
            let mut k = String::with_capacity(et.len() + 1 + name_lc.len());
            k.push_str(et);
            k.push('\0');
            k.push_str(&name_lc);
            k
        };

        let type_prefix = {
            let et = ent.entity_type.as_str();
            let mut p = String::with_capacity(et.len() + 1);
            p.push_str(et);
            p.push('\0');
            p
        };
        // Scan existing entries of the same type for an equal-or-containing name.
        let mut collision_idx: Option<usize> = None;
        for (existing_key, idx) in &by_lc {
            if !existing_key.starts_with(&type_prefix) {
                continue;
            }
            let existing_name_lc = &existing_key[type_prefix.len()..];
            if existing_name_lc == name_lc
                || existing_name_lc.contains(name_lc.as_str())
                || name_lc.contains(existing_name_lc)
            {
                collision_idx = Some(*idx);
                break;
            }
        }
        match collision_idx {
            Some(idx) => {
                // Longer surface form replaces the shorter one in place; the
                // index stays stable so other map entries remain valid.
                if ent.name.len() > result[idx].name.len() {
                    let old_name_lc = result[idx].name.nfkc().collect::<String>().to_lowercase();
                    let old_key = {
                        let et = result[idx].entity_type.as_str();
                        let mut k = String::with_capacity(et.len() + 1 + old_name_lc.len());
                        k.push_str(et);
                        k.push('\0');
                        k.push_str(&old_name_lc);
                        k
                    };
                    by_lc.remove(&old_key);
                    result[idx] = ent;
                    by_lc.insert(key, idx);
                }
            }
            None => {
                by_lc.insert(key, result.len());
                result.push(ent);
            }
        }
        if result.len() >= MAX_ENTS {
            truncated = true;
            break;
        }
    }

    if truncated {
        tracing::warn!(
            "extraction truncated at {MAX_ENTS} entities (input had {total_input} candidates before deduplication)"
        );
    }

    result
}
1118
1119fn to_new_entities(extracted: Vec<ExtractedEntity>) -> Vec<NewEntity> {
1120 extracted
1121 .into_iter()
1122 .map(|e| NewEntity {
1123 name: e.name,
1124 entity_type: e.entity_type,
1125 description: None,
1126 })
1127 .collect()
1128}
1129
1130pub fn extract_graph_auto(
1131 body: &str,
1132 paths: &AppPaths,
1133 variant: GlinerVariant,
1134) -> Result<ExtractionResult> {
1135 let regex_entities = apply_regex_prefilter(body);
1136 let threshold = crate::constants::gliner_confidence_threshold();
1137
1138 let mut gliner_used = false;
1139 let ner_entities = match get_or_init_gliner(paths, variant) {
1140 Some(model) => match model.predict(body, GLINER_ENTITY_LABELS, threshold) {
1141 Ok(ents) => {
1142 gliner_used = true;
1143 ents
1144 }
1145 Err(e) => {
1146 tracing::warn!("GLiNER NER failed, falling back to regex-only extraction: {e:#}");
1147 Vec::new()
1148 }
1149 },
1150 None => Vec::new(),
1151 };
1152
1153 let merged = merge_and_deduplicate(regex_entities, ner_entities);
1154 let extended = extend_with_numeric_suffix(merged, body);
1155 let with_models = augment_versioned_model_names(extended, body);
1156 let with_models: Vec<ExtractedEntity> = with_models
1157 .into_iter()
1158 .filter(|e| !regex_section_marker().is_match(&e.name))
1159 .collect();
1160 let entities = to_new_entities(with_models);
1161 let (relationships, relationships_truncated) =
1162 build_relationships_by_sentence_cooccurrence(body, &entities);
1163
1164 let extraction_method = if gliner_used {
1165 format!("gliner-{variant}+regex")
1166 } else {
1167 "regex-only".to_string()
1168 };
1169
1170 let urls = extract_urls(body);
1171
1172 Ok(ExtractionResult {
1173 entities,
1174 relationships,
1175 relationships_truncated,
1176 extraction_method,
1177 urls,
1178 })
1179}
1180
1181pub struct RegexExtractor;
1182
1183impl Extractor for RegexExtractor {
1184 fn extract(&self, body: &str) -> Result<ExtractionResult> {
1185 let regex_entities = apply_regex_prefilter(body);
1186 let entities = to_new_entities(regex_entities);
1187 let (relationships, relationships_truncated) =
1188 build_relationships_by_sentence_cooccurrence(body, &entities);
1189 let urls = extract_urls(body);
1190 Ok(ExtractionResult {
1191 entities,
1192 relationships,
1193 relationships_truncated,
1194 extraction_method: "regex-only".to_string(),
1195 urls,
1196 })
1197 }
1198}
1199
1200#[cfg(test)]
1201mod tests {
1202 use super::*;
1203 use crate::entity_type::EntityType;
1204
1205 fn make_paths() -> AppPaths {
1206 use std::path::PathBuf;
1207 AppPaths {
1208 db: PathBuf::from("/tmp/test.sqlite"),
1209 models: PathBuf::from("/tmp/test_models"),
1210 }
1211 }
1212
1213 #[test]
1214 fn regex_email_captures_address() {
1215 let ents = apply_regex_prefilter("contact: someone@company.com for more info");
1216 assert!(ents
1218 .iter()
1219 .any(|e| e.name == "someone@company.com" && e.entity_type == EntityType::Concept));
1220 }
1221
1222 #[test]
1223 fn regex_all_caps_filters_pt_rule_word() {
1224 let ents = apply_regex_prefilter("NUNCA do this. PROIBIDO use X. DEVE follow Y.");
1226 assert!(
1227 !ents.iter().any(|e| e.name == "NUNCA"),
1228 "NUNCA must be filtered as a stopword"
1229 );
1230 assert!(
1231 !ents.iter().any(|e| e.name == "PROIBIDO"),
1232 "PROIBIDO must be filtered"
1233 );
1234 assert!(
1235 !ents.iter().any(|e| e.name == "DEVE"),
1236 "DEVE must be filtered"
1237 );
1238 }
1239
1240 #[test]
1241 fn regex_all_caps_accepts_underscored_constant() {
1242 let ents = apply_regex_prefilter("configure MAX_RETRY=3 and API_TIMEOUT=30");
1244 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1245 assert!(ents.iter().any(|e| e.name == "API_TIMEOUT"));
1246 }
1247
1248 #[test]
1249 fn regex_all_caps_accepts_domain_acronym() {
1250 let ents = apply_regex_prefilter("OPENAI launched GPT-5 with NVIDIA H100");
1252 assert!(ents.iter().any(|e| e.name == "OPENAI"));
1253 assert!(ents.iter().any(|e| e.name == "NVIDIA"));
1254 }
1255
1256 #[test]
1257 fn regex_url_does_not_appear_in_apply_regex_prefilter() {
1258 let ents = apply_regex_prefilter("see https://docs.rs/crate for details");
1260 assert!(
1261 !ents.iter().any(|e| e.name.starts_with("https://")),
1262 "URLs must not appear as entities after the P0-2 split"
1263 );
1264 }
1265
1266 #[test]
1267 fn extract_urls_captures_https() {
1268 let urls = extract_urls("see https://docs.rs/crate for details");
1269 assert_eq!(urls.len(), 1);
1270 assert_eq!(urls[0].url, "https://docs.rs/crate");
1271 assert!(urls[0].offset > 0);
1272 }
1273
1274 #[test]
1275 fn extract_urls_trim_sufixo_pontuacao() {
1276 let urls = extract_urls("link: https://example.com/path. fim");
1277 assert!(!urls.is_empty());
1278 assert!(
1279 !urls[0].url.ends_with('.'),
1280 "sufixo ponto deve ser removido"
1281 );
1282 }
1283
1284 #[test]
1285 fn extract_urls_dedupes_repeated() {
1286 let body = "https://example.com referenciado aqui e depois aqui https://example.com";
1287 let urls = extract_urls(body);
1288 assert_eq!(urls.len(), 1, "URLs repetidas devem ser deduplicadas");
1289 }
1290
1291 #[test]
1292 fn regex_uuid_captura_identificador() {
1293 let ents = apply_regex_prefilter("id=550e8400-e29b-41d4-a716-446655440000 no sistema");
1294 assert!(ents.iter().any(|e| e.entity_type == EntityType::Concept));
1295 }
1296
1297 #[test]
1298 fn regex_all_caps_captura_constante() {
1299 let ents = apply_regex_prefilter("configure MAX_RETRY e TIMEOUT_MS");
1300 assert!(ents.iter().any(|e| e.name == "MAX_RETRY"));
1301 assert!(ents.iter().any(|e| e.name == "TIMEOUT_MS"));
1302 }
1303
1304 #[test]
1305 fn regex_all_caps_ignores_short_words() {
1306 let ents = apply_regex_prefilter("use AI em seu projeto");
1307 assert!(
1308 !ents.iter().any(|e| e.name == "AI"),
1309 "AI tem apenas 2 chars, deve ser ignorado"
1310 );
1311 }
1312
1313 #[test]
1314 fn build_relationships_respeitam_max_rels() {
1315 let entities: Vec<NewEntity> = (0..20)
1316 .map(|i| NewEntity {
1317 name: format!("entidade_{i}"),
1318 entity_type: EntityType::Concept,
1319 description: None,
1320 })
1321 .collect();
1322 let (rels, truncated) = build_relationships(&entities);
1323 let max_rels = crate::constants::max_relationships_per_memory();
1324 assert!(rels.len() <= max_rels, "deve respeitar max_rels={max_rels}");
1325 if rels.len() == max_rels {
1326 assert!(truncated, "truncated deve ser true quando atingiu o cap");
1327 }
1328 }
1329
1330 #[test]
1331 fn build_relationships_without_duplicates() {
1332 let entities: Vec<NewEntity> = (0..5)
1333 .map(|i| NewEntity {
1334 name: format!("ent_{i}"),
1335 entity_type: EntityType::Concept,
1336 description: None,
1337 })
1338 .collect();
1339 let (rels, _truncated) = build_relationships(&entities);
1340 let mut pares: std::collections::HashSet<(String, String)> =
1341 std::collections::HashSet::new();
1342 for r in &rels {
1343 let par = (r.source.clone(), r.target.clone());
1344 assert!(pares.insert(par), "par duplicado encontrado");
1345 }
1346 }
1347
1348 #[test]
1349 fn merge_dedupes_by_lowercase_name() {
1350 let a = vec![ExtractedEntity {
1353 name: "Rust".to_string(),
1354 entity_type: EntityType::Concept,
1355 }];
1356 let b = vec![ExtractedEntity {
1357 name: "rust".to_string(),
1358 entity_type: EntityType::Concept,
1359 }];
1360 let merged = merge_and_deduplicate(a, b);
1361 assert_eq!(
1362 merged.len(),
1363 1,
1364 "rust and Rust with the same type are the same entity"
1365 );
1366 }
1367
1368 #[test]
1369 fn regex_extractor_implements_trait() {
1370 let extractor = RegexExtractor;
1371 let result = extractor
1372 .extract("contato: dev@empresa.io e MAX_TIMEOUT configurado")
1373 .unwrap();
1374 assert!(!result.entities.is_empty());
1375 }
1376
1377 #[test]
1378 fn extract_returns_ok_without_model() {
1379 let paths = make_paths();
1381 let body = "contato: teste@exemplo.com com MAX_RETRY=3";
1382 let result = extract_graph_auto(body, &paths, GlinerVariant::Int8).unwrap();
1383 assert!(result
1384 .entities
1385 .iter()
1386 .any(|e| e.name.contains("teste@exemplo.com")));
1387 }
1388
1389 #[test]
1390 fn stopwords_filter_v1024_terms() {
1391 let body = "ACEITE ACK ACL BORDA CHECKLIST COMPLETED CONFIRME \
1394 DEVEMOS DONE FIXED NEGUE PENDING PLAN PODEMOS RECUSE TOKEN VAMOS";
1395 let ents = apply_regex_prefilter(body);
1396 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1397 for word in &[
1398 "ACEITE",
1399 "ACK",
1400 "ACL",
1401 "BORDA",
1402 "CHECKLIST",
1403 "COMPLETED",
1404 "CONFIRME",
1405 "DEVEMOS",
1406 "DONE",
1407 "FIXED",
1408 "NEGUE",
1409 "PENDING",
1410 "PLAN",
1411 "PODEMOS",
1412 "RECUSE",
1413 "TOKEN",
1414 "VAMOS",
1415 ] {
1416 assert!(
1417 !names.contains(word),
1418 "v1.0.24 stopword {word} should be filtered but was found in entities"
1419 );
1420 }
1421 }
1422
1423 #[test]
1424 fn dedup_normalizes_unicode_combining_marks() {
1425 let nfc = vec![ExtractedEntity {
1429 name: "Caf\u{e9}".to_string(),
1430 entity_type: EntityType::Concept,
1431 }];
1432 let nfd_name = "Cafe\u{301}".to_string();
1434 let nfd = vec![ExtractedEntity {
1435 name: nfd_name,
1436 entity_type: EntityType::Concept,
1437 }];
1438 let merged = merge_and_deduplicate(nfc, nfd);
1439 assert_eq!(
1440 merged.len(),
1441 1,
1442 "NFC 'Caf\\u{{e9}}' and NFD 'Cafe\\u{{301}}' must deduplicate to 1 entity after NFKC normalization"
1443 );
1444 }
1445
1446 #[test]
1447 fn extraction_method_regex_only_unchanged() {
1448 let result = RegexExtractor.extract("contact: dev@acme.io").unwrap();
1451 assert_eq!(
1452 result.extraction_method, "regex-only",
1453 "RegexExtractor must return regex-only"
1454 );
1455 }
1456
1457 #[test]
1460 fn extend_suffix_pure_numeric_unchanged() {
1461 let ents = vec![ExtractedEntity {
1463 name: "GPT".to_string(),
1464 entity_type: EntityType::Concept,
1465 }];
1466 let result = extend_with_numeric_suffix(ents, "using GPT-5 in the project");
1467 assert_eq!(
1468 result[0].name, "GPT-5",
1469 "purely numeric suffix must be extended"
1470 );
1471 }
1472
1473 #[test]
1474 fn extend_suffix_alphanumeric_letter_after_digit() {
1475 let ents = vec![ExtractedEntity {
1477 name: "GPT".to_string(),
1478 entity_type: EntityType::Concept,
1479 }];
1480 let result = extend_with_numeric_suffix(ents, "using GPT-4o for advanced tasks");
1481 assert_eq!(result[0].name, "GPT-4o", "suffix '4o' must be accepted");
1482 }
1483
1484 #[test]
1485 fn extend_suffix_alphanumeric_b_suffix() {
1486 let ents = vec![ExtractedEntity {
1488 name: "Llama".to_string(),
1489 entity_type: EntityType::Concept,
1490 }];
1491 let result = extend_with_numeric_suffix(ents, "Llama-5b open-weight model");
1492 assert_eq!(result[0].name, "Llama-5b", "suffix '5b' must be accepted");
1493 }
1494
1495 #[test]
1496 fn extend_suffix_alphanumeric_x_suffix() {
1497 let ents = vec![ExtractedEntity {
1499 name: "Mistral".to_string(),
1500 entity_type: EntityType::Concept,
1501 }];
1502 let result = extend_with_numeric_suffix(ents, "testing Mistral-8x in production");
1503 assert_eq!(result[0].name, "Mistral-8x", "suffix '8x' must be accepted");
1504 }
1505
1506 #[test]
1509 fn augment_versioned_gpt4o() {
1510 let result = augment_versioned_model_names(vec![], "using GPT-4o for analysis");
1512 assert!(
1513 result.iter().any(|e| e.name == "GPT-4o"),
1514 "GPT-4o must be captured by augment, found: {:?}",
1515 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1516 );
1517 }
1518
1519 #[test]
1520 fn augment_versioned_claude_4_sonnet() {
1521 let result =
1523 augment_versioned_model_names(vec![], "best model: Claude 4 Sonnet released today");
1524 assert!(
1525 result.iter().any(|e| e.name == "Claude 4 Sonnet"),
1526 "Claude 4 Sonnet must be captured, found: {:?}",
1527 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1528 );
1529 }
1530
1531 #[test]
1532 fn augment_versioned_llama_3_pro() {
1533 let result =
1535 augment_versioned_model_names(vec![], "fine-tuning com Llama 3 Pro localmente");
1536 assert!(
1537 result.iter().any(|e| e.name == "Llama 3 Pro"),
1538 "Llama 3 Pro deve ser capturado, achados: {:?}",
1539 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1540 );
1541 }
1542
1543 #[test]
1544 fn augment_versioned_mixtral_8x7b() {
1545 let result =
1547 augment_versioned_model_names(vec![], "executando Mixtral 8x7B no servidor local");
1548 assert!(
1549 result.iter().any(|e| e.name == "Mixtral 8x7B"),
1550 "Mixtral 8x7B deve ser capturado, achados: {:?}",
1551 result.iter().map(|e| &e.name).collect::<Vec<_>>()
1552 );
1553 }
1554
1555 #[test]
1556 fn augment_versioned_does_not_duplicate_existing() {
1557 let existing = vec![ExtractedEntity {
1559 name: "Claude 4".to_string(),
1560 entity_type: EntityType::Concept,
1561 }];
1562 let result = augment_versioned_model_names(existing, "using Claude 4 in the project");
1563 let count = result.iter().filter(|e| e.name == "Claude 4").count();
1564 assert_eq!(count, 1, "Claude 4 must not be duplicated");
1565 }
1566
1567 #[test]
1570 fn stopwords_filter_url_jwt_api_v1025() {
1571 let body = "We use URL, JWT, and API REST in our LLM-powered CLI via HTTP/HTTPS and UI.";
1573 let ents = apply_regex_prefilter(body);
1574 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1575 for blocked in &[
1576 "URL", "JWT", "API", "REST", "LLM", "CLI", "HTTP", "HTTPS", "UI",
1577 ] {
1578 assert!(
1579 !names.contains(blocked),
1580 "v1.0.25 stopword {blocked} leaked as entity; found names: {names:?}"
1581 );
1582 }
1583 }
1584
1585 #[test]
1588 fn section_markers_etapa_fase_filtered_v1025() {
1589 let body = "Etapa 3 do plano: implementar Fase 1 da Migra\u{e7}\u{e3}o.";
1593 let ents = apply_regex_prefilter(body);
1594 assert!(
1595 !ents
1596 .iter()
1597 .any(|e| e.name.contains("Etapa") || e.name.contains("Fase")),
1598 "section markers must be stripped; entities: {:?}",
1599 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1600 );
1601 }
1602
1603 #[test]
1604 fn section_markers_passo_secao_filtered_v1025() {
1605 let body = "Siga Passo 2 conforme Se\u{e7}\u{e3}o 3 do manual.";
1608 let ents = apply_regex_prefilter(body);
1609 assert!(
1610 !ents
1611 .iter()
1612 .any(|e| e.name.contains("Passo") || e.name.contains("Se\u{e7}\u{e3}o")),
1613 "Passo/Se\\u{{e7}}\\u{{e3}}o section markers must be stripped; entities: {:?}",
1614 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1615 );
1616 }
1617
1618 #[test]
1621 fn brand_camelcase_extracted_as_organization_v1025() {
1622 let body = "OpenAI launched GPT-4 and PostgreSQL added pgvector.";
1624 let ents = apply_regex_prefilter(body);
1625 let openai = ents.iter().find(|e| e.name == "OpenAI");
1626 assert!(
1627 openai.is_some(),
1628 "OpenAI must be extracted by CamelCase brand regex; entities: {:?}",
1629 ents.iter().map(|e| &e.name).collect::<Vec<_>>()
1630 );
1631 assert_eq!(
1632 openai.unwrap().entity_type,
1633 EntityType::Organization,
1634 "brand CamelCase must map to organization (V008)"
1635 );
1636 }
1637
1638 #[test]
1639 fn brand_postgresql_extracted_as_organization_v1025() {
1640 let body = "migrating from MySQL to PostgreSQL for better performance.";
1641 let ents = apply_regex_prefilter(body);
1642 assert!(
1643 ents.iter()
1644 .any(|e| e.name == "PostgreSQL" && e.entity_type == EntityType::Organization),
1645 "PostgreSQL must be extracted as organization; entities: {:?}",
1646 ents.iter()
1647 .map(|e| (&e.name, &e.entity_type))
1648 .collect::<Vec<_>>()
1649 );
1650 }
1651
1652 fn entity(name: &str, entity_type: EntityType) -> ExtractedEntity {
1655 ExtractedEntity {
1656 name: name.to_string(),
1657 entity_type,
1658 }
1659 }
1660
1661 #[test]
1662 fn merge_resolves_sonne_vs_sonnet_keeps_longest_v1025() {
1663 let regex = vec![entity("Sonne", EntityType::Concept)];
1665 let ner = vec![entity("Sonnet", EntityType::Concept)];
1666 let result = merge_and_deduplicate(regex, ner);
1667 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1668 assert_eq!(result[0].name, "Sonnet");
1669 }
1670
1671 #[test]
1672 fn merge_resolves_open_vs_openai_keeps_longest_v1025() {
1673 let regex = vec![
1675 entity("Open", EntityType::Organization),
1676 entity("OpenAI", EntityType::Organization),
1677 ];
1678 let result = merge_and_deduplicate(regex, vec![]);
1679 assert_eq!(result.len(), 1, "expected 1 entity, got: {result:?}");
1680 assert_eq!(result[0].name, "OpenAI");
1681 }
1682
1683 #[test]
1684 fn merge_keeps_both_when_no_containment_v1025() {
1685 let regex = vec![
1687 entity("Alice", EntityType::Person),
1688 entity("Bob", EntityType::Person),
1689 ];
1690 let result = merge_and_deduplicate(regex, vec![]);
1691 assert_eq!(result.len(), 2, "expected 2 entities, got: {result:?}");
1692 }
1693
1694 #[test]
1695 fn merge_respects_entity_type_boundary_v1025() {
1696 let regex = vec![
1698 entity("Apple", EntityType::Organization),
1699 entity("Apple", EntityType::Concept),
1700 ];
1701 let result = merge_and_deduplicate(regex, vec![]);
1702 assert_eq!(
1703 result.len(),
1704 2,
1705 "expected 2 entities (different types), got: {result:?}"
1706 );
1707 }
1708
1709 #[test]
1710 fn merge_case_insensitive_dedup_v1025() {
1711 let regex = vec![
1713 entity("OpenAI", EntityType::Organization),
1714 entity("openai", EntityType::Organization),
1715 ];
1716 let result = merge_and_deduplicate(regex, vec![]);
1717 assert_eq!(
1718 result.len(),
1719 1,
1720 "expected 1 entity after case-insensitive dedup, got: {result:?}"
1721 );
1722 }
1723
1724 #[test]
1727 fn extract_graph_auto_handles_large_body_under_30s() {
1728 let body = "x ".repeat(40_000);
1731 let paths = make_paths();
1732 let start = std::time::Instant::now();
1733 let result = extract_graph_auto(&body, &paths, GlinerVariant::Int8)
1734 .expect("extraction must not error");
1735 let elapsed = start.elapsed();
1736 assert!(
1737 elapsed.as_secs() < 30,
1738 "extract_graph_auto took {}s for 80 KB body (cap should keep it well under 30s)",
1739 elapsed.as_secs()
1740 );
1741 let _ = result.entities;
1743 }
1744
1745 #[test]
1748 fn pt_uppercase_stopwords_filtered_v1031() {
1749 let body = "Para o ADAPTER funcionar com PROJETO em modo PASSIVA, devemos usar \
1750 SOMENTE LEITURA conforme a REGRA OBRIGATORIA do EXEMPLO DEFAULT.";
1751 let ents = apply_regex_prefilter(body);
1752 let names: Vec<String> = ents.iter().map(|e| e.name.to_uppercase()).collect();
1753 for stop in &[
1754 "ADAPTER",
1755 "PROJETO",
1756 "PASSIVA",
1757 "SOMENTE",
1758 "LEITURA",
1759 "REGRA",
1760 "OBRIGATORIA",
1761 "EXEMPLO",
1762 "DEFAULT",
1763 ] {
1764 assert!(
1765 !names.contains(&stop.to_string()),
1766 "v1.0.31 A11 stoplist failed: {stop} leaked as entity; got names: {names:?}"
1767 );
1768 }
1769 }
1770
1771 #[test]
1772 fn pt_underscored_identifier_preserved_v1031() {
1773 let ents = apply_regex_prefilter("configure FLOWAIPER_API_KEY=foo and MAX_TIMEOUT=30");
1776 let names: Vec<&str> = ents.iter().map(|e| e.name.as_str()).collect();
1777 assert!(names.contains(&"FLOWAIPER_API_KEY"));
1778 assert!(names.contains(&"MAX_TIMEOUT"));
1779 }
1780
1781 #[test]
1784 fn build_relationships_by_sentence_only_links_co_occurring_entities() {
1785 let body = "Alice met Bob at the conference. Carol works alone in another room.";
1786 let entities = vec![
1787 NewEntity {
1788 name: "Alice".to_string(),
1789 entity_type: EntityType::Person,
1790 description: None,
1791 },
1792 NewEntity {
1793 name: "Bob".to_string(),
1794 entity_type: EntityType::Person,
1795 description: None,
1796 },
1797 NewEntity {
1798 name: "Carol".to_string(),
1799 entity_type: EntityType::Person,
1800 description: None,
1801 },
1802 ];
1803 let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1804 assert!(!truncated);
1805 assert_eq!(
1806 rels.len(),
1807 1,
1808 "only Alice/Bob should pair (same sentence); Carol is isolated"
1809 );
1810 let pair = (rels[0].source.as_str(), rels[0].target.as_str());
1811 assert!(
1812 matches!(pair, ("Alice", "Bob") | ("Bob", "Alice")),
1813 "unexpected pair {pair:?}"
1814 );
1815 }
1816
1817 #[test]
1818 fn build_relationships_by_sentence_returns_empty_for_single_entity() {
1819 let body = "Alice is here.";
1820 let entities = vec![NewEntity {
1821 name: "Alice".to_string(),
1822 entity_type: EntityType::Person,
1823 description: None,
1824 }];
1825 let (rels, truncated) = build_relationships_by_sentence_cooccurrence(body, &entities);
1826 assert!(rels.is_empty());
1827 assert!(!truncated);
1828 }
1829
1830 #[test]
1831 fn build_relationships_by_sentence_dedupes_pairs_across_sentences() {
1832 let body = "Alice met Bob. Bob saw Alice again.";
1833 let entities = vec![
1834 NewEntity {
1835 name: "Alice".to_string(),
1836 entity_type: EntityType::Person,
1837 description: None,
1838 },
1839 NewEntity {
1840 name: "Bob".to_string(),
1841 entity_type: EntityType::Person,
1842 description: None,
1843 },
1844 ];
1845 let (rels, _) = build_relationships_by_sentence_cooccurrence(body, &entities);
1846 assert_eq!(
1847 rels.len(),
1848 1,
1849 "Alice/Bob pair must be emitted only once even when co-occurring in multiple sentences"
1850 );
1851 }
1852
    /// With no env override, the extraction token cap is the 5_000 default.
    ///
    /// NOTE(review): this test and `extraction_max_tokens_env_override_clamped`
    /// both mutate the process-global SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS
    /// variable; under the default parallel test runner they can race —
    /// confirm whether these run with `--test-threads=1` or need a shared lock.
    #[test]
    fn extraction_max_tokens_default_is_5000() {
        std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
        assert_eq!(crate::constants::extraction_max_tokens(), 5_000);
    }
1858
1859 #[test]
1860 fn extraction_max_tokens_env_override_clamped() {
1861 std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200");
1862 assert_eq!(
1863 crate::constants::extraction_max_tokens(),
1864 5_000,
1865 "value below 512 must fall back to default"
1866 );
1867
1868 std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "200000");
1869 assert_eq!(
1870 crate::constants::extraction_max_tokens(),
1871 5_000,
1872 "value above 100_000 must fall back to default"
1873 );
1874
1875 std::env::set_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS", "8000");
1876 assert_eq!(
1877 crate::constants::extraction_max_tokens(),
1878 8_000,
1879 "valid value must be honoured"
1880 );
1881
1882 std::env::remove_var("SQLITE_GRAPHRAG_EXTRACTION_MAX_TOKENS");
1883 }
1884
1885 #[test]
1886 fn gliner_variant_from_str_valid() {
1887 assert_eq!(
1888 "fp32".parse::<GlinerVariant>().unwrap(),
1889 GlinerVariant::Fp32
1890 );
1891 assert_eq!(
1892 "fp16".parse::<GlinerVariant>().unwrap(),
1893 GlinerVariant::Fp16
1894 );
1895 assert_eq!(
1896 "int8".parse::<GlinerVariant>().unwrap(),
1897 GlinerVariant::Int8
1898 );
1899 assert_eq!("q4".parse::<GlinerVariant>().unwrap(), GlinerVariant::Q4);
1900 assert_eq!(
1901 "q4f16".parse::<GlinerVariant>().unwrap(),
1902 GlinerVariant::Q4f16
1903 );
1904 assert_eq!(
1906 "FP32".parse::<GlinerVariant>().unwrap(),
1907 GlinerVariant::Fp32
1908 );
1909 assert_eq!(
1910 "INT8".parse::<GlinerVariant>().unwrap(),
1911 GlinerVariant::Int8
1912 );
1913 }
1914
1915 #[test]
1916 fn gliner_variant_from_str_invalid() {
1917 assert!("invalid".parse::<GlinerVariant>().is_err());
1918 assert!("fp64".parse::<GlinerVariant>().is_err());
1919 assert!("".parse::<GlinerVariant>().is_err());
1920 }
1921
1922 #[test]
1923 fn gliner_variant_filename_mapping() {
1924 assert_eq!(GlinerVariant::Fp32.as_filename(), "model.onnx");
1925 assert_eq!(GlinerVariant::Fp16.as_filename(), "model_fp16.onnx");
1926 assert_eq!(GlinerVariant::Int8.as_filename(), "model_quantized.onnx");
1927 assert_eq!(GlinerVariant::Q4.as_filename(), "model_q4.onnx");
1928 assert_eq!(GlinerVariant::Q4f16.as_filename(), "model_q4f16.onnx");
1929 }
1930
1931 #[test]
1932 fn gliner_variant_display() {
1933 assert_eq!(format!("{}", GlinerVariant::Fp32), "fp32");
1934 assert_eq!(format!("{}", GlinerVariant::Fp16), "fp16");
1935 assert_eq!(format!("{}", GlinerVariant::Int8), "int8");
1936 assert_eq!(format!("{}", GlinerVariant::Q4), "q4");
1937 assert_eq!(format!("{}", GlinerVariant::Q4f16), "q4f16");
1938 }
1939
    /// Spot-checks the human-readable model download sizes for the full-precision
    /// (fp32) and quantized (int8) variants.
    #[test]
    fn gliner_variant_display_size() {
        assert_eq!(GlinerVariant::Fp32.display_size(), "1.1 GB");
        assert_eq!(GlinerVariant::Int8.display_size(), "349 MB");
    }
1945
1946 #[test]
1947 fn gliner_entity_labels_covers_all_types() {
1948 let label_types: Vec<EntityType> = GLINER_ENTITY_LABELS.iter().map(|(_, t)| *t).collect();
1949 assert!(label_types.contains(&EntityType::Person));
1950 assert!(label_types.contains(&EntityType::Organization));
1951 assert!(label_types.contains(&EntityType::Location));
1952 assert!(label_types.contains(&EntityType::Date));
1953 assert!(label_types.contains(&EntityType::Project));
1954 assert!(label_types.contains(&EntityType::Tool));
1955 assert!(label_types.contains(&EntityType::File));
1956 assert!(label_types.contains(&EntityType::Concept));
1957 assert!(label_types.contains(&EntityType::Decision));
1958 assert!(label_types.contains(&EntityType::Incident));
1959 assert!(label_types.contains(&EntityType::Dashboard));
1960 assert!(label_types.contains(&EntityType::IssueTracker));
1961 assert!(label_types.contains(&EntityType::Memory));
1962 assert_eq!(GLINER_ENTITY_LABELS.len(), 13);
1963 }
1964
1965 #[test]
1966 fn gliner_entity_labels_no_duplicates() {
1967 let mut seen = std::collections::HashSet::new();
1968 for (label, _) in GLINER_ENTITY_LABELS {
1969 assert!(seen.insert(*label), "duplicate label: {label}");
1970 }
1971 }
1972
1973 #[test]
1974 fn extract_graph_auto_regex_only_fallback() {
1975 let result = extract_graph_auto(
1977 "Contact someone@test.com about OPENAI project",
1978 &make_paths(),
1979 GlinerVariant::Fp32,
1980 );
1981 assert!(result.is_ok());
1982 let res = result.unwrap();
1983 assert_eq!(res.extraction_method, "regex-only");
1984 assert!(res.entities.iter().any(|e| e.name == "someone@test.com"));
1986 }
1987
1988 #[test]
1989 fn gliner_variant_roundtrip() {
1990 for variant in &[
1991 GlinerVariant::Fp32,
1992 GlinerVariant::Fp16,
1993 GlinerVariant::Int8,
1994 GlinerVariant::Q4,
1995 GlinerVariant::Q4f16,
1996 ] {
1997 let s = format!("{variant}");
1998 let parsed: GlinerVariant = s.parse().unwrap();
1999 assert_eq!(*variant, parsed);
2000 }
2001 }
2002}