pub use self::builder::{Logseq, ThesaurusBuilder};
pub mod autocomplete;
pub mod builder;
pub mod markdown_directives;
pub mod matcher;
pub mod url_protector;
#[cfg(feature = "medical")]
pub mod medical_artifact;
#[cfg(feature = "medical")]
pub mod medical_extractor;
#[cfg(feature = "medical")]
pub mod sharded_extractor;
#[cfg(feature = "medical")]
pub mod snomed;
#[cfg(feature = "medical")]
pub mod umls;
#[cfg(feature = "medical")]
pub mod umls_extractor;
pub use autocomplete::{
AutocompleteConfig, AutocompleteIndex, AutocompleteMetadata, AutocompleteResult,
autocomplete_search, build_autocomplete_index, deserialize_autocomplete_index,
fuzzy_autocomplete_search, fuzzy_autocomplete_search_levenshtein, serialize_autocomplete_index,
};
pub use markdown_directives::{
MarkdownDirectiveWarning, MarkdownDirectivesParseResult, parse_markdown_directives_dir,
};
pub use matcher::{
LinkType, Matched, extract_paragraphs_from_automata, find_matches, replace_matches,
};
#[cfg(feature = "medical")]
pub use medical_extractor::{EntityExtractor, ExtractedEntity};
#[cfg(feature = "medical")]
pub use sharded_extractor::ShardedUmlsExtractor;
#[cfg(feature = "medical")]
pub use snomed::{SemanticType, SnomedConcept, SnomedMatch};
#[cfg(feature = "medical")]
pub use umls::{UmlsConcept, UmlsDataset, UmlsStats};
#[cfg(feature = "medical")]
pub use umls_extractor::{UmlsExtractor, UmlsExtractorStats, UmlsMatch};
/// Free-function wrappers around the metadata accessors of `AutocompleteIndex`.
///
/// Both functions simply forward to a method on the index and return its result unchanged.
pub mod autocomplete_helpers {
use super::autocomplete::{AutocompleteIndex, AutocompleteMetadata};
/// Returns the iterator produced by `index.metadata_iter()`.
///
/// Every item is a `(&str, &AutocompleteMetadata)` pair borrowed from `index`.
/// NOTE(review): the string is presumably the indexed term -- confirm against
/// `AutocompleteIndex::metadata_iter`.
pub fn iter_metadata(
index: &AutocompleteIndex,
) -> impl Iterator<Item = (&str, &AutocompleteMetadata)> {
index.metadata_iter()
}
/// Returns whatever `index.metadata_get(term)` returns.
///
/// The returned reference borrows from `index`, not from `term`.
/// NOTE(review): `None` presumably means the index has no entry for `term` -- confirm
/// against `AutocompleteIndex::metadata_get`.
pub fn get_metadata<'a>(
index: &'a AutocompleteIndex,
term: &str,
) -> Option<&'a AutocompleteMetadata> {
index.metadata_get(term)
}
}
#[cfg(feature = "remote-loading")]
pub use autocomplete::load_autocomplete_index;
use std::collections::HashMap;
use std::fmt::Display;
use std::fs;
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
#[cfg(feature = "typescript")]
use tsify::Tsify;
use terraphim_types::{NormalizedTerm, NormalizedTermValue, Thesaurus};
/// Error type used by the fallible functions in this file.
///
/// Variants marked `#[from]` are created automatically when the `?` operator is applied
/// to the wrapped error type; the `String` variants carry a free-form message.
#[derive(thiserror::Error, Debug)]
pub enum TerraphimAutomataError {
/// A thesaurus could not be located, fetched, or parsed; the message holds the details.
#[error("Invalid thesaurus: {0}")]
InvalidThesaurus(String),
/// Wraps a `serde_json::Error`.
#[error("Serde deserialization error: {0}")]
Serde(#[from] serde_json::Error),
/// Free-form message. In this file it is returned by `AutomataPath::from_remote`
/// when the URL does not start with `http://` or `https://`.
#[error("Dict error: {0}")]
Dict(String),
/// Wraps a `std::io::Error`, e.g. from reading a local thesaurus file.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// Wraps an `aho_corasick::BuildError`.
#[error("Aho-Corasick build error: {0}")]
AhoCorasick(#[from] aho_corasick::BuildError),
/// Wraps an `fst::Error`.
#[error("FST error: {0}")]
Fst(#[from] fst::Error),
}
/// Shorthand for `std::result::Result` with the error fixed to [`TerraphimAutomataError`].
pub type Result<T> = std::result::Result<T, TerraphimAutomataError>;
/// Location of a thesaurus: either a file on the local filesystem or a remote URL.
///
/// With the `typescript` feature enabled the type additionally derives `Tsify` and can
/// cross the WASM ABI in both directions.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "typescript", derive(Tsify))]
#[cfg_attr(feature = "typescript", tsify(into_wasm_abi, from_wasm_abi))]
pub enum AutomataPath {
/// Path to a file on the local filesystem.
Local(PathBuf),
/// URL stored as a plain string. Constructing this variant directly performs no
/// validation; `AutomataPath::from_remote` checks the scheme first.
Remote(String),
}
impl Display for AutomataPath {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
AutomataPath::Local(path) => write!(f, "Local Path: {:?}", path),
AutomataPath::Remote(url) => write!(f, "Remote URL: {:?}", url),
}
}
}
impl AutomataPath {
    /// Creates an [`AutomataPath::Remote`] from `url`.
    ///
    /// # Errors
    ///
    /// Returns [`TerraphimAutomataError::Dict`] when `url` starts with neither `http://`
    /// nor `https://` (the check is a case-sensitive prefix match).
    pub fn from_remote(url: &str) -> Result<Self> {
        if url.starts_with("http://") || url.starts_with("https://") {
            Ok(Self::Remote(url.to_string()))
        } else {
            Err(TerraphimAutomataError::Dict(format!(
                "Invalid URL scheme. Only `http` and `https` are supported right now. Got {}",
                url
            )))
        }
    }

    /// Creates an [`AutomataPath::Local`] from anything that can be viewed as a path.
    ///
    /// The path is copied as given; it is neither resolved nor checked for existence.
    pub fn from_local<P: AsRef<std::path::Path>>(file: P) -> Self {
        let owned_path = file.as_ref().to_path_buf();
        Self::Local(owned_path)
    }

    /// Returns the relative path of the small example thesaurus, chosen from the current
    /// working directory.
    ///
    /// The returned path is not checked for existence. If the working directory cannot
    /// be determined, `.` is used in its place.
    pub fn local_example() -> Self {
        // Directory names for which the fixture is looked up two levels above the
        // working directory.
        // NOTE(review): these look like workspace crate directories -- confirm.
        const TWO_LEVELS_DOWN: [&str; 5] = [
            "terraphim_automata",
            "terraphim_kg_orchestration",
            "terraphim_task_decomposition",
            "terraphim_kg_agents",
            "terraphim_agent_registry",
        ];

        log::debug!("Current folder {:?}", std::env::current_dir());
        let cwd = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));

        let in_listed_dir = TWO_LEVELS_DOWN.iter().any(|&dir| cwd.ends_with(dir));
        let in_terraphim_ai_dir = cwd.file_name().is_some_and(|name| name == "terraphim-ai");

        // The listed-directory case takes precedence, as in an if / else-if chain.
        let simple_path = match (in_listed_dir, in_terraphim_ai_dir) {
            (true, _) => "../../test-fixtures/term_to_id_simple.json",
            (false, true) => "test-fixtures/term_to_id_simple.json",
            (false, false) => "data/term_to_id_simple.json",
        };
        Self::from_local(simple_path)
    }

    /// Returns the relative path of the full example thesaurus.
    ///
    /// Candidate locations are probed in order relative to the current working directory
    /// and the first one that exists wins. When none exists, the first candidate is
    /// returned anyway, so the result may point at a missing file.
    pub fn local_example_full() -> Self {
        const CANDIDATES: [&str; 4] = [
            "test-fixtures/term_to_id.json",
            "../../test-fixtures/term_to_id.json",
            "../test-fixtures/term_to_id.json",
            "data/term_to_id.json",
        ];

        let cwd = std::env::current_dir().unwrap_or_else(|_| std::path::PathBuf::from("."));
        for candidate in CANDIDATES {
            if cwd.join(candidate).exists() {
                return Self::from_local(candidate);
            }
        }
        Self::from_local("test-fixtures/term_to_id.json")
    }

    /// Returns the example remote thesaurus location.
    pub fn remote_example() -> Self {
        // The literal starts with `https://`, so it is built directly instead of going
        // through the fallible `from_remote` check.
        Self::Remote("https://staging-storage.terraphim.io/thesaurus_Default.json".to_string())
    }
}
pub fn load_thesaurus_from_json(json_str: &str) -> Result<Thesaurus> {
let thesaurus: Thesaurus = serde_json::from_str(json_str)?;
Ok(thesaurus)
}
/// Parses `json_str` into a thesaurus and runs `replace_matches` over `content` with it.
///
/// Returns the bytes produced by `replace_matches` for the requested `link_type`.
///
/// # Errors
///
/// Fails when the JSON cannot be deserialized (see [`load_thesaurus_from_json`]) or when
/// `replace_matches` itself reports an error.
pub fn load_thesaurus_from_json_and_replace(
    json_str: &str,
    content: &str,
    link_type: LinkType,
) -> Result<Vec<u8>> {
    let parsed = load_thesaurus_from_json(json_str)?;
    replace_matches(content, parsed, link_type)
}
/// `async` counterpart of [`load_thesaurus_from_json`], available with the
/// `remote-loading` feature.
///
/// The body calls the synchronous function directly and contains no `.await`, so all
/// work happens in a single poll of the returned future.
#[cfg(feature = "remote-loading")]
pub async fn load_thesaurus_from_json_async(json_str: &str) -> Result<Thesaurus> {
load_thesaurus_from_json(json_str)
}
/// `async` counterpart of [`load_thesaurus_from_json_and_replace`], available with the
/// `remote-loading` feature.
///
/// The body calls the synchronous function directly and contains no `.await`, so all
/// work happens in a single poll of the returned future.
#[cfg(feature = "remote-loading")]
pub async fn load_thesaurus_from_json_and_replace_async(
json_str: &str,
content: &str,
link_type: LinkType,
) -> Result<Vec<u8>> {
load_thesaurus_from_json_and_replace(json_str, content, link_type)
}
/// Parses thesaurus JSON, accepting two layouts.
///
/// 1. New format: an object with a `name` string and a `data` map from key to
///    `NormalizedTerm`. The resulting thesaurus carries that name.
/// 2. Legacy flat format: a top-level map from key to an object with `id`, `nterm` and
///    optional `display_value` / `url`. The resulting thesaurus is named `"imported"`.
///    The `id` in the file is not used; each term gets its id from
///    `NormalizedTerm::with_auto_id`. A missing `display_value` falls back to `nterm`,
///    and a missing `url` is passed on as an empty string.
///
/// In both layouts the map keys are converted with `NormalizedTermValue::from`.
///
/// # Errors
///
/// Returns [`TerraphimAutomataError::InvalidThesaurus`] when neither layout matches.
/// The individual parse errors are only logged.
fn parse_thesaurus_json(contents: &str) -> Result<Thesaurus> {
    /// Shape of the new format.
    #[derive(Deserialize)]
    struct ThesaurusFormat {
        name: String,
        data: HashMap<String, NormalizedTerm>,
    }

    /// Shape of one entry in the legacy flat format.
    // `id` must be present in the input but is never read afterwards.
    #[derive(Deserialize)]
    #[allow(dead_code)]
    struct LegacyTerm {
        id: u64,
        nterm: String,
        #[serde(default)]
        display_value: Option<String>,
        #[serde(default)]
        url: Option<String>,
    }

    // Attempt 1: new format. On success we are done; otherwise remember the error.
    let new_format_error = match serde_json::from_str::<ThesaurusFormat>(contents) {
        Ok(ThesaurusFormat { name, data }) => {
            log::debug!("Parsed thesaurus in new format with name: {}", name);
            let mut thesaurus = Thesaurus::new(name);
            for (raw_key, entry) in data {
                thesaurus.insert(NormalizedTermValue::from(raw_key.as_str()), entry);
            }
            return Ok(thesaurus);
        }
        Err(err) => err,
    };
    log::debug!(
        "Failed to parse as new Thesaurus format: {}, trying legacy format",
        new_format_error
    );

    // Attempt 2: legacy flat map.
    let legacy_terms = match serde_json::from_str::<HashMap<String, LegacyTerm>>(contents) {
        Ok(terms) => terms,
        Err(err) => {
            log::warn!("Failed to parse thesaurus JSON in either format: {}", err);
            return Err(TerraphimAutomataError::InvalidThesaurus(
                "Could not parse thesaurus JSON in either new or legacy format".to_string(),
            ));
        }
    };
    log::info!(
        "Parsed thesaurus in legacy flat format with {} terms",
        legacy_terms.len()
    );

    let mut thesaurus = Thesaurus::new("imported".to_string());
    for (raw_key, legacy_term) in legacy_terms {
        let LegacyTerm {
            nterm,
            display_value,
            url,
            ..
        } = legacy_term;
        let normalized = NormalizedTerm::with_auto_id(NormalizedTermValue::from(raw_key.as_str()))
            .with_display_value(display_value.unwrap_or(nterm))
            .with_url(url.unwrap_or_default());
        thesaurus.insert(NormalizedTermValue::from(raw_key.as_str()), normalized);
    }
    Ok(thesaurus)
}
/// Loads a thesaurus from a local file or a remote URL (`remote-loading` feature).
///
/// The retrieved text is handed to `parse_thesaurus_json`, which accepts both the new
/// and the legacy JSON layout.
///
/// Local files are read with blocking `std::fs` calls even though this function is
/// `async`.
///
/// # Errors
///
/// * [`TerraphimAutomataError::InvalidThesaurus`] when the local file does not exist,
///   the HTTP request fails, the server answers with a non-success status, the response
///   body cannot be read, or the JSON matches neither layout.
/// * [`TerraphimAutomataError::Io`] when reading an existing local file fails.
#[cfg(feature = "remote-loading")]
pub async fn load_thesaurus(automata_path: &AutomataPath) -> Result<Thesaurus> {
    /// Fetches `url` and returns the response body as text.
    async fn read_url(url: &str) -> Result<String> {
        log::debug!("Reading thesaurus from remote: {url}");
        let response = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .user_agent("Terraphim-Automata/1.0")
            .build()
            // Fall back to a default client if the customised one cannot be built.
            .unwrap_or_else(|_| reqwest::Client::new())
            .get(url)
            .header("Accept", "application/json")
            .send()
            .await
            .map_err(|e| {
                TerraphimAutomataError::InvalidThesaurus(format!(
                    "Failed to fetch thesaurus from remote {url}. Error: {e:#?}",
                ))
            })?;

        let status = response.status();
        // Reject error responses up front: otherwise an HTML/JSON error page would be
        // passed to the parser and reported as a confusing "could not parse" failure.
        if !status.is_success() {
            return Err(TerraphimAutomataError::InvalidThesaurus(format!(
                "Failed to fetch thesaurus from remote {url}. Status: {status}",
            )));
        }

        // `text()` consumes the response, so keep the headers for the error message.
        let headers = response.headers().clone();
        response.text().await.map_err(|e| {
            TerraphimAutomataError::InvalidThesaurus(format!(
                "Failed to read thesaurus from remote {url}. Status: {status}. Headers: {headers:#?}. Error: {e:#?}",
            ))
        })
    }

    let contents = match automata_path {
        AutomataPath::Local(path) => {
            // Report a missing file as `InvalidThesaurus` rather than a bare I/O error.
            if !path.exists() {
                return Err(TerraphimAutomataError::InvalidThesaurus(format!(
                    "Thesaurus file not found: {}",
                    path.display()
                )));
            }
            fs::read_to_string(path)?
        }
        AutomataPath::Remote(url) => read_url(url).await?,
    };
    parse_thesaurus_json(&contents)
}
/// Loads a thesaurus from a local file (build without the `remote-loading` feature).
///
/// The file content is handed to `parse_thesaurus_json`, which accepts both the new and
/// the legacy JSON layout.
///
/// # Errors
///
/// * [`TerraphimAutomataError::InvalidThesaurus`] for any [`AutomataPath::Remote`], or
///   when the JSON matches neither layout.
/// * [`TerraphimAutomataError::Io`] when the file cannot be read, including when it does
///   not exist.
#[cfg(not(feature = "remote-loading"))]
pub fn load_thesaurus(automata_path: &AutomataPath) -> Result<Thesaurus> {
    match automata_path {
        AutomataPath::Remote(_) => Err(TerraphimAutomataError::InvalidThesaurus(
            "Remote loading is not supported. Enable the 'remote-loading' feature.".to_string(),
        )),
        AutomataPath::Local(path) => {
            let contents = fs::read_to_string(path)?;
            parse_thesaurus_json(&contents)
        }
    }
}
#[cfg(test)]
mod tests {
    use terraphim_types::NormalizedTermValue;

    use super::*;

    /// Inline thesaurus shared by the tests that parse JSON from a string.
    const ENGINEERING_JSON: &str = r#"
{
"name": "Engineering",
"data": {
"project management framework tailoring": {
"id": 1,
"nterm": "project tailoring strategy",
"url": "https://example.com/project-tailoring-strategy"
},
"strategy documents": {
"id": 2,
"nterm": "strategy documents",
"url": "https://example.com/strategy-documents"
},
"project constraints": {
"id": 3,
"nterm": "project constraints",
"url": "https://example.com/project-constraints"
}
}
}"#;

    /// Asserts that `term` is present in `thesaurus` with id `expected_id`.
    fn assert_term_id(thesaurus: &Thesaurus, term: &str, expected_id: u64) {
        assert_eq!(
            thesaurus.get(&NormalizedTermValue::from(term)).unwrap().id,
            expected_id,
            "unexpected id for term {term:?}"
        );
    }

    /// Checks the content expected from the fixture returned by `local_example`.
    fn assert_simple_example(thesaurus: &Thesaurus) {
        assert_eq!(thesaurus.len(), 3);
        for (term, expected_id) in [("foo", 1u64), ("bar", 2), ("baz", 1)] {
            assert_term_id(thesaurus, term, expected_id);
        }
    }

    #[cfg(feature = "remote-loading")]
    #[tokio::test]
    async fn test_load_thesaurus_from_file() {
        let example = AutomataPath::local_example();
        let loaded = load_thesaurus(&example).await.unwrap();
        assert_simple_example(&loaded);
    }

    // Needs network access, hence ignored by default.
    #[cfg(feature = "remote-loading")]
    #[tokio::test]
    #[ignore]
    async fn test_load_thesaurus_from_url() {
        let example = AutomataPath::remote_example();
        let loaded = load_thesaurus(&example).await.unwrap();
        assert_eq!(loaded.len(), 1725);
        assert_term_id(&loaded, "@risk a user guide", 661u64);
    }

    #[cfg(not(feature = "remote-loading"))]
    #[test]
    fn test_load_thesaurus_from_file_sync() {
        let example = AutomataPath::local_example();
        let loaded = load_thesaurus(&example).unwrap();
        assert_simple_example(&loaded);
    }

    #[cfg(feature = "remote-loading")]
    #[tokio::test]
    async fn test_load_thesaurus_from_file_async() {
        let example = AutomataPath::local_example();
        let loaded = load_thesaurus(&example).await.unwrap();
        assert_simple_example(&loaded);
    }

    #[test]
    fn test_load_thesaurus_from_json() {
        let loaded = load_thesaurus_from_json(ENGINEERING_JSON).unwrap();
        assert_eq!(loaded.len(), 3);

        let expected_ids = [
            ("project management framework tailoring", 1),
            ("strategy documents", 2),
            ("project constraints", 3),
        ];
        for (term, expected_id) in expected_ids {
            assert_term_id(&loaded, term, expected_id);
        }

        let expected_urls = [
            (
                "project management framework tailoring",
                "https://example.com/project-tailoring-strategy",
            ),
            (
                "strategy documents",
                "https://example.com/strategy-documents",
            ),
        ];
        for (term, expected_url) in expected_urls {
            assert_eq!(
                loaded.get(&NormalizedTermValue::from(term)).unwrap().url,
                Some(expected_url.to_string()),
                "unexpected url for term {term:?}"
            );
        }
    }

    #[test]
    fn test_load_thesaurus_from_json_and_replace() {
        let content = "I like project constraints and strategy documents.";
        let cases = [
            (
                LinkType::MarkdownLinks,
                "I like [project constraints](https://example.com/project-constraints) and [strategy documents](https://example.com/strategy-documents).",
            ),
            (
                LinkType::HTMLLinks,
                "I like <a href=\"https://example.com/project-constraints\">project constraints</a> and <a href=\"https://example.com/strategy-documents\">strategy documents</a>.",
            ),
            (
                LinkType::WikiLinks,
                "I like [[project constraints]] and [[strategy documents]].",
            ),
        ];

        for (link_type, expected) in cases {
            let replaced =
                load_thesaurus_from_json_and_replace(ENGINEERING_JSON, content, link_type)
                    .unwrap();
            let replaced_str = String::from_utf8(replaced).unwrap();
            assert_eq!(replaced_str, expected);
        }
    }

    #[test]
    fn test_load_thesaurus_from_json_invalid() {
        assert!(load_thesaurus_from_json("{invalid_json}").is_err());
    }

    #[test]
    fn test_from_remote_accepts_https() {
        let url = "https://example.com/thesaurus.json";
        assert_eq!(
            AutomataPath::from_remote(url).unwrap(),
            AutomataPath::Remote(url.to_string())
        );
    }

    #[test]
    fn test_from_remote_accepts_http() {
        let url = "http://example.com/thesaurus.json";
        assert_eq!(
            AutomataPath::from_remote(url).unwrap(),
            AutomataPath::Remote(url.to_string())
        );
    }

    #[test]
    fn test_from_remote_rejects_ftp() {
        assert!(AutomataPath::from_remote("ftp://example.com/thesaurus.json").is_err());
    }

    #[test]
    fn test_from_remote_rejects_file_path() {
        assert!(AutomataPath::from_remote("/tmp/thesaurus.json").is_err());
    }

    #[test]
    fn test_from_remote_rejects_empty() {
        assert!(AutomataPath::from_remote("").is_err());
    }
}