use fst::{IntoStreamer, Set, SetBuilder, Streamer};
use log::{debug, error, info};
use memmap2::Mmap;
use std::error::Error;
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader};
pub fn build_fst_set(input_path: &str, fst_path: &str) -> Result<(), Box<dyn Error>> {
info!("Building FST from input file: {}", input_path);
debug!("Output FST file: {}", fst_path);
let file = File::open(input_path).map_err(|e| {
error!("Failed to open input file '{}': {}", input_path, e);
e
})?;
let reader = BufReader::new(file);
let mut names: Vec<String> = reader.lines().filter_map(Result::ok).collect();
info!("Read {} chemical names from input file", names.len());
debug!("Sorting and deduplicating chemical names");
names.sort_unstable();
let original_count = names.len();
names.dedup();
let deduplicated_count = names.len();
if original_count != deduplicated_count {
info!(
"Removed {} duplicate entries, {} unique names remaining",
original_count - deduplicated_count,
deduplicated_count
);
}
debug!("Creating FST builder");
let wtr = File::create(fst_path).map_err(|e| {
error!("Failed to create output file '{}': {}", fst_path, e);
e
})?;
let mut builder = SetBuilder::new(wtr)?;
debug!("Inserting {} names into FST", names.len());
for (i, name) in names.iter().enumerate() {
if i > 0 && i % 10000 == 0 {
debug!("Inserted {} / {} names", i, names.len());
}
builder.insert(name)?;
}
debug!("Finalizing FST");
builder.finish()?;
info!(
"Successfully built FST with {} entries at: {}",
deduplicated_count, fst_path
);
Ok(())
}
/// Opens the FST file at `fst_path` read-only, memory-maps it and wraps the
/// mapping in a [`Set`].
///
/// # Errors
///
/// Returns an error (after logging it) if the file cannot be opened, cannot be
/// memory-mapped, or is rejected by `Set::new`.
pub fn load_fst_set(fst_path: &str) -> Result<Set<Mmap>, Box<dyn Error>> {
    info!("Loading FST from: {}", fst_path);

    let mut options = OpenOptions::new();
    options.read(true);
    let file = match options.open(fst_path) {
        Ok(handle) => handle,
        Err(e) => {
            error!("Failed to open FST file '{}': {}", fst_path, e);
            return Err(e.into());
        }
    };

    debug!("Memory mapping FST file");
    // SAFETY: NOTE(review) - a file-backed mapping is only sound while the
    // underlying file is not modified or truncated for the lifetime of the
    // map. Nothing in this function can enforce that; it is assumed the FST
    // file is treated as immutable once built - confirm with callers.
    let mapped = unsafe { Mmap::map(&file) };
    let mmap = match mapped {
        Ok(region) => region,
        Err(e) => {
            error!("Failed to memory map FST file '{}': {}", fst_path, e);
            return Err(e.into());
        }
    };

    debug!("Creating FST set from memory map");
    match Set::new(mmap) {
        Ok(set) => {
            info!("Successfully loaded FST from: {}", fst_path);
            Ok(set)
        }
        Err(e) => {
            error!("Failed to create FST set from file '{}': {}", fst_path, e);
            Err(e.into())
        }
    }
}
/// Returns up to `max_results` keys from `set` that start with `prefix`, in
/// the set's lexicographic (byte) order.
///
/// The comparison is an exact, case-sensitive byte match. Keys that are not
/// valid UTF-8 are skipped. An empty `prefix` matches every key.
#[must_use]
pub fn prefix_search(set: &Set<Mmap>, prefix: &str, max_results: usize) -> Vec<String> {
    debug!(
        "Starting prefix search for '{}' with max_results={}",
        prefix, max_results
    );
    let prefix_bytes = prefix.as_bytes();
    let mut results = Vec::new();

    // Keys are stored in sorted byte order, so every key sharing `prefix` forms
    // one contiguous run beginning at the first key >= `prefix`. Walking from
    // there and stopping at the first key without the prefix is exact, whereas
    // an upper bound of `prefix + char::MAX` would wrongly exclude keys whose
    // next character is U+10FFFF (or any bytes sorting at or above it).
    let mut stream = set.range().ge(prefix).into_stream();
    let mut checked_count: usize = 0;
    while let Some(key) = stream.next() {
        if results.len() >= max_results {
            debug!("Reached max_results limit of {}", max_results);
            break;
        }
        if !key.starts_with(prefix_bytes) {
            // Past the end of the run of keys that share the prefix.
            break;
        }
        checked_count += 1;
        if let Ok(s) = std::str::from_utf8(key) {
            debug!("Found match: {}", s);
            results.push(s.to_owned());
        }
    }

    info!(
        "Prefix search for '{}' found {} results (checked {} entries)",
        prefix,
        results.len(),
        checked_count
    );
    results
}
/// Scans every key in `set` and returns up to `max_results` keys that contain
/// `substring`, in the set's lexicographic (byte) order.
///
/// Matching is case-insensitive: both the key and `substring` are converted
/// with `to_lowercase` before comparison. Keys that are not valid UTF-8 are
/// skipped. An empty `substring` matches every key.
///
/// This is a linear scan over the whole set (until the limit is reached).
///
/// # Errors
///
/// The current implementation never returns `Err`; the `Result` return type
/// is kept for interface compatibility.
pub fn substring_search(
    set: &Set<Mmap>,
    substring: &str,
    max_results: usize,
) -> Result<Vec<String>, Box<dyn Error>> {
    debug!(
        "Starting substring search for '{}' with max_results={}",
        substring, max_results
    );
    // Lowercase the needle once, outside the scan loop.
    let substring_lower = substring.to_lowercase();
    let mut results = Vec::new();
    let mut stream = set.stream();
    let mut checked_count: usize = 0;

    while let Some(key) = stream.next() {
        if results.len() >= max_results {
            debug!("Reached max_results limit of {}", max_results);
            break;
        }
        checked_count += 1;
        if checked_count % 10000 == 0 {
            debug!(
                "Checked {} entries, found {} matches so far",
                checked_count,
                results.len()
            );
        }
        // Validate UTF-8 on the borrowed bytes; an owned copy of the key is
        // made only for hits rather than for every entry scanned.
        if let Ok(s) = std::str::from_utf8(key) {
            if s.to_lowercase().contains(&substring_lower) {
                debug!("Found match: {}", s);
                results.push(s.to_owned());
            }
        }
    }

    info!(
        "Substring search for '{}' found {} results (checked {} entries)",
        substring,
        results.len(),
        checked_count
    );
    Ok(results)
}
/// Walks every key in `set` once and returns how many keys were visited.
///
/// The traversal reads through the memory-mapped FST; the intent, per the log
/// messages, is to get its pages loaded into memory before the first query.
/// NOTE(review): whether every page of the mapping ends up resident depends on
/// the FST layout and the operating system - confirm if that guarantee matters.
///
/// # Errors
///
/// The current implementation never returns `Err`.
pub fn preload_fst_set(set: &Set<Mmap>) -> Result<usize, Box<dyn Error>> {
    info!("Starting FST preload to load all pages into memory");

    let mut keys = set.stream();
    let mut visited: usize = 0;
    loop {
        if keys.next().is_none() {
            break;
        }
        visited += 1;
        // Periodic progress report for large sets.
        if visited % 10000 == 0 {
            debug!("Preloaded {} entries", visited);
        }
    }

    info!("Successfully preloaded {} entries into memory", visited);
    Ok(visited)
}