use std::sync::OnceLock;
use symspell::{AsciiStringStrategy, SymSpell, Verbosity};
/// Process-wide SymSpell engine, built on first use by `get_symspell`
/// (via `init_symspell`) and then shared for the rest of the process.
static SYMSPELL: OnceLock<SymSpell<AsciiStringStrategy>> = OnceLock::new();
/// Unigram frequency dictionary, embedded into the binary at compile time.
/// `init_symspell` loads each line with the term in column 0 and the count in
/// column 1, separated by a single space.
const FREQUENCY_DICT: &str = include_str!("../data/frequency_dictionary_en_82_765.txt");
/// Bigram frequency dictionary, embedded into the binary at compile time.
/// `init_symspell` loads each line with the term starting at column 0 and the
/// count in column 2, separated by single spaces.
const BIGRAM_DICT: &str = include_str!("../data/frequency_bigramdictionary_en_243_342.txt");
/// Lowercase English words of one to three letters that are legitimate on their
/// own. `is_common_word` matches against this list (ignoring ASCII case) so that
/// such words are not mistaken for broken-off pieces of longer words.
const COMMON_SHORT_WORDS: &[&str] = &[
"a", "i", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
"no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for", "get",
"got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now", "off",
"old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she", "the",
"too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any", "but",
"few", "how", "man", "new", "per", "put", "via",
];
/// Builds a SymSpell engine and feeds it the two embedded dictionaries.
///
/// Every line of `FREQUENCY_DICT` is loaded as a unigram entry (term in
/// column 0, count in column 1) and every line of `BIGRAM_DICT` as a bigram
/// entry (term at column 0, count in column 2). A debug event reporting the
/// number of lines in each dictionary is emitted once loading is finished.
fn init_symspell() -> SymSpell<AsciiStringStrategy> {
    let mut engine = SymSpell::<AsciiStringStrategy>::default();

    FREQUENCY_DICT.lines().for_each(|entry| {
        engine.load_dictionary_line(entry, 0, 1, " ");
    });
    BIGRAM_DICT.lines().for_each(|entry| {
        engine.load_bigram_dictionary_line(entry, 0, 2, " ");
    });

    tracing::debug!(
        target: "memvid::symspell",
        "SymSpell initialized with {} unigram and {} bigram entries",
        FREQUENCY_DICT.lines().count(),
        BIGRAM_DICT.lines().count()
    );

    engine
}
/// Returns the shared SymSpell engine, running `init_symspell` the first time
/// it is requested and handing out the same instance afterwards.
fn get_symspell() -> &'static SymSpell<AsciiStringStrategy> {
SYMSPELL.get_or_init(init_symspell)
}
/// Reports whether `s` is one of the `COMMON_SHORT_WORDS`, ignoring ASCII case.
///
/// The comparison is done in place with `eq_ignore_ascii_case`, so no
/// lowercase copy of `s` is allocated; this function is called repeatedly
/// from the fragment-merging loops.
fn is_common_word(s: &str) -> bool {
    COMMON_SHORT_WORDS
        .iter()
        .any(|word| word.eq_ignore_ascii_case(s))
}
/// Returns `true` when `s` is non-empty and made up solely of ASCII letters.
///
/// Working on bytes is sufficient: every byte of a non-ASCII character is
/// outside the ASCII range and therefore fails the alphabetic check.
fn is_alpha(s: &str) -> bool {
    if s.is_empty() {
        return false;
    }
    s.bytes().all(|b| b.is_ascii_alphabetic())
}
/// Heuristically decides whether `s` looks like a broken-off piece of a word.
///
/// Only purely alphabetic ASCII tokens can be fragments. The decision then
/// depends on the token length:
///
/// * 1 letter: a fragment unless it is exactly `I`, `a` or `A` (a lowercase
///   `i` on its own is treated as a fragment);
/// * 2-3 letters: a fragment unless it is a common short word;
/// * 4 letters: a fragment only when it is not a common word and the
///   dictionary has no match for it at edit distance 0;
/// * 5 letters or more: never a fragment.
fn is_fragment(s: &str) -> bool {
    if !is_alpha(s) {
        return false;
    }
    // `s` is ASCII-only here, so its byte length equals its letter count.
    match s.len() {
        1 => !matches!(s, "I" | "a" | "A"),
        2 | 3 => !is_common_word(s),
        4 => {
            !is_common_word(s)
                && get_symspell()
                    .lookup(&s.to_lowercase(), Verbosity::Top, 0)
                    .is_empty()
        }
        _ => false,
    }
}
fn prejoin_fragments(text: &str) -> String {
let words: Vec<&str> = text.split_whitespace().collect();
if words.len() < 2 {
return text.to_string();
}
let symspell = get_symspell();
let mut result: Vec<String> = Vec::with_capacity(words.len());
let mut i = 0;
while i < words.len() {
let word = words[i];
let mut best_merge: Option<(String, usize)> = None;
if is_alpha(word) && !is_common_word(word) && i + 1 < words.len() {
let mut merged = String::from(word);
let mut j = i + 1;
while j < words.len() && j - i < 6 && is_alpha(words[j]) {
merged.push_str(words[j]);
j += 1;
let suggestions = symspell.lookup(&merged.to_lowercase(), Verbosity::Closest, 2);
if let Some(suggestion) = suggestions.first() {
if suggestion.distance == 0
|| (suggestion.distance == 1 && merged.len() >= 6)
|| (suggestion.distance == 2 && merged.len() >= 10)
{
best_merge = Some((suggestion.term.clone(), j));
}
}
if j < words.len() && is_common_word(words[j]) && words[j].len() >= 3 {
break;
}
}
}
let should_try_old_merge = if best_merge.is_none() && i + 1 < words.len() {
let next = words[i + 1];
if is_fragment(word) && is_fragment(next) {
true
}
else if is_alpha(word) && word.len() <= 2 && is_fragment(next) {
let test_merge = format!("{}{}", word.to_lowercase(), next.to_lowercase());
let suggestions = symspell.lookup(&test_merge, Verbosity::Closest, 1);
suggestions
.first()
.map(|s| s.distance == 0)
.unwrap_or(false)
} else {
false
}
} else {
false
};
if let Some((corrected, end_idx)) = best_merge {
result.push(corrected);
i = end_idx;
} else if should_try_old_merge {
let mut merged = String::from(word);
let start_i = i;
i += 1;
while i < words.len() && is_fragment(words[i]) {
merged.push_str(words[i]);
i += 1;
}
let suggestions = symspell.lookup(&merged.to_lowercase(), Verbosity::Closest, 2);
if let Some(suggestion) = suggestions.first() {
if suggestion.distance == 0 || (suggestion.distance <= 2 && merged.len() >= 4) {
result.push(suggestion.term.clone());
continue;
}
}
for j in start_i..i {
result.push(words[j].to_string());
}
} else {
result.push(word.to_string());
i += 1;
}
}
result.join(" ")
}
#[must_use]
pub fn fix_pdf_text_symspell(text: &str, max_edit_distance: i64) -> String {
if text.is_empty() {
return String::new();
}
let symspell = get_symspell();
let lines: Vec<&str> = text.lines().collect();
let mut result = Vec::with_capacity(lines.len());
for line in lines {
let trimmed = line.trim();
if trimmed.is_empty() {
result.push(String::new());
continue;
}
let tokens: Vec<&str> = trimmed.split_whitespace().collect();
if tokens.is_empty() {
continue;
}
let mut chunks: Vec<(bool, Vec<&str>)> = Vec::new();
let mut current_chunk: Vec<&str> = Vec::new();
let mut current_is_protected = false;
for token in tokens {
let is_protected = token.chars().any(|c| c.is_ascii_digit());
if chunks.is_empty() && current_chunk.is_empty() {
current_is_protected = is_protected;
current_chunk.push(token);
} else if is_protected == current_is_protected {
current_chunk.push(token);
} else {
chunks.push((current_is_protected, current_chunk));
current_chunk = vec![token];
current_is_protected = is_protected;
}
}
if !current_chunk.is_empty() {
chunks.push((current_is_protected, current_chunk));
}
let mut line_parts: Vec<String> = Vec::new();
for (is_protected, chunk_tokens) in chunks {
if is_protected {
line_parts.push(chunk_tokens.join(" "));
} else {
let chunk_text = chunk_tokens.join(" ");
let prejoined = prejoin_fragments(&chunk_text);
let suggestions = symspell.lookup_compound(&prejoined, max_edit_distance);
if let Some(suggestion) = suggestions.first() {
line_parts.push(suggestion.term.clone());
} else {
line_parts.push(chunk_text);
}
}
}
result.push(line_parts.join(" "));
}
result.join("\n")
}
/// Repairs broken word spacing in PDF-extracted text using the default
/// maximum edit distance of 2.
///
/// Convenience wrapper around `fix_pdf_text_symspell`.
#[must_use]
pub fn fix_pdf_text(text: &str) -> String {
    const DEFAULT_MAX_EDIT_DISTANCE: i64 = 2;
    fix_pdf_text_symspell(text, DEFAULT_MAX_EDIT_DISTANCE)
}
// Unit tests for the PDF text repair helpers. Many assertions are deliberately
// permissive (they accept several outcomes) because the exact correction
// depends on the contents of the embedded dictionaries.
#[cfg(test)]
mod tests {
use super::*;
// Words split into several pieces: accepts either the re-joined word or, for
// the first case, the untouched input.
#[test]
fn fixes_split_words() {
let result = fix_pdf_text("emp lo yee");
assert!(
result == "employee" || result == "emp lo yee",
"got: {}",
result
);
let result = fix_pdf_text("co mp an y");
assert!(
result == "company" || result.contains("comp"),
"got: {}",
result
);
}
// Misplaced word boundaries: both "where" and "love" must appear in the output.
#[test]
fn fixes_classic_symspell_example() {
let input = "whereis th elove";
let result = fix_pdf_text(input);
assert!(
result.contains("where") && result.contains("love"),
"got: {}",
result
);
}
// Already-correct words must survive the correction pass.
#[test]
fn preserves_correct_text() {
let result = fix_pdf_text("the manager reported");
assert!(
result.contains("manager") && result.contains("reported"),
"got: {}",
result
);
}
// Multi-line input with a blank line: words from both lines must be present.
#[test]
fn handles_multiline() {
let input = "hello world\n\ntest sentence";
let result = fix_pdf_text(input);
assert!(result.contains("hello"));
assert!(result.contains("test"));
}
// Exercises prejoin_fragments directly: "A va" should be glued into "ava"/"Ava".
#[test]
fn fixes_name_fragments() {
let result = prejoin_fragments("A va Martin");
assert!(
result.contains("ava") || result.contains("Ava"),
"got: {}",
result
);
}
// A long word split into three uneven pieces.
#[test]
fn fixes_supervisor_split() {
let result = fix_pdf_text("sup erviso r");
assert!(
result.contains("supervisor") || result.contains("supervise"),
"got: {}",
result
);
}
// Legitimate short words must not be merged away.
#[test]
fn preserves_valid_short_words() {
let result = fix_pdf_text("I am a person");
assert!(
result.contains("am") && result.contains("person"),
"got: {}",
result
);
let result = fix_pdf_text("to be or not");
assert!(
result.contains("to") || result.contains("be"),
"got: {}",
result
);
}
// Words joined across a wrong boundary: at least one of them must be recovered.
#[test]
fn fixes_joined_words() {
let result = fix_pdf_text("olderdo cuments");
assert!(
result.contains("older") || result.contains("document"),
"got: {}",
result
);
}
// A sentence mixing intact and split words.
#[test]
fn handles_mixed_content() {
let result = fix_pdf_text("The emp lo yee reported to the man ager");
assert!(
result.contains("employee") || result.contains("emp"),
"got: {}",
result
);
assert!(
result.contains("manager") || result.contains("man"),
"got: {}",
result
);
}
// Empty and whitespace-only input both produce an empty string.
#[test]
fn handles_empty_input() {
assert_eq!(fix_pdf_text(""), "");
assert_eq!(fix_pdf_text(" "), "");
}
// A single correct word is returned unchanged.
#[test]
fn handles_single_word() {
let result = fix_pdf_text("hello");
assert_eq!(result, "hello");
}
// "man" is a common short word; accepts either the merged or the untouched form.
#[test]
fn prejoin_respects_common_words() {
let result = prejoin_fragments("man ager");
assert!(
result == "manager" || result == "man ager",
"got: {}",
result
);
}
// Tokens containing digits are expected to come back verbatim, while the
// surrounding words are expected in lowercase.
#[test]
fn fixes_numbers_and_proper_nouns() {
let result = fix_pdf_text("Model X500");
assert_eq!(result, "model X500");
let result = fix_pdf_text("The year 2025");
assert_eq!(result, "the year 2025");
let result = fix_pdf_text("iPhone 15 Pro");
assert_eq!(result, "iphone 15 pro");
let result = fix_pdf_text("COVID-19 pandemic");
assert_eq!(result, "COVID-19 pandemic");
let result = fix_pdf_text("emp lo yee 123");
assert_eq!(result, "employee 123");
}
}