Skip to main content

scirs2_text/spelling/
utils.rs

1//! Utility functions for spelling correction
2//!
3//! This module provides shared utility functions used across different
4//! spelling correction implementations.
5
6use crate::error::{Result, TextError};
7use std::collections::HashMap;
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::Path;
11
/// Produces an owned copy of `text`, lowercased unless case matters.
///
/// # Arguments
///
/// * `text` - The string to normalize.
/// * `casesensitive` - When `true`, `text` is copied unchanged; when `false`,
///   it is converted with [`str::to_lowercase`].
///
/// # Returns
///
/// A newly allocated `String` holding the normalized text.
#[inline]
#[allow(dead_code)]
pub fn normalize_string(text: &str, casesensitive: bool) -> String {
    // Case-sensitive callers get the input back verbatim.
    if casesensitive {
        return text.to_owned();
    }
    text.to_lowercase()
}
22
/// Splits `text` on whitespace and returns the lowercased words.
///
/// Leading and trailing non-alphanumeric characters are stripped from each
/// whitespace-separated token (interior punctuation such as the apostrophe in
/// `don't` is kept). Tokens that are empty after stripping are dropped.
#[allow(dead_code)]
pub fn extract_words(text: &str) -> Vec<String> {
    let mut words = Vec::new();
    for token in text.split_whitespace() {
        let core = token.trim_matches(|c: char| !c.is_alphanumeric());
        // Tokens made purely of punctuation collapse to nothing; skip them.
        if !core.is_empty() {
            words.push(core.to_lowercase());
        }
    }
    words
}
34
/// Breaks `text` into sentences at every `.`, `?` or `!`.
///
/// Each piece is trimmed of surrounding whitespace, and pieces that end up
/// empty (for example between consecutive terminators) are discarded. The
/// terminator characters themselves are not included in the output, and the
/// returned slices borrow from `text`.
#[allow(dead_code)]
pub fn split_sentences(text: &str) -> Vec<&str> {
    let mut sentences = Vec::new();
    for piece in text.split(|c: char| matches!(c, '.' | '?' | '!')) {
        let piece = piece.trim();
        if !piece.is_empty() {
            sentences.push(piece);
        }
    }
    sentences
}
43
/// Checks whether two words are close enough in length to be within the edit
/// distance threshold.
///
/// Returns `true` when the lengths of `_word1` and `word2` differ by at most
/// `max_editdistance`.
///
/// Lengths are taken with [`str::len`], i.e. in UTF-8 bytes.
/// NOTE(review): for non-ASCII words the byte-length difference can exceed the
/// character-count difference — confirm this matches how callers measure edit
/// distance.
///
/// # Arguments
///
/// * `_word1` - First word.
/// * `word2` - Second word.
/// * `max_editdistance` - Largest allowed length difference.
#[inline]
#[allow(dead_code)]
pub fn is_within_length_threshold(_word1: &str, word2: &str, max_editdistance: usize) -> bool {
    // `abs_diff` cannot overflow, whereas `len + max_editdistance` panics in
    // debug builds (and wraps in release) for thresholds near `usize::MAX`.
    _word1.len().abs_diff(word2.len()) <= max_editdistance
}
50
/// Checks whether `word` is a key of `dictionary`, optionally ignoring case.
///
/// # Arguments
///
/// * `dictionary` - Map whose keys are the known words.
/// * `word` - The word to look up.
/// * `case_sensitive` - When `true`, only an exact key match counts. When
///   `false`, a key matches if its [`str::to_lowercase`] form equals the
///   lowercased `word`.
///
/// # Returns
///
/// `true` if a matching key exists, `false` otherwise.
#[inline]
#[allow(dead_code)]
pub fn dictionary_contains(
    dictionary: &HashMap<String, usize>,
    word: &str,
    case_sensitive: bool,
) -> bool {
    // An exact key hit satisfies both modes and costs a single hash lookup.
    if dictionary.contains_key(word) {
        return true;
    }
    if case_sensitive {
        return false;
    }

    let word_lower = word.to_lowercase();

    // Fast path: dictionaries that store lowercase keys are answered by a
    // second hash lookup. Only when that misses do we fall back to scanning
    // (and lowercasing) every key, which handles mixed-case keys.
    dictionary.contains_key(word_lower.as_str())
        || dictionary
            .keys()
            .any(|dict_word| dict_word.to_lowercase() == word_lower)
}
68
69/// Load data from a file line by line with a custom processor
70#[allow(dead_code)]
71pub fn load_from_file<P, F, T>(_path: P, mut lineprocessor: F) -> Result<T>
72where
73    P: AsRef<Path>,
74    F: FnMut(&str) -> Result<T>,
75    T: Default,
76{
77    let file =
78        File::open(_path).map_err(|e| TextError::IoError(format!("Failed to open file: {e}")))?;
79
80    let reader = BufReader::new(file);
81    let mut result = T::default();
82
83    for line in reader.lines() {
84        let line =
85            line.map_err(|e| TextError::IoError(format!("Failed to read line from file: {e}")))?;
86
87        // Skip empty lines
88        if line.trim().is_empty() {
89            continue;
90        }
91
92        result = lineprocessor(&line)?;
93    }
94
95    Ok(result)
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Lowercasing is applied only when case sensitivity is disabled.
    #[test]
    fn test_normalize_string() {
        let lowered = normalize_string("Hello", false);
        let untouched = normalize_string("Hello", true);

        assert_eq!(lowered, "hello");
        assert_eq!(untouched, "Hello");
    }

    /// Punctuation is stripped and words are lowercased.
    #[test]
    fn test_extract_words() {
        let expected = vec!["hello", "world", "this", "is", "a", "test"];
        assert_eq!(extract_words("Hello, world! This is a test."), expected);
    }

    /// Sentences are split on terminators and trimmed.
    #[test]
    fn test_split_sentences() {
        let expected = vec!["Hello, world", "This is a test", "Another sentence"];
        assert_eq!(
            split_sentences("Hello, world! This is a test. Another sentence."),
            expected
        );
    }

    /// Words compared against "hello" with a threshold of 2.
    #[test]
    fn test_is_within_length_threshold() {
        let cases = [
            ("hello", true),
            ("hell", true),
            ("helloo", true),
            ("hi", false),
            ("hello world", false),
        ];

        for &(other, expected) in cases.iter() {
            assert_eq!(
                is_within_length_threshold("hello", other, 2),
                expected,
                "unexpected result for {other:?}"
            );
        }
    }

    /// Exact and case-insensitive lookups against mixed-case keys.
    #[test]
    fn test_dictionary_contains() {
        let dictionary: HashMap<String, usize> = [("Hello", 10), ("World", 20)]
            .iter()
            .map(|&(word, count)| (word.to_string(), count))
            .collect();

        // Case-sensitive checks
        assert!(dictionary_contains(&dictionary, "Hello", true));
        assert!(!dictionary_contains(&dictionary, "hello", true));

        // Case-insensitive checks
        for present in ["hello", "WORLD"].iter() {
            assert!(dictionary_contains(&dictionary, present, false));
        }
        assert!(!dictionary_contains(&dictionary, "test", false));
    }
}