// scirs2_core/string_ops.rs
1//! String processing utilities for scientific computing
2//!
3//! This module provides common string distance metrics, similarity measures,
4//! tokenization, n-gram generation, and case conversion utilities used across
5//! the SciRS2 ecosystem.
6//!
7//! # Distance Metrics
8//!
9//! - [`levenshtein_distance`] - Edit distance (insertions, deletions, substitutions)
10//! - [`hamming_distance`] - Number of positions where characters differ
11//! - [`jaro_similarity`] / [`jaro_winkler_similarity`] - Positional character similarity
12//!
13//! # Subsequences
14//!
15//! - [`longest_common_subsequence`] - LCS length
16//! - [`lcs_string`] - Actual LCS string
17//!
18//! # Tokenization & N-grams
19//!
20//! - [`tokenize_whitespace`] - Split by whitespace
21//! - [`tokenize_pattern`] - Split by custom delimiter patterns
22//! - [`ngrams`] / [`char_ngrams`] - Word-level and character-level n-grams
23//!
24//! # Case Conversions
25//!
26//! - [`to_snake_case`] / [`to_camel_case`] / [`to_pascal_case`] / [`to_kebab_case`] / [`to_screaming_snake_case`]
27
28use crate::error::{CoreError, CoreResult, ErrorContext};
29
30// ---------------------------------------------------------------------------
31// Levenshtein distance
32// ---------------------------------------------------------------------------
33
/// Compute the Levenshtein edit distance between two strings.
///
/// The edit distance is the minimum number of single-character edits
/// (insertions, deletions, substitutions) needed to transform one string
/// into the other. Comparison is per Unicode scalar value (`char`).
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::levenshtein_distance;
///
/// assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
/// assert_eq!(levenshtein_distance("", "abc"), 3);
/// assert_eq!(levenshtein_distance("same", "same"), 0);
/// ```
pub fn levenshtein_distance(a: &str, b: &str) -> usize {
    let src: Vec<char> = a.chars().collect();
    let dst: Vec<char> = b.chars().collect();

    if src.is_empty() {
        return dst.len();
    }
    if dst.is_empty() {
        return src.len();
    }

    // Single rolling row: after processing src[..=i], row[j] holds the
    // distance between src[..=i] and the first j chars of dst. `diagonal`
    // carries the dp[i-1][j-1] value that the row update would overwrite.
    let mut row: Vec<usize> = (0..=dst.len()).collect();
    for (i, &sc) in src.iter().enumerate() {
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, &dc) in dst.iter().enumerate() {
            let substitution = diagonal + usize::from(sc != dc);
            diagonal = row[j + 1]; // save dp[i-1][j] before overwriting
            row[j + 1] = substitution.min(row[j] + 1).min(diagonal + 1);
        }
    }
    row[dst.len()]
}
85
86/// Compute the normalized Levenshtein distance (0.0 = identical, 1.0 = completely different).
87///
88/// Normalized by dividing by the length of the longer string.
89pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
90    let max_len = a.chars().count().max(b.chars().count());
91    if max_len == 0 {
92        return 0.0;
93    }
94    levenshtein_distance(a, b) as f64 / max_len as f64
95}
96
97/// Compute the Levenshtein similarity (1.0 = identical, 0.0 = completely different).
98pub fn levenshtein_similarity(a: &str, b: &str) -> f64 {
99    1.0 - normalized_levenshtein(a, b)
100}
101
102// ---------------------------------------------------------------------------
103// Hamming distance
104// ---------------------------------------------------------------------------
105
106/// Compute the Hamming distance between two strings of equal length.
107///
108/// The Hamming distance is the number of positions at which the
109/// corresponding characters are different.
110///
111/// Returns an error if the strings have different lengths.
112///
113/// # Example
114///
115/// ```
116/// use scirs2_core::string_ops::hamming_distance;
117///
118/// assert_eq!(hamming_distance("karolin", "kathrin").expect("should succeed"), 3);
119/// assert!(hamming_distance("short", "longer").is_err());
120/// ```
121pub fn hamming_distance(a: &str, b: &str) -> CoreResult<usize> {
122    let a_chars: Vec<char> = a.chars().collect();
123    let b_chars: Vec<char> = b.chars().collect();
124    if a_chars.len() != b_chars.len() {
125        return Err(CoreError::ValueError(ErrorContext::new(format!(
126            "Hamming distance requires equal-length strings (got {} and {})",
127            a_chars.len(),
128            b_chars.len()
129        ))));
130    }
131    let dist = a_chars
132        .iter()
133        .zip(b_chars.iter())
134        .filter(|(x, y)| x != y)
135        .count();
136    Ok(dist)
137}
138
139// ---------------------------------------------------------------------------
140// Jaro / Jaro-Winkler similarity
141// ---------------------------------------------------------------------------
142
/// Compute the Jaro similarity between two strings.
///
/// Returns a value in [0, 1] where 1 means the strings are identical.
/// Characters are considered matching when equal and within a window of
/// `max(len_a, len_b) / 2 - 1` positions of each other.
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::jaro_similarity;
///
/// let sim = jaro_similarity("martha", "marhta");
/// assert!((sim - 0.9444).abs() < 0.001);
/// ```
pub fn jaro_similarity(a: &str, b: &str) -> f64 {
    let s1: Vec<char> = a.chars().collect();
    let s2: Vec<char> = b.chars().collect();
    let (n1, n2) = (s1.len(), s2.len());

    // Two empty strings are identical; one empty string shares nothing.
    if n1 == 0 || n2 == 0 {
        return if n1 == n2 { 1.0 } else { 0.0 };
    }

    let window = (n1.max(n2) / 2).saturating_sub(1);
    let mut used1 = vec![false; n1];
    let mut used2 = vec![false; n2];
    let mut match_count = 0usize;

    // Greedily pair each char of s1 with the first unused equal char of s2
    // inside the match window.
    for (i, &c1) in s1.iter().enumerate() {
        let lo = i.saturating_sub(window);
        let hi = (i + window + 1).min(n2);
        for j in lo..hi {
            if !used2[j] && c1 == s2[j] {
                used1[i] = true;
                used2[j] = true;
                match_count += 1;
                break;
            }
        }
    }

    if match_count == 0 {
        return 0.0;
    }

    // Transpositions: compare the two matched subsequences position by
    // position; each mismatch is half a transposition.
    let in_order_a: Vec<char> = s1
        .iter()
        .zip(&used1)
        .filter(|(_, &m)| m)
        .map(|(&c, _)| c)
        .collect();
    let in_order_b: Vec<char> = s2
        .iter()
        .zip(&used2)
        .filter(|(_, &m)| m)
        .map(|(&c, _)| c)
        .collect();
    let mismatched = in_order_a
        .iter()
        .zip(&in_order_b)
        .filter(|(x, y)| x != y)
        .count();

    let m = match_count as f64;
    (m / n1 as f64 + m / n2 as f64 + (m - mismatched as f64 / 2.0) / m) / 3.0
}
212
213/// Compute the Jaro-Winkler similarity between two strings.
214///
215/// Extends Jaro similarity with a bonus for matching prefixes, controlled
216/// by `prefix_weight` (default 0.1, should not exceed 0.25).
217///
218/// # Example
219///
220/// ```
221/// use scirs2_core::string_ops::jaro_winkler_similarity;
222///
223/// let sim = jaro_winkler_similarity("martha", "marhta", 0.1);
224/// assert!(sim > 0.94);
225/// ```
226pub fn jaro_winkler_similarity(a: &str, b: &str, prefix_weight: f64) -> f64 {
227    let jaro = jaro_similarity(a, b);
228    let a_chars: Vec<char> = a.chars().collect();
229    let b_chars: Vec<char> = b.chars().collect();
230
231    // Compute common prefix length (up to 4 characters)
232    let max_prefix = 4.min(a_chars.len()).min(b_chars.len());
233    let mut prefix_len = 0;
234    for i in 0..max_prefix {
235        if a_chars[i] == b_chars[i] {
236            prefix_len += 1;
237        } else {
238            break;
239        }
240    }
241
242    let weight = prefix_weight.min(0.25); // Clamp to prevent exceeding 1.0
243    jaro + prefix_len as f64 * weight * (1.0 - jaro)
244}
245
246// ---------------------------------------------------------------------------
247// Longest common subsequence
248// ---------------------------------------------------------------------------
249
/// Compute the length of the longest common subsequence (LCS) of two strings.
///
/// Uses a two-row dynamic program: O(m * n) time, O(n) space, where m and n
/// are the character counts of `a` and `b`.
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::longest_common_subsequence;
///
/// assert_eq!(longest_common_subsequence("abcde", "ace"), 3);
/// assert_eq!(longest_common_subsequence("abc", "def"), 0);
/// ```
pub fn longest_common_subsequence(a: &str, b: &str) -> usize {
    let a_chars: Vec<char> = a.chars().collect();
    let b_chars: Vec<char> = b.chars().collect();
    let m = a_chars.len();
    let n = b_chars.len();

    if m == 0 || n == 0 {
        return 0;
    }

    // `prev` is the completed row for a_chars[..i-1]; `curr` is filled in
    // for a_chars[..i].
    let mut prev = vec![0usize; n + 1];
    let mut curr = vec![0usize; n + 1];

    for i in 1..=m {
        for j in 1..=n {
            curr[j] = if a_chars[i - 1] == b_chars[j - 1] {
                prev[j - 1] + 1
            } else {
                prev[j].max(curr[j - 1])
            };
        }
        std::mem::swap(&mut prev, &mut curr);
        // No need to clear `curr`: curr[0] is always 0 and every cell
        // curr[1..=n] is overwritten before being read on the next pass.
    }

    // Each DP row is non-decreasing in j, so the last cell is the answer.
    prev[n]
}
290
/// Return the actual longest common subsequence string.
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::lcs_string;
///
/// assert_eq!(lcs_string("abcde", "ace"), "ace");
/// ```
pub fn lcs_string(a: &str, b: &str) -> String {
    let left: Vec<char> = a.chars().collect();
    let right: Vec<char> = b.chars().collect();
    let (rows, cols) = (left.len(), right.len());

    if rows == 0 || cols == 0 {
        return String::new();
    }

    // Full (rows+1) x (cols+1) table so we can walk back through it.
    let mut table = vec![vec![0usize; cols + 1]; rows + 1];
    for r in 1..=rows {
        for c in 1..=cols {
            table[r][c] = if left[r - 1] == right[c - 1] {
                table[r - 1][c - 1] + 1
            } else {
                table[r - 1][c].max(table[r][c - 1])
            };
        }
    }

    // Walk from the bottom-right corner, collecting matched characters
    // in reverse order.
    let mut collected: Vec<char> = Vec::with_capacity(table[rows][cols]);
    let (mut r, mut c) = (rows, cols);
    while r > 0 && c > 0 {
        if left[r - 1] == right[c - 1] {
            collected.push(left[r - 1]);
            r -= 1;
            c -= 1;
        } else if table[r - 1][c] > table[r][c - 1] {
            r -= 1;
        } else {
            c -= 1;
        }
    }

    collected.iter().rev().collect()
}
341
342/// Compute the LCS similarity (0.0 = no common subsequence, 1.0 = identical).
343pub fn lcs_similarity(a: &str, b: &str) -> f64 {
344    let max_len = a.chars().count().max(b.chars().count());
345    if max_len == 0 {
346        return 1.0;
347    }
348    longest_common_subsequence(a, b) as f64 / max_len as f64
349}
350
351// ---------------------------------------------------------------------------
352// N-gram generation
353// ---------------------------------------------------------------------------
354
355/// Generate word-level n-grams from a list of tokens.
356///
357/// # Example
358///
359/// ```
360/// use scirs2_core::string_ops::ngrams;
361///
362/// let tokens = vec!["the", "quick", "brown", "fox"];
363/// let bigrams = ngrams(&tokens, 2).expect("bigrams");
364/// assert_eq!(bigrams.len(), 3);
365/// assert_eq!(bigrams[0], vec!["the", "quick"]);
366/// ```
367pub fn ngrams<'a>(tokens: &[&'a str], n: usize) -> CoreResult<Vec<Vec<&'a str>>> {
368    if n == 0 {
369        return Err(CoreError::ValueError(ErrorContext::new(
370            "n must be >= 1 for n-gram generation",
371        )));
372    }
373    if tokens.len() < n {
374        return Ok(Vec::new());
375    }
376    let result: Vec<Vec<&str>> = tokens.windows(n).map(|w| w.to_vec()).collect();
377    Ok(result)
378}
379
380/// Generate character-level n-grams from a string.
381///
382/// # Example
383///
384/// ```
385/// use scirs2_core::string_ops::char_ngrams;
386///
387/// let grams = char_ngrams("hello", 2).expect("bigrams");
388/// assert_eq!(grams, vec!["he", "el", "ll", "lo"]);
389/// ```
390pub fn char_ngrams(text: &str, n: usize) -> CoreResult<Vec<String>> {
391    if n == 0 {
392        return Err(CoreError::ValueError(ErrorContext::new(
393            "n must be >= 1 for char n-gram generation",
394        )));
395    }
396    let chars: Vec<char> = text.chars().collect();
397    if chars.len() < n {
398        return Ok(Vec::new());
399    }
400    let result: Vec<String> = chars.windows(n).map(|w| w.iter().collect()).collect();
401    Ok(result)
402}
403
/// Generate skip-grams (n-grams with gaps).
///
/// A skip-gram of order (n, k) takes n tokens with up to k skips.
/// This implementation generates bigrams with a given skip distance:
/// each token is paired with each of the next `skip + 1` tokens.
pub fn skip_bigrams<'a>(tokens: &[&'a str], skip: usize) -> Vec<(&'a str, &'a str)> {
    let mut pairs = Vec::new();
    for (i, &first) in tokens.iter().enumerate() {
        for &second in tokens.iter().skip(i + 1).take(skip + 1) {
            pairs.push((first, second));
        }
    }
    pairs
}
419
420// ---------------------------------------------------------------------------
421// Tokenization
422// ---------------------------------------------------------------------------
423
/// Tokenize a string by splitting on whitespace.
///
/// Runs of whitespace act as a single separator, and leading/trailing
/// whitespace produces no empty tokens.
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::tokenize_whitespace;
///
/// let tokens = tokenize_whitespace("  hello   world  ");
/// assert_eq!(tokens, vec!["hello", "world"]);
/// ```
pub fn tokenize_whitespace(text: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    tokens.extend(text.split_whitespace());
    tokens
}
437
/// Tokenize a string by splitting on a delimiter character.
///
/// Consecutive delimiters produce empty tokens (like [`str::split`]).
///
/// # Example
///
/// ```
/// use scirs2_core::string_ops::tokenize_char;
///
/// let tokens = tokenize_char("a,b,,c", ',');
/// assert_eq!(tokens, vec!["a", "b", "", "c"]);
/// ```
pub fn tokenize_char(text: &str, delimiter: char) -> Vec<&str> {
    text.split(|c: char| c == delimiter).collect()
}
451
/// Tokenize by splitting on any character matching a predicate.
///
/// Consecutive delimiters produce empty tokens (like `str::split`).
pub fn tokenize_predicate<F: Fn(char) -> bool>(text: &str, predicate: F) -> Vec<&str> {
    // Closures implementing `FnMut(char) -> bool` are valid `str` patterns.
    text.split(predicate).collect()
}
458
/// Tokenize by splitting on a string pattern.
pub fn tokenize_pattern<'a>(text: &'a str, pattern: &str) -> Vec<&'a str> {
    let mut pieces = Vec::new();
    for piece in text.split(pattern) {
        pieces.push(piece);
    }
    pieces
}
463
/// Tokenize into sentences (split on '.', '!', '?').
///
/// Trims whitespace from each sentence and removes empty strings.
pub fn tokenize_sentences(text: &str) -> Vec<String> {
    const TERMINATORS: [char; 3] = ['.', '!', '?'];
    text.split(TERMINATORS)
        .filter_map(|raw| {
            let trimmed = raw.trim();
            if trimmed.is_empty() {
                None
            } else {
                Some(trimmed.to_string())
            }
        })
        .collect()
}
473
/// Simple word tokenizer that splits on characters that are neither
/// alphanumeric nor underscore.
///
/// Produces only non-empty tokens; each token is a maximal run of
/// alphanumeric characters and/or `_`. Note that `_` is treated as a
/// word character (identifier-style tokenization), so `"a_b"` is a
/// single token.
pub fn tokenize_words(text: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    // Byte offset where the current token started, if we are inside one.
    let mut start = None;
    for (i, c) in text.char_indices() {
        if c.is_alphanumeric() || c == '_' {
            if start.is_none() {
                start = Some(i);
            }
        } else if let Some(s) = start {
            // Delimiter ends the token; `i` is the byte offset just past it.
            tokens.push(&text[s..i]);
            start = None;
        }
    }
    // Flush a token that runs to the end of the input.
    if let Some(s) = start {
        tokens.push(&text[s..]);
    }
    tokens
}
497
498// ---------------------------------------------------------------------------
499// Case conversion
500// ---------------------------------------------------------------------------
501
502/// Convert a string to snake_case.
503///
504/// # Example
505///
506/// ```
507/// use scirs2_core::string_ops::to_snake_case;
508///
509/// assert_eq!(to_snake_case("helloWorld"), "hello_world");
510/// assert_eq!(to_snake_case("HTTPClient"), "http_client");
511/// assert_eq!(to_snake_case("already_snake"), "already_snake");
512/// ```
513pub fn to_snake_case(s: &str) -> String {
514    let words = split_into_words(s);
515    words
516        .iter()
517        .map(|w| w.to_lowercase())
518        .collect::<Vec<_>>()
519        .join("_")
520}
521
522/// Convert a string to camelCase.
523///
524/// # Example
525///
526/// ```
527/// use scirs2_core::string_ops::to_camel_case;
528///
529/// assert_eq!(to_camel_case("hello_world"), "helloWorld");
530/// assert_eq!(to_camel_case("some-kebab-case"), "someKebabCase");
531/// ```
532pub fn to_camel_case(s: &str) -> String {
533    let words = split_into_words(s);
534    let mut result = String::new();
535    for (i, word) in words.iter().enumerate() {
536        if i == 0 {
537            result.push_str(&word.to_lowercase());
538        } else {
539            let mut chars = word.chars();
540            if let Some(first) = chars.next() {
541                result.extend(first.to_uppercase());
542                result.push_str(&chars.as_str().to_lowercase());
543            }
544        }
545    }
546    result
547}
548
549/// Convert a string to PascalCase.
550///
551/// # Example
552///
553/// ```
554/// use scirs2_core::string_ops::to_pascal_case;
555///
556/// assert_eq!(to_pascal_case("hello_world"), "HelloWorld");
557/// assert_eq!(to_pascal_case("some-kebab-case"), "SomeKebabCase");
558/// ```
559pub fn to_pascal_case(s: &str) -> String {
560    let words = split_into_words(s);
561    let mut result = String::new();
562    for word in &words {
563        let mut chars = word.chars();
564        if let Some(first) = chars.next() {
565            result.extend(first.to_uppercase());
566            result.push_str(&chars.as_str().to_lowercase());
567        }
568    }
569    result
570}
571
572/// Convert a string to kebab-case.
573///
574/// # Example
575///
576/// ```
577/// use scirs2_core::string_ops::to_kebab_case;
578///
579/// assert_eq!(to_kebab_case("helloWorld"), "hello-world");
580/// assert_eq!(to_kebab_case("SomeClassName"), "some-class-name");
581/// ```
582pub fn to_kebab_case(s: &str) -> String {
583    let words = split_into_words(s);
584    words
585        .iter()
586        .map(|w| w.to_lowercase())
587        .collect::<Vec<_>>()
588        .join("-")
589}
590
591/// Convert a string to SCREAMING_SNAKE_CASE.
592///
593/// # Example
594///
595/// ```
596/// use scirs2_core::string_ops::to_screaming_snake_case;
597///
598/// assert_eq!(to_screaming_snake_case("helloWorld"), "HELLO_WORLD");
599/// assert_eq!(to_screaming_snake_case("some-kebab"), "SOME_KEBAB");
600/// ```
601pub fn to_screaming_snake_case(s: &str) -> String {
602    let words = split_into_words(s);
603    words
604        .iter()
605        .map(|w| w.to_uppercase())
606        .collect::<Vec<_>>()
607        .join("_")
608}
609
610/// Convert a string to Title Case.
611pub fn to_title_case(s: &str) -> String {
612    let words = split_into_words(s);
613    let mut result = Vec::with_capacity(words.len());
614    for word in &words {
615        let mut chars = word.chars();
616        if let Some(first) = chars.next() {
617            let mut titled = String::new();
618            titled.extend(first.to_uppercase());
619            titled.push_str(&chars.as_str().to_lowercase());
620            result.push(titled);
621        }
622    }
623    result.join(" ")
624}
625
/// Split a string into words, handling camelCase, PascalCase, snake_case,
/// kebab-case, dotted.names, and spaces.
///
/// Rules:
/// - `_`, `-`, ` ` and `.` are delimiters and never appear in the output.
/// - A lowercase-or-digit to uppercase boundary starts a new word
///   (`helloWorld` -> `hello`, `World`).
/// - In an uppercase run followed by a lowercase letter, the last capital
///   starts the next word (`HTTPClient` -> `HTTP`, `Client`).
fn split_into_words(s: &str) -> Vec<String> {
    let chars: Vec<char> = s.chars().collect();
    let mut words = Vec::new();
    let mut current = String::new();

    for (i, &c) in chars.iter().enumerate() {
        // Explicit delimiters flush the word in progress.
        if matches!(c, '_' | '-' | ' ' | '.') {
            if !current.is_empty() {
                words.push(std::mem::take(&mut current));
            }
            continue;
        }

        if c.is_uppercase() && !current.is_empty() {
            // `current` is non-empty, so i >= 1 and chars[i - 1] exists.
            let prev = chars[i - 1];
            let lower_to_upper = prev.is_lowercase() || prev.is_ascii_digit();
            // End of an acronym: the next char is lowercase, so this
            // capital begins a new word ("HTTPClient" -> "HTTP" + "Client").
            let acronym_end = chars.get(i + 1).is_some_and(|next| next.is_lowercase());
            if lower_to_upper || acronym_end {
                words.push(std::mem::take(&mut current));
            }
        }
        current.push(c);
    }

    if !current.is_empty() {
        words.push(current);
    }

    words
}
679
680// ---------------------------------------------------------------------------
681// Additional string utilities
682// ---------------------------------------------------------------------------
683
684/// Compute the Dice coefficient (bigram overlap) between two strings.
685///
686/// The Dice coefficient is 2 * |intersection of bigrams| / (|bigrams_a| + |bigrams_b|).
687pub fn dice_coefficient(a: &str, b: &str) -> f64 {
688    let a_bigrams: Vec<String> = char_ngrams(a, 2).unwrap_or_default();
689    let b_bigrams: Vec<String> = char_ngrams(b, 2).unwrap_or_default();
690
691    if a_bigrams.is_empty() && b_bigrams.is_empty() {
692        return 1.0;
693    }
694    if a_bigrams.is_empty() || b_bigrams.is_empty() {
695        return 0.0;
696    }
697
698    let mut intersection = 0usize;
699    let mut b_used = vec![false; b_bigrams.len()];
700
701    for bg_a in &a_bigrams {
702        for (j, bg_b) in b_bigrams.iter().enumerate() {
703            if !b_used[j] && bg_a == bg_b {
704                intersection += 1;
705                b_used[j] = true;
706                break;
707            }
708        }
709    }
710
711    2.0 * intersection as f64 / (a_bigrams.len() + b_bigrams.len()) as f64
712}
713
/// Pad a string on the left to a given width (measured in chars).
///
/// Returns the input unchanged when it is already `width` chars or longer.
pub fn pad_left(s: &str, width: usize, fill: char) -> String {
    let len = s.chars().count();
    if len >= width {
        return s.to_string();
    }
    let mut out = String::with_capacity(width);
    for _ in 0..width - len {
        out.push(fill);
    }
    out.push_str(s);
    out
}
723
/// Pad a string on the right to a given width (measured in chars).
///
/// Returns the input unchanged when it is already `width` chars or longer.
pub fn pad_right(s: &str, width: usize, fill: char) -> String {
    let len = s.chars().count();
    if len >= width {
        return s.to_string();
    }
    let mut out = String::with_capacity(width);
    out.push_str(s);
    out.extend(std::iter::repeat_n(fill, width - len));
    out
}
733
/// Center a string within a given width (measured in chars).
///
/// When the padding is odd, the extra fill character goes on the right.
/// Returns the input unchanged when it is already `width` chars or longer.
pub fn center(s: &str, width: usize, fill: char) -> String {
    let len = s.chars().count();
    if len >= width {
        return s.to_string();
    }
    let deficit = width - len;
    let left = deficit / 2;
    let mut out = String::with_capacity(width);
    out.extend(std::iter::repeat_n(fill, left));
    out.push_str(s);
    out.extend(std::iter::repeat_n(fill, deficit - left));
    out
}
747
/// Reverse a string by Unicode scalar value (`char`).
pub fn reverse(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars().rev() {
        out.push(c);
    }
    out
}
752
/// Check if a string is a palindrome (case-insensitive, alphanumeric only).
///
/// Non-alphanumeric characters are ignored; remaining characters are
/// lowercased (first scalar of the lowercase mapping) before comparison.
pub fn is_palindrome(s: &str) -> bool {
    let normalized: Vec<char> = s
        .chars()
        .filter(|c| c.is_alphanumeric())
        .map(|c| c.to_lowercase().next().unwrap_or(c))
        .collect();
    // A sequence equals its own reverse exactly when it is a palindrome;
    // this also covers the empty and single-char cases.
    normalized.iter().eq(normalized.iter().rev())
}
771
/// Count non-overlapping occurrences of a substring.
///
/// An empty pattern yields 0 rather than infinitely many matches.
pub fn count_occurrences(text: &str, pattern: &str) -> usize {
    match pattern {
        "" => 0,
        p => text.matches(p).count(),
    }
}
779
780// ---------------------------------------------------------------------------
781// Tests
782// ---------------------------------------------------------------------------
783
// Unit tests for the string_ops module, grouped by the section headers below.
#[cfg(test)]
mod tests {
    use super::*;

    // -- Levenshtein --

    #[test]
    fn test_levenshtein_identical() {
        assert_eq!(levenshtein_distance("hello", "hello"), 0);
    }

    #[test]
    fn test_levenshtein_empty() {
        assert_eq!(levenshtein_distance("", "abc"), 3);
        assert_eq!(levenshtein_distance("xyz", ""), 3);
        assert_eq!(levenshtein_distance("", ""), 0);
    }

    #[test]
    fn test_levenshtein_known() {
        assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
        assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
    }

    #[test]
    fn test_normalized_levenshtein() {
        assert!((normalized_levenshtein("abc", "abc") - 0.0).abs() < 1e-10);
        assert!((normalized_levenshtein("", "") - 0.0).abs() < 1e-10);
    }

    #[test]
    fn test_levenshtein_similarity() {
        assert!((levenshtein_similarity("abc", "abc") - 1.0).abs() < 1e-10);
    }

    // -- Hamming --

    #[test]
    fn test_hamming_basic() {
        assert_eq!(hamming_distance("karolin", "kathrin").expect("ok"), 3);
        assert_eq!(hamming_distance("abc", "abc").expect("ok"), 0);
    }

    #[test]
    fn test_hamming_unequal_length() {
        assert!(hamming_distance("ab", "abc").is_err());
    }

    // -- Jaro / Jaro-Winkler --

    #[test]
    fn test_jaro_identical() {
        assert!((jaro_similarity("abc", "abc") - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_jaro_empty() {
        assert!((jaro_similarity("", "") - 1.0).abs() < 1e-10);
        assert!((jaro_similarity("abc", "") - 0.0).abs() < 1e-10);
    }

    #[test]
    fn test_jaro_known() {
        // Classic textbook pair: jaro("martha", "marhta") = 17/18 ~ 0.9444.
        let sim = jaro_similarity("martha", "marhta");
        assert!((sim - 0.9444).abs() < 0.001, "jaro: {}", sim);
    }

    #[test]
    fn test_jaro_winkler() {
        let sim = jaro_winkler_similarity("martha", "marhta", 0.1);
        assert!(sim > 0.94, "jw: {}", sim);
        // Winkler bonus should make it >= Jaro
        assert!(sim >= jaro_similarity("martha", "marhta"));
    }

    #[test]
    fn test_jaro_winkler_different() {
        let sim = jaro_winkler_similarity("abc", "xyz", 0.1);
        assert!(sim < 0.1);
    }

    // -- LCS --

    #[test]
    fn test_lcs_length() {
        assert_eq!(longest_common_subsequence("abcde", "ace"), 3);
        assert_eq!(longest_common_subsequence("abc", "def"), 0);
        assert_eq!(longest_common_subsequence("", "abc"), 0);
    }

    #[test]
    fn test_lcs_string() {
        assert_eq!(lcs_string("abcde", "ace"), "ace");
        assert_eq!(lcs_string("abc", "def"), "");
    }

    #[test]
    fn test_lcs_similarity() {
        assert!((lcs_similarity("abc", "abc") - 1.0).abs() < 1e-10);
        assert!((lcs_similarity("", "") - 1.0).abs() < 1e-10);
    }

    // -- N-grams --

    #[test]
    fn test_word_ngrams() {
        let tokens = vec!["a", "b", "c", "d"];
        let bigrams = ngrams(&tokens, 2).expect("bigrams");
        assert_eq!(bigrams.len(), 3);
        assert_eq!(bigrams[0], vec!["a", "b"]);
        assert_eq!(bigrams[2], vec!["c", "d"]);
    }

    #[test]
    fn test_word_trigrams() {
        let tokens = vec!["a", "b", "c", "d"];
        let trigrams = ngrams(&tokens, 3).expect("trigrams");
        assert_eq!(trigrams.len(), 2);
    }

    #[test]
    fn test_ngrams_zero() {
        assert!(ngrams(&["a"], 0).is_err());
    }

    #[test]
    fn test_ngrams_too_short() {
        let tokens = vec!["a"];
        let result = ngrams(&tokens, 3).expect("ok");
        assert!(result.is_empty());
    }

    #[test]
    fn test_char_ngrams() {
        let grams = char_ngrams("hello", 2).expect("ok");
        assert_eq!(grams, vec!["he", "el", "ll", "lo"]);
    }

    #[test]
    fn test_char_ngrams_zero() {
        assert!(char_ngrams("hello", 0).is_err());
    }

    #[test]
    fn test_skip_bigrams() {
        let tokens = vec!["a", "b", "c"];
        let result = skip_bigrams(&tokens, 1);
        // skip=1: (a,b),(a,c),(b,c) = 3 pairs
        assert_eq!(result.len(), 3);
        assert!(result.contains(&("a", "b")));
        assert!(result.contains(&("a", "c")));
        assert!(result.contains(&("b", "c")));
    }

    // -- Tokenization --

    #[test]
    fn test_tokenize_whitespace() {
        let tokens = tokenize_whitespace("  hello   world  ");
        assert_eq!(tokens, vec!["hello", "world"]);
    }

    #[test]
    fn test_tokenize_char() {
        let tokens = tokenize_char("a,b,,c", ',');
        assert_eq!(tokens, vec!["a", "b", "", "c"]);
    }

    #[test]
    fn test_tokenize_pattern() {
        let tokens = tokenize_pattern("a::b::c", "::");
        assert_eq!(tokens, vec!["a", "b", "c"]);
    }

    #[test]
    fn test_tokenize_sentences() {
        let sentences = tokenize_sentences("Hello world. How are you? Fine!");
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world");
    }

    #[test]
    fn test_tokenize_words() {
        // The apostrophe in "It's" is a delimiter, so it yields "It" and "s".
        let tokens = tokenize_words("Hello, world! It's 2026.");
        assert_eq!(tokens, vec!["Hello", "world", "It", "s", "2026"]);
    }

    // -- Case conversions --

    #[test]
    fn test_to_snake_case() {
        assert_eq!(to_snake_case("helloWorld"), "hello_world");
        assert_eq!(to_snake_case("HTTPClient"), "http_client");
        assert_eq!(to_snake_case("already_snake"), "already_snake");
        assert_eq!(to_snake_case("SomeClassName"), "some_class_name");
    }

    #[test]
    fn test_to_camel_case() {
        assert_eq!(to_camel_case("hello_world"), "helloWorld");
        assert_eq!(to_camel_case("some-kebab-case"), "someKebabCase");
        assert_eq!(to_camel_case("SCREAMING_SNAKE"), "screamingSnake");
    }

    #[test]
    fn test_to_pascal_case() {
        assert_eq!(to_pascal_case("hello_world"), "HelloWorld");
        assert_eq!(to_pascal_case("some-kebab"), "SomeKebab");
    }

    #[test]
    fn test_to_kebab_case() {
        assert_eq!(to_kebab_case("helloWorld"), "hello-world");
        assert_eq!(to_kebab_case("SomeClassName"), "some-class-name");
    }

    #[test]
    fn test_to_screaming_snake_case() {
        assert_eq!(to_screaming_snake_case("helloWorld"), "HELLO_WORLD");
        assert_eq!(to_screaming_snake_case("some-kebab"), "SOME_KEBAB");
    }

    #[test]
    fn test_to_title_case() {
        assert_eq!(to_title_case("hello_world"), "Hello World");
        assert_eq!(to_title_case("helloWorld"), "Hello World");
    }

    #[test]
    fn test_split_into_words() {
        let words = split_into_words("helloWorldFoo");
        assert_eq!(words, vec!["hello", "World", "Foo"]);

        let words2 = split_into_words("snake_case_var");
        assert_eq!(words2, vec!["snake", "case", "var"]);

        let words3 = split_into_words("kebab-case-var");
        assert_eq!(words3, vec!["kebab", "case", "var"]);
    }

    // -- Dice coefficient --

    #[test]
    fn test_dice_coefficient_identical() {
        assert!((dice_coefficient("night", "night") - 1.0).abs() < 1e-10);
    }

    #[test]
    fn test_dice_coefficient_different() {
        let d = dice_coefficient("abc", "xyz");
        assert!(d < 0.01);
    }

    // -- Padding --

    #[test]
    fn test_pad_left() {
        assert_eq!(pad_left("42", 5, '0'), "00042");
        assert_eq!(pad_left("hello", 3, ' '), "hello"); // no pad needed
    }

    #[test]
    fn test_pad_right() {
        assert_eq!(pad_right("hi", 5, '.'), "hi...");
    }

    #[test]
    fn test_center() {
        assert_eq!(center("hi", 6, '-'), "--hi--");
        assert_eq!(center("hi", 7, '-'), "--hi---");
    }

    // -- Misc --

    #[test]
    fn test_reverse() {
        assert_eq!(reverse("hello"), "olleh");
        assert_eq!(reverse(""), "");
    }

    #[test]
    fn test_is_palindrome() {
        assert!(is_palindrome("racecar"));
        assert!(is_palindrome("A man a plan a canal Panama"));
        assert!(!is_palindrome("hello"));
        assert!(is_palindrome(""));
    }

    #[test]
    fn test_count_occurrences() {
        assert_eq!(count_occurrences("abababab", "ab"), 4);
        assert_eq!(count_occurrences("hello", "xyz"), 0);
        assert_eq!(count_occurrences("hello", ""), 0);
    }

    // -- Unicode support --

    #[test]
    fn test_levenshtein_unicode() {
        assert_eq!(levenshtein_distance("cafe", "cafe"), 0);
        // NOTE(review): the literals below are plain ASCII despite the test
        // name and the "Japanese characters" comment — the original non-ASCII
        // strings appear to have been lost in transcription. Restore genuine
        // Unicode inputs (e.g. accented/CJK text) so this test exercises
        // multi-byte chars.
        // Japanese characters
        assert_eq!(levenshtein_distance("abc", "abc"), 0);
    }

    #[test]
    fn test_char_ngrams_unicode() {
        // NOTE(review): "ab" is ASCII; like the test above, the original
        // non-ASCII input seems to have been mangled. Replace with a真ly
        // multi-byte string to verify char (not byte) windows — TODO confirm
        // against version history.
        let grams = char_ngrams("ab", 2).expect("ok");
        assert_eq!(grams.len(), 1);
        assert_eq!(grams[0], "ab");
    }
}