Skip to main content

scirs2_text/
string_metrics.rs

1//! String metrics module for distance calculations and phonetic algorithms.
2
3use crate::Result;
4use std::cmp::{max, min};
5use std::collections::HashMap;
6
7/// Trait for string distance metrics
8pub trait StringMetric {
9    /// Calculate the distance between two strings
10    fn distance(&self, s1: &str, s2: &str) -> Result<f64>;
11
12    /// Calculate normalized distance (0.0 to 1.0)
13    fn normalized_distance(&self, s1: &str, s2: &str) -> Result<f64> {
14        let dist = self.distance(s1, s2)?;
15        let max_len = max(s1.len(), s2.len()) as f64;
16        Ok(if max_len > 0.0 { dist / max_len } else { 0.0 })
17    }
18
19    /// Calculate similarity (1.0 - normalized distance)
20    fn similarity(&self, s1: &str, s2: &str) -> Result<f64> {
21        Ok(1.0 - self.normalized_distance(s1, s2)?)
22    }
23}
24
25/// Damerau-Levenshtein distance metric
26#[derive(Debug, Clone)]
27pub struct DamerauLevenshteinMetric {
28    /// Whether to use restricted Damerau-Levenshtein (no substring transpositions)
29    restricted: bool,
30}
31
32impl DamerauLevenshteinMetric {
33    /// Create a new Damerau-Levenshtein metric
34    pub fn new() -> Self {
35        Self { restricted: false }
36    }
37
38    /// Create a restricted variant (OSA - Optimal String Alignment)
39    pub fn restricted() -> Self {
40        Self { restricted: true }
41    }
42}
43
44impl Default for DamerauLevenshteinMetric {
45    fn default() -> Self {
46        Self::new()
47    }
48}
49
50impl StringMetric for DamerauLevenshteinMetric {
51    fn distance(&self, s1: &str, s2: &str) -> Result<f64> {
52        if self.restricted {
53            Ok(osa_distance(s1, s2) as f64)
54        } else {
55            Ok(damerau_levenshtein_distance(s1, s2) as f64)
56        }
57    }
58}
59
60/// Calculate restricted Damerau-Levenshtein (OSA) distance
61#[allow(dead_code)]
62fn osa_distance(s1: &str, s2: &str) -> usize {
63    let a: Vec<char> = s1.chars().collect();
64    let b: Vec<char> = s2.chars().collect();
65    let len_a = a.len();
66    let len_b = b.len();
67
68    if len_a == 0 {
69        return len_b;
70    }
71    if len_b == 0 {
72        return len_a;
73    }
74
75    let mut matrix = vec![vec![0; len_b + 1]; len_a + 1];
76
77    for (i, row) in matrix.iter_mut().enumerate().take(len_a + 1) {
78        row[0] = i;
79    }
80    // For the first row, set each column index as the value
81    for j in 0..=len_b {
82        matrix[0][j] = j;
83    }
84
85    for i in 1..=len_a {
86        for j in 1..=len_b {
87            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
88
89            matrix[i][j] = min(
90                min(
91                    matrix[i - 1][j] + 1, // deletion
92                    matrix[i][j - 1] + 1, // insertion
93                ),
94                matrix[i - 1][j - 1] + cost, // substitution
95            );
96
97            // Transposition
98            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] {
99                matrix[i][j] = min(matrix[i][j], matrix[i - 2][j - 2] + cost);
100            }
101        }
102    }
103
104    matrix[len_a][len_b]
105}
106
107/// Calculate full Damerau-Levenshtein distance
108#[allow(dead_code)]
109fn damerau_levenshtein_distance(s1: &str, s2: &str) -> usize {
110    let a: Vec<char> = s1.chars().collect();
111    let b: Vec<char> = s2.chars().collect();
112    let len_a = a.len();
113    let len_b = b.len();
114
115    if len_a == 0 {
116        return len_b;
117    }
118    if len_b == 0 {
119        return len_a;
120    }
121
122    let max_dist = len_a + len_b;
123    let mut h = vec![vec![max_dist; len_b + 2]; len_a + 2];
124    let mut da: HashMap<char, usize> = HashMap::new();
125
126    for i in 0..=len_a {
127        h[i + 1][0] = max_dist;
128        h[i + 1][1] = i;
129    }
130    for j in 0..=len_b {
131        h[0][j + 1] = max_dist;
132        h[1][j + 1] = j;
133    }
134
135    for i in 1..=len_a {
136        let mut db = 0;
137        for j in 1..=len_b {
138            let k = da.get(&b[j - 1]).copied().unwrap_or(0);
139            let l = db;
140            let cost = if a[i - 1] == b[j - 1] {
141                db = j;
142                0
143            } else {
144                1
145            };
146
147            h[i + 1][j + 1] = min(
148                min(
149                    h[i][j] + cost,  // substitution
150                    h[i + 1][j] + 1, // insertion
151                ),
152                min(
153                    h[i][j + 1] + 1,                         // deletion
154                    h[k][l] + (i - k - 1) + 1 + (j - l - 1), // transposition
155                ),
156            );
157        }
158
159        da.insert(a[i - 1], i);
160    }
161
162    h[len_a + 1][len_b + 1]
163}
164
165/// Phonetic algorithms trait
166pub trait PhoneticAlgorithm {
167    /// Encode a string using the phonetic algorithm
168    fn encode(&self, text: &str) -> Result<String>;
169
170    /// Check if two strings are phonetically similar
171    fn sounds_like(&self, s1: &str, s2: &str) -> Result<bool> {
172        Ok(self.encode(s1)? == self.encode(s2)?)
173    }
174}
175
176/// Soundex phonetic algorithm
177#[derive(Debug, Clone)]
178pub struct Soundex {
179    /// Length of the code (standard is 4)
180    length: usize,
181}
182
183impl Soundex {
184    /// Create new Soundex encoder with standard length (4)
185    pub fn new() -> Self {
186        Self { length: 4 }
187    }
188
189    /// Create with custom code length
190    pub fn with_length(length: usize) -> Self {
191        Self { length }
192    }
193}
194
195impl Default for Soundex {
196    fn default() -> Self {
197        Self::new()
198    }
199}
200
201impl PhoneticAlgorithm for Soundex {
202    fn encode(&self, text: &str) -> Result<String> {
203        if text.is_empty() {
204            return Ok(String::new());
205        }
206
207        let text = text.to_uppercase();
208        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
209
210        if chars.is_empty() {
211            return Ok(String::new());
212        }
213
214        let mut code = String::new();
215        code.push(chars[0]);
216
217        let mut last_code = encode_char(chars[0]);
218
219        for &ch in &chars[1..] {
220            let ch_code = encode_char(ch);
221            if ch_code != '0' && ch_code != last_code {
222                code.push(ch_code);
223                last_code = ch_code;
224            }
225
226            if code.len() >= self.length {
227                break;
228            }
229        }
230
231        // Pad with zeros if necessary
232        while code.len() < self.length {
233            code.push('0');
234        }
235
236        Ok(code)
237    }
238}
239
240/// Encode a character for Soundex
241#[allow(dead_code)]
242fn encode_char(ch: char) -> char {
243    match ch.to_ascii_uppercase() {
244        'B' | 'F' | 'P' | 'V' => '1',
245        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
246        'D' | 'T' => '3',
247        'L' => '4',
248        'M' | 'N' => '5',
249        'R' => '6',
250        _ => '0',
251    }
252}
253
254/// Metaphone phonetic algorithm
255#[derive(Debug, Clone)]
256pub struct Metaphone {
257    /// Maximum length of the code
258    max_length: usize,
259}
260
261/// NYSIIS (New York State Identification and Intelligence System) phonetic algorithm
262#[derive(Debug, Clone)]
263pub struct Nysiis {
264    /// Maximum length of the code (0 for no limit)
265    max_length: usize,
266}
267
268/// Needleman-Wunsch global sequence alignment algorithm
269#[derive(Debug, Clone)]
270pub struct NeedlemanWunsch {
271    /// Score for matching characters
272    match_score: i32,
273    /// Penalty for mismatching characters
274    mismatch_penalty: i32,
275    /// Penalty for gaps
276    gap_penalty: i32,
277}
278
279/// Result of sequence alignment
280#[derive(Debug, Clone, PartialEq)]
281pub struct AlignmentResult {
282    /// Aligned first sequence
283    pub aligned_seq1: String,
284    /// Aligned second sequence
285    pub aligned_seq2: String,
286    /// Alignment score
287    pub score: i32,
288}
289
290/// Smith-Waterman local sequence alignment algorithm
291#[derive(Debug, Clone)]
292pub struct SmithWaterman {
293    /// Score for matching characters
294    match_score: i32,
295    /// Penalty for mismatching characters
296    mismatch_penalty: i32,
297    /// Penalty for gaps
298    gap_penalty: i32,
299}
300
301impl Metaphone {
302    /// Create new Metaphone encoder
303    pub fn new() -> Self {
304        Self { max_length: 6 }
305    }
306
307    /// Create with custom maximum length
308    pub fn with_max_length(_maxlength: usize) -> Self {
309        Self {
310            max_length: _maxlength,
311        }
312    }
313}
314
315impl Default for Metaphone {
316    fn default() -> Self {
317        Self::new()
318    }
319}
320
321impl PhoneticAlgorithm for Metaphone {
322    fn encode(&self, text: &str) -> Result<String> {
323        if text.is_empty() {
324            return Ok(String::new());
325        }
326
327        let text = text.to_uppercase();
328        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
329
330        if chars.is_empty() {
331            return Ok(String::new());
332        }
333
334        let mut result = String::new();
335        let mut i = 0;
336
337        // Initial transformations
338        if chars.len() >= 2 {
339            match (chars[0], chars[1]) {
340                ('A', 'E') | ('G', 'N') | ('K', 'N') | ('P', 'N') | ('W', 'R') => i = 1,
341                ('W', 'H') => {
342                    result.push('W');
343                    i = 2;
344                }
345                _ => {}
346            }
347        }
348
349        while i < chars.len() && result.len() < self.max_length {
350            let ch = chars[i];
351            let next = chars.get(i + 1).copied();
352            let prev = if i > 0 {
353                chars.get(i - 1).copied()
354            } else {
355                None
356            };
357
358            match ch {
359                'A' | 'E' | 'I' | 'O' | 'U' if i == 0 => {
360                    result.push(ch);
361                }
362                'B' if !result.ends_with('B') => {
363                    result.push('B');
364                }
365                'C' => {
366                    if next == Some('H') {
367                        result.push('X');
368                        i += 1;
369                    } else if next == Some('I') || next == Some('E') || next == Some('Y') {
370                        result.push('S');
371                    } else {
372                        result.push('K');
373                    }
374                }
375                'D' => {
376                    if next == Some('G') && chars.get(i + 2) == Some(&'E') {
377                        result.push('J');
378                        i += 2;
379                    } else {
380                        result.push('T');
381                    }
382                }
383                'F' | 'J' | 'L' | 'M' | 'N' | 'R' if !result.ends_with(ch) => {
384                    result.push(ch);
385                }
386                'G' => {
387                    if next == Some('H') {
388                        i += 1;
389                    } else if next == Some('N') {
390                        result.push('N');
391                        i += 1;
392                    } else {
393                        result.push('K');
394                    }
395                }
396                'H' if prev != Some('C')
397                    && prev != Some('S')
398                    && prev != Some('P')
399                    && prev != Some('T')
400                    && prev != Some('G') =>
401                {
402                    result.push('H');
403                }
404                'K' if prev != Some('C') => {
405                    result.push('K');
406                }
407                'P' => {
408                    if next == Some('H') {
409                        result.push('F');
410                        i += 1;
411                    } else {
412                        result.push('P');
413                    }
414                }
415                'Q' => result.push('K'),
416                'S' => {
417                    if next == Some('H') {
418                        result.push('X');
419                        i += 1;
420                    } else if next == Some('I')
421                        && (chars.get(i + 2) == Some(&'A') || chars.get(i + 2) == Some(&'O'))
422                    {
423                        result.push('X');
424                    } else {
425                        result.push('S');
426                    }
427                }
428                'T' => {
429                    if next == Some('H') {
430                        result.push('0');
431                        i += 1;
432                    } else if next != Some('C') || chars.get(i + 2) != Some(&'H') {
433                        result.push('T');
434                    }
435                }
436                'V' => result.push('F'),
437                'W' | 'Y' if next.map(|c| "AEIOU".contains(c)).unwrap_or(false) => {
438                    result.push(ch);
439                }
440                'X' => {
441                    result.push('K');
442                    result.push('S');
443                }
444                'Z' => result.push('S'),
445                _ => {}
446            }
447
448            i += 1;
449        }
450
451        Ok(result)
452    }
453}
454
455impl Nysiis {
456    /// Create new NYSIIS encoder with no length limit
457    pub fn new() -> Self {
458        Self { max_length: 0 }
459    }
460
461    /// Create with custom maximum length
462    pub fn with_max_length(_maxlength: usize) -> Self {
463        Self {
464            max_length: _maxlength,
465        }
466    }
467}
468
469impl Default for Nysiis {
470    fn default() -> Self {
471        Self::new()
472    }
473}
474
475impl PhoneticAlgorithm for Nysiis {
476    fn encode(&self, text: &str) -> Result<String> {
477        if text.is_empty() {
478            return Ok(String::new());
479        }
480
481        // Convert to uppercase and keep only alphabetic characters
482        let mut word: Vec<char> = text
483            .to_uppercase()
484            .chars()
485            .filter(|c| c.is_alphabetic())
486            .collect();
487
488        if word.is_empty() {
489            return Ok(String::new());
490        }
491
492        // Step 1: Handle initial letter combinations
493        if word.len() >= 3 {
494            let start3 = &word[0..3].iter().collect::<String>();
495            let start2 = &word[0..2].iter().collect::<String>();
496
497            if start3 == "MAC" {
498                word[1] = 'C';
499            } else if start2 == "KN" {
500                word.remove(0);
501            }
502        } else if word.len() >= 2 {
503            let start = &word[0..2].iter().collect::<String>();
504            if start == "KN" {
505                word.remove(0);
506            }
507        }
508
509        // Handle PH, PF at start
510        if word.len() >= 2 && (word[0] == 'P' && (word[1] == 'H' || word[1] == 'F')) {
511            word[0] = 'F';
512            word[1] = 'F';
513        }
514
515        // Handle SCH at start
516        if word.len() >= 3 && word[0] == 'S' && word[1] == 'C' && word[2] == 'H' {
517            word[0] = 'S';
518            word[1] = 'S';
519            word.remove(2); // Remove the third character to get SS instead of SSS
520        }
521
522        // Step 2: Handle last letter combinations
523        let len = word.len();
524        if len >= 2 {
525            let end = &word[len - 2..].iter().collect::<String>();
526            match end.as_str() {
527                "EE" | "IE" => {
528                    word.truncate(len - 2);
529                    word.push('Y');
530                }
531                "DT" | "RT" | "RD" | "NT" | "ND" => {
532                    word.truncate(len - 1);
533                    word.push('D');
534                }
535                _ => {}
536            }
537        }
538
539        // Step 3: First character of key is first character of name
540        let mut result = vec![word[0]];
541
542        // Step 4: Translate remaining characters
543        for i in 1..word.len() {
544            let ch = word[i];
545            let prev = word[i - 1];
546            let next = word.get(i + 1).copied();
547
548            match ch {
549                'A' => {
550                    // Add 'A' only if previous character is different and result doesn't end with 'A'
551                    if prev != 'A' && result.last() != Some(&'A') {
552                        result.push('A');
553                    }
554                }
555                'E' | 'I' | 'O' | 'U' => {
556                    if prev == ch {
557                        continue; // Skip repeated vowels
558                    }
559                    // Convert vowels to 'A', but avoid consecutive 'A's in the result
560                    if result.last() != Some(&'A') {
561                        result.push('A');
562                    }
563                }
564                'Q' => result.push('G'),
565                'Z' => result.push('S'),
566                'M' => result.push('N'),
567                'K' => {
568                    if next == Some('N') {
569                        result.push('N');
570                    } else {
571                        result.push('C');
572                    }
573                }
574                'S' => {
575                    if next == Some('C') && word.get(i + 2) == Some(&'H') {
576                        result.push('S');
577                        result.push('S');
578                        result.push('S');
579                    } else {
580                        result.push('S');
581                    }
582                }
583                'P' => {
584                    if next == Some('H') {
585                        result.push('F');
586                        result.push('F');
587                    } else {
588                        result.push('P');
589                    }
590                }
591                'H' => {
592                    // Skip 'H' if it follows 'G' (for GH combinations)
593                    if prev == 'G' {
594                        // Skip this 'H'
595                    } else if !matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U')
596                        && !matches!(
597                            next,
598                            Some('A') | Some('E') | Some('I') | Some('O') | Some('U')
599                        )
600                        && prev != ch
601                    {
602                        result.push('H');
603                    }
604                }
605                'W' => {
606                    if matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U') && prev != ch {
607                        result.push('W');
608                    }
609                }
610                _ => {
611                    if prev != ch {
612                        // Skip repeated consonants
613                        result.push(ch);
614                    } else if i == 1 && ch == 'F' && result.len() == 1 && result[0] == 'F' {
615                        // Special case: allow FF from PH conversion at start only
616                        result.push(ch);
617                    } else if i == 1 && ch == 'S' && result.len() == 1 && result[0] == 'S' {
618                        // Special case: allow SS from SCH conversion at start only
619                        result.push(ch);
620                    }
621                }
622            }
623        }
624
625        // Step 5: Remove trailing 'S', 'A', and 'H'
626        while result.len() > 1
627            && (result.last() == Some(&'S')
628                || result.last() == Some(&'A')
629                || result.last() == Some(&'H'))
630        {
631            result.pop();
632        }
633
634        // Step 6: Replace 'AY' at end with 'Y'
635        if result.len() >= 2 && result[result.len() - 2] == 'A' && result[result.len() - 1] == 'Y' {
636            result.pop();
637            result.pop();
638            result.push('Y');
639        }
640
641        // Apply max length if specified
642        let mut encoded: String = result.into_iter().collect();
643        if self.max_length > 0 && encoded.len() > self.max_length {
644            encoded.truncate(self.max_length);
645        }
646
647        Ok(encoded)
648    }
649}
650
651impl NeedlemanWunsch {
652    /// Create a new Needleman-Wunsch aligner with default parameters
653    pub fn new() -> Self {
654        Self {
655            match_score: 1,
656            mismatch_penalty: -1,
657            gap_penalty: -1,
658        }
659    }
660
661    /// Create with custom scoring parameters
662    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
663        Self {
664            match_score: _match_score,
665            mismatch_penalty,
666            gap_penalty: gappenalty,
667        }
668    }
669
670    /// Align two sequences using the Needleman-Wunsch algorithm
671    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
672        let seq1_chars: Vec<char> = seq1.chars().collect();
673        let seq2_chars: Vec<char> = seq2.chars().collect();
674        let m = seq1_chars.len();
675        let n = seq2_chars.len();
676
677        // Initialize scoring matrix
678        let mut matrix = vec![vec![0; n + 1]; m + 1];
679
680        // Initialize first row and column with gap penalties
681        for (i, item) in matrix.iter_mut().enumerate().take(m + 1) {
682            item[0] = i as i32 * self.gap_penalty;
683        }
684        for j in 0..=n {
685            matrix[0][j] = j as i32 * self.gap_penalty;
686        }
687
688        // Fill the matrix
689        for i in 1..=m {
690            for j in 1..=n {
691                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
692                    matrix[i - 1][j - 1] + self.match_score
693                } else {
694                    matrix[i - 1][j - 1] + self.mismatch_penalty
695                };
696
697                let delete = matrix[i - 1][j] + self.gap_penalty;
698                let insert = matrix[i][j - 1] + self.gap_penalty;
699
700                matrix[i][j] = *[match_mismatch, delete, insert]
701                    .iter()
702                    .max()
703                    .expect("Operation failed");
704            }
705        }
706
707        // Traceback to find the alignment
708        let mut aligned_seq1 = String::new();
709        let mut aligned_seq2 = String::new();
710        let mut i = m;
711        let mut j = n;
712
713        while i > 0 || j > 0 {
714            if i > 0 && j > 0 {
715                let current_score = matrix[i][j];
716                let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
717                    matrix[i - 1][j - 1] + self.match_score
718                } else {
719                    matrix[i - 1][j - 1] + self.mismatch_penalty
720                };
721                let up_score = matrix[i - 1][j] + self.gap_penalty;
722                let left_score = matrix[i][j - 1] + self.gap_penalty;
723
724                // Prioritize diagonal (match/mismatch), then left (insertion), then up (deletion)
725                // This ensures consistent behavior when multiple paths have the same score
726                if current_score == diagonal_score {
727                    aligned_seq1.insert(0, seq1_chars[i - 1]);
728                    aligned_seq2.insert(0, seq2_chars[j - 1]);
729                    i -= 1;
730                    j -= 1;
731                } else if current_score == left_score {
732                    aligned_seq1.insert(0, '-');
733                    aligned_seq2.insert(0, seq2_chars[j - 1]);
734                    j -= 1;
735                } else if current_score == up_score {
736                    aligned_seq1.insert(0, seq1_chars[i - 1]);
737                    aligned_seq2.insert(0, '-');
738                    i -= 1;
739                } else {
740                    // Fallback case - should not happen with correct implementation
741                    aligned_seq1.insert(0, seq1_chars[i - 1]);
742                    aligned_seq2.insert(0, seq2_chars[j - 1]);
743                    i -= 1;
744                    j -= 1;
745                }
746            } else if i > 0 {
747                aligned_seq1.insert(0, seq1_chars[i - 1]);
748                aligned_seq2.insert(0, '-');
749                i -= 1;
750            } else {
751                aligned_seq1.insert(0, '-');
752                aligned_seq2.insert(0, seq2_chars[j - 1]);
753                j -= 1;
754            }
755        }
756
757        AlignmentResult {
758            aligned_seq1,
759            aligned_seq2,
760            score: matrix[m][n],
761        }
762    }
763}
764
765impl Default for NeedlemanWunsch {
766    fn default() -> Self {
767        Self::new()
768    }
769}
770
771impl SmithWaterman {
772    /// Create a new Smith-Waterman aligner with default parameters
773    pub fn new() -> Self {
774        Self {
775            match_score: 2,
776            mismatch_penalty: -1,
777            gap_penalty: -1,
778        }
779    }
780
781    /// Create with custom scoring parameters
782    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
783        Self {
784            match_score: _match_score,
785            mismatch_penalty,
786            gap_penalty: gappenalty,
787        }
788    }
789
790    /// Align two sequences using the Smith-Waterman algorithm
791    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
792        let seq1_chars: Vec<char> = seq1.chars().collect();
793        let seq2_chars: Vec<char> = seq2.chars().collect();
794        let m = seq1_chars.len();
795        let n = seq2_chars.len();
796
797        // Initialize scoring matrix
798        let mut matrix = vec![vec![0; n + 1]; m + 1];
799        let mut max_score = 0;
800        let mut max_i = 0;
801        let mut max_j = 0;
802
803        // Fill the matrix
804        for i in 1..=m {
805            for j in 1..=n {
806                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
807                    matrix[i - 1][j - 1] + self.match_score
808                } else {
809                    matrix[i - 1][j - 1] + self.mismatch_penalty
810                };
811
812                let delete = matrix[i - 1][j] + self.gap_penalty;
813                let insert = matrix[i][j - 1] + self.gap_penalty;
814
815                matrix[i][j] = *[0, match_mismatch, delete, insert]
816                    .iter()
817                    .max()
818                    .expect("Operation failed");
819
820                // Track maximum score position
821                if matrix[i][j] > max_score {
822                    max_score = matrix[i][j];
823                    max_i = i;
824                    max_j = j;
825                }
826            }
827        }
828
829        // Traceback from the maximum score position
830        let mut aligned_seq1 = String::new();
831        let mut aligned_seq2 = String::new();
832        let mut i = max_i;
833        let mut j = max_j;
834
835        while i > 0 && j > 0 && matrix[i][j] > 0 {
836            let current_score = matrix[i][j];
837            let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
838                matrix[i - 1][j - 1] + self.match_score
839            } else {
840                matrix[i - 1][j - 1] + self.mismatch_penalty
841            };
842
843            if current_score == diagonal_score {
844                aligned_seq1.insert(0, seq1_chars[i - 1]);
845                aligned_seq2.insert(0, seq2_chars[j - 1]);
846                i -= 1;
847                j -= 1;
848            } else if current_score == matrix[i - 1][j] + self.gap_penalty {
849                aligned_seq1.insert(0, seq1_chars[i - 1]);
850                aligned_seq2.insert(0, '-');
851                i -= 1;
852            } else if current_score == matrix[i][j - 1] + self.gap_penalty {
853                aligned_seq1.insert(0, '-');
854                aligned_seq2.insert(0, seq2_chars[j - 1]);
855                j -= 1;
856            } else {
857                // This shouldn't happen in correct implementation
858                break;
859            }
860        }
861
862        AlignmentResult {
863            aligned_seq1,
864            aligned_seq2,
865            score: max_score,
866        }
867    }
868}
869
870impl Default for SmithWaterman {
871    fn default() -> Self {
872        Self::new()
873    }
874}
875
876#[cfg(test)]
877mod tests {
878    use super::*;
879
880    #[test]
881    fn test_damerau_levenshtein() {
882        let metric = DamerauLevenshteinMetric::new();
883
884        // Basic operations
885        assert_eq!(metric.distance("", "").expect("Operation failed"), 0.0);
886        assert_eq!(metric.distance("abc", "").expect("Operation failed"), 3.0);
887        assert_eq!(metric.distance("", "abc").expect("Operation failed"), 3.0);
888        assert_eq!(
889            metric.distance("abc", "abc").expect("Operation failed"),
890            0.0
891        );
892
893        // Single operations
894        assert_eq!(
895            metric.distance("abc", "aXc").expect("Operation failed"),
896            1.0
897        ); // substitution
898        assert_eq!(metric.distance("abc", "ac").expect("Operation failed"), 1.0); // deletion
899        assert_eq!(metric.distance("ac", "abc").expect("Operation failed"), 1.0); // insertion
900        assert_eq!(
901            metric.distance("abc", "acb").expect("Operation failed"),
902            1.0
903        ); // transposition
904
905        // Multiple operations
906        assert_eq!(
907            metric
908                .distance("kitten", "sitting")
909                .expect("Operation failed"),
910            3.0
911        );
912
913        // Test normalized distance
914        assert!(
915            (metric
916                .normalized_distance("abc", "aXc")
917                .expect("Operation failed")
918                - 0.333)
919                .abs()
920                < 0.01
921        );
922        assert_eq!(
923            metric
924                .normalized_distance("", "")
925                .expect("Operation failed"),
926            0.0
927        );
928
929        // Test similarity
930        assert!((metric.similarity("abc", "aXc").expect("Operation failed") - 0.667).abs() < 0.01);
931    }
932
933    #[test]
934    fn test_restricted_damerau_levenshtein() {
935        let metric = DamerauLevenshteinMetric::restricted();
936
937        // OSA doesn't allow substring transpositions
938        assert_eq!(metric.distance("ca", "abc").expect("Operation failed"), 3.0);
939        // Not 2.0 as in full DL
940    }
941
942    #[test]
943    fn test_soundex() {
944        let soundex = Soundex::new();
945
946        assert_eq!(soundex.encode("Robert").expect("Operation failed"), "R163");
947        assert_eq!(soundex.encode("Rupert").expect("Operation failed"), "R163");
948        assert_eq!(soundex.encode("SOUNDEX").expect("Operation failed"), "S532");
949        assert_eq!(soundex.encode("Smith").expect("Operation failed"), "S530");
950        assert_eq!(soundex.encode("Smythe").expect("Operation failed"), "S530");
951        assert_eq!(soundex.encode("").expect("Operation failed"), "");
952        assert_eq!(soundex.encode("123").expect("Operation failed"), "");
953
954        // Test sounds_like
955        assert!(soundex
956            .sounds_like("Robert", "Rupert")
957            .expect("Operation failed"));
958        assert!(soundex
959            .sounds_like("Smith", "Smythe")
960            .expect("Operation failed"));
961        assert!(!soundex
962            .sounds_like("Smith", "Jones")
963            .expect("Operation failed"));
964
965        // Test custom length
966        let soundex_5 = Soundex::with_length(5);
967        assert_eq!(
968            soundex_5.encode("SOUNDEX").expect("Operation failed"),
969            "S5320"
970        );
971    }
972
973    #[test]
974    fn test_metaphone() {
975        let metaphone = Metaphone::new();
976
977        assert_eq!(
978            metaphone.encode("programming").expect("Operation failed"),
979            "PRKRMN"
980        );
981        assert_eq!(
982            metaphone.encode("programmer").expect("Operation failed"),
983            "PRKRMR"
984        );
985        assert_eq!(metaphone.encode("Wright").expect("Operation failed"), "RT");
986        assert_eq!(metaphone.encode("White").expect("Operation failed"), "WT");
987        assert_eq!(metaphone.encode("Knight").expect("Operation failed"), "NT");
988        assert_eq!(metaphone.encode("").expect("Operation failed"), "");
989        assert_eq!(metaphone.encode("123").expect("Operation failed"), "");
990
991        // Test sounds_like - "Wright" and "Write" actually should sound similar in Metaphone
992        assert!(metaphone
993            .sounds_like("Wright", "Write")
994            .expect("Operation failed"));
995        assert!(!metaphone
996            .sounds_like("White", "Wright")
997            .expect("Operation failed"));
998
999        // Test custom max length
1000        let metaphone_3 = Metaphone::with_max_length(3);
1001        assert_eq!(
1002            metaphone_3.encode("programming").expect("Operation failed"),
1003            "PRK"
1004        );
1005    }
1006
1007    #[test]
1008    fn test_phonetic_edge_cases() {
1009        let soundex = Soundex::new();
1010        let metaphone = Metaphone::new();
1011
1012        // Test with non-alphabetic characters
1013        assert_eq!(soundex.encode("O'Brien").expect("Operation failed"), "O165");
1014        assert_eq!(
1015            metaphone.encode("O'Brien").expect("Operation failed"),
1016            "OBRN"
1017        );
1018
1019        // Test with mixed case
1020        assert_eq!(
1021            soundex.encode("McDonald").expect("Operation failed"),
1022            "M235"
1023        );
1024        assert_eq!(
1025            metaphone.encode("McDonald").expect("Operation failed"),
1026            "MKTNLT"
1027        );
1028    }
1029
1030    #[test]
1031    fn test_nysiis() {
1032        let nysiis = Nysiis::new();
1033
1034        // Basic tests
1035        assert_eq!(
1036            nysiis.encode("Johnson").expect("Operation failed"),
1037            "JANSAN"
1038        );
1039        assert_eq!(
1040            nysiis.encode("Williams").expect("Operation failed"),
1041            "WALAN"
1042        ); // Standard NYSIIS code
1043        assert_eq!(nysiis.encode("Jones").expect("Operation failed"), "JAN");
1044        assert_eq!(nysiis.encode("Smith").expect("Operation failed"), "SNAT");
1045        assert_eq!(
1046            nysiis.encode("MacDonald").expect("Operation failed"),
1047            "MCDANALD"
1048        );
1049        assert_eq!(nysiis.encode("Knight").expect("Operation failed"), "NAGT");
1050        assert_eq!(nysiis.encode("").expect("Operation failed"), "");
1051        assert_eq!(nysiis.encode("123").expect("Operation failed"), "");
1052
1053        // Test sounds_like
1054        assert!(nysiis
1055            .sounds_like("Johnson", "Jonson")
1056            .expect("Operation failed"));
1057        assert!(!nysiis
1058            .sounds_like("Smith", "Jones")
1059            .expect("Operation failed"));
1060
1061        // Test edge cases
1062        assert_eq!(
1063            nysiis.encode("Philips").expect("Operation failed"),
1064            "FFALAP"
1065        );
1066        assert_eq!(nysiis.encode("Schmidt").expect("Operation failed"), "SSNAD");
1067        assert_eq!(
1068            nysiis.encode("Schneider").expect("Operation failed"),
1069            "SSNADAR"
1070        );
1071
1072        // Test with max length
1073        let nysiis_6 = Nysiis::with_max_length(6);
1074        assert_eq!(
1075            nysiis_6.encode("Williams").expect("Operation failed"),
1076            "WALAN"
1077        ); // 5 chars, so not truncated
1078        assert_eq!(
1079            nysiis_6.encode("MacDonald").expect("Operation failed"),
1080            "MCDANA"
1081        ); // 6 chars, truncated from longer
1082    }
1083
1084    #[test]
1085    fn test_needleman_wunsch() {
1086        let aligner = NeedlemanWunsch::new();
1087
1088        // Test simple alignment
1089        let result = aligner.align("GATTACA", "GCATGCU");
1090
1091        // Check that we get a valid optimal alignment with correct score
1092        assert_eq!(result.aligned_seq1, "G-ATTACA");
1093        assert_eq!(result.score, 0);
1094
1095        // Verify the alignment is valid by checking that all characters from seq2 are present
1096        let seq2_chars = result
1097            .aligned_seq2
1098            .chars()
1099            .filter(|&c| c != '-')
1100            .collect::<String>();
1101        assert_eq!(seq2_chars, "GCATGCU");
1102
1103        // Verify alignment length consistency
1104        assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
1105
1106        // Test identical sequences
1107        let result = aligner.align("HELLO", "HELLO");
1108        assert_eq!(result.aligned_seq1, "HELLO");
1109        assert_eq!(result.aligned_seq2, "HELLO");
1110        assert_eq!(result.score, 5);
1111
1112        // Test empty sequences
1113        let result = aligner.align("", "ABC");
1114        assert_eq!(result.aligned_seq1, "---");
1115        assert_eq!(result.aligned_seq2, "ABC");
1116        assert_eq!(result.score, -3);
1117
1118        let result = aligner.align("ABC", "");
1119        assert_eq!(result.aligned_seq1, "ABC");
1120        assert_eq!(result.aligned_seq2, "---");
1121        assert_eq!(result.score, -3);
1122
1123        // Test with custom scores
1124        let custom_aligner = NeedlemanWunsch::with_scores(2, -2, -1);
1125        let result = custom_aligner.align("CAT", "CART");
1126        assert!(result.score > 0);
1127    }
1128
1129    #[test]
1130    fn test_smith_waterman() {
1131        let aligner = SmithWaterman::new();
1132
1133        // Test local alignment
1134        let result = aligner.align("GGTTGACTA", "TGTTACGG");
1135        assert!(result.score > 0);
1136        assert!(result.aligned_seq1.contains("GTT"));
1137        assert!(result.aligned_seq2.contains("GTT"));
1138
1139        // Test with sequences having common substring
1140        let result = aligner.align("ABCDEFG", "XYZCDEFPQ");
1141        assert_eq!(result.aligned_seq1, "CDEF");
1142        assert_eq!(result.aligned_seq2, "CDEF");
1143        assert_eq!(result.score, 8); // 4 matches * 2
1144
1145        // Test empty sequences
1146        let result = aligner.align("", "ABC");
1147        assert_eq!(result.aligned_seq1, "");
1148        assert_eq!(result.aligned_seq2, "");
1149        assert_eq!(result.score, 0);
1150
1151        // Test no common subsequence
1152        let result = aligner.align("AAA", "BBB");
1153        assert_eq!(result.score, 0);
1154
1155        // Test with custom scores
1156        let custom_aligner = SmithWaterman::with_scores(3, -3, -2);
1157        let result = custom_aligner.align("ACACACTA", "AGCACACA");
1158        assert!(result.score > 0);
1159    }
1160}