scirs2_text/
string_metrics.rs

1//! String metrics module for distance calculations and phonetic algorithms.
2
3use crate::Result;
4use std::cmp::{max, min};
5use std::collections::HashMap;
6
7/// Trait for string distance metrics
8pub trait StringMetric {
9    /// Calculate the distance between two strings
10    fn distance(&self, s1: &str, s2: &str) -> Result<f64>;
11
12    /// Calculate normalized distance (0.0 to 1.0)
13    fn normalized_distance(&self, s1: &str, s2: &str) -> Result<f64> {
14        let dist = self.distance(s1, s2)?;
15        let max_len = max(s1.len(), s2.len()) as f64;
16        Ok(if max_len > 0.0 { dist / max_len } else { 0.0 })
17    }
18
19    /// Calculate similarity (1.0 - normalized distance)
20    fn similarity(&self, s1: &str, s2: &str) -> Result<f64> {
21        Ok(1.0 - self.normalized_distance(s1, s2)?)
22    }
23}
24
25/// Damerau-Levenshtein distance metric
26#[derive(Debug, Clone)]
27pub struct DamerauLevenshteinMetric {
28    /// Whether to use restricted Damerau-Levenshtein (no substring transpositions)
29    restricted: bool,
30}
31
32impl DamerauLevenshteinMetric {
33    /// Create a new Damerau-Levenshtein metric
34    pub fn new() -> Self {
35        Self { restricted: false }
36    }
37
38    /// Create a restricted variant (OSA - Optimal String Alignment)
39    pub fn restricted() -> Self {
40        Self { restricted: true }
41    }
42}
43
44impl Default for DamerauLevenshteinMetric {
45    fn default() -> Self {
46        Self::new()
47    }
48}
49
50impl StringMetric for DamerauLevenshteinMetric {
51    fn distance(&self, s1: &str, s2: &str) -> Result<f64> {
52        if self.restricted {
53            Ok(osa_distance(s1, s2) as f64)
54        } else {
55            Ok(damerau_levenshtein_distance(s1, s2) as f64)
56        }
57    }
58}
59
60/// Calculate restricted Damerau-Levenshtein (OSA) distance
61#[allow(dead_code)]
62fn osa_distance(s1: &str, s2: &str) -> usize {
63    let a: Vec<char> = s1.chars().collect();
64    let b: Vec<char> = s2.chars().collect();
65    let len_a = a.len();
66    let len_b = b.len();
67
68    if len_a == 0 {
69        return len_b;
70    }
71    if len_b == 0 {
72        return len_a;
73    }
74
75    let mut matrix = vec![vec![0; len_b + 1]; len_a + 1];
76
77    for (i, row) in matrix.iter_mut().enumerate().take(len_a + 1) {
78        row[0] = i;
79    }
80    // For the first row, set each column index as the value
81    for j in 0..=len_b {
82        matrix[0][j] = j;
83    }
84
85    for i in 1..=len_a {
86        for j in 1..=len_b {
87            let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
88
89            matrix[i][j] = min(
90                min(
91                    matrix[i - 1][j] + 1, // deletion
92                    matrix[i][j - 1] + 1, // insertion
93                ),
94                matrix[i - 1][j - 1] + cost, // substitution
95            );
96
97            // Transposition
98            if i > 1 && j > 1 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1] {
99                matrix[i][j] = min(matrix[i][j], matrix[i - 2][j - 2] + cost);
100            }
101        }
102    }
103
104    matrix[len_a][len_b]
105}
106
107/// Calculate full Damerau-Levenshtein distance
108#[allow(dead_code)]
109fn damerau_levenshtein_distance(s1: &str, s2: &str) -> usize {
110    let a: Vec<char> = s1.chars().collect();
111    let b: Vec<char> = s2.chars().collect();
112    let len_a = a.len();
113    let len_b = b.len();
114
115    if len_a == 0 {
116        return len_b;
117    }
118    if len_b == 0 {
119        return len_a;
120    }
121
122    let max_dist = len_a + len_b;
123    let mut h = vec![vec![max_dist; len_b + 2]; len_a + 2];
124    let mut da: HashMap<char, usize> = HashMap::new();
125
126    for i in 0..=len_a {
127        h[i + 1][0] = max_dist;
128        h[i + 1][1] = i;
129    }
130    for j in 0..=len_b {
131        h[0][j + 1] = max_dist;
132        h[1][j + 1] = j;
133    }
134
135    for i in 1..=len_a {
136        let mut db = 0;
137        for j in 1..=len_b {
138            let k = da.get(&b[j - 1]).copied().unwrap_or(0);
139            let l = db;
140            let cost = if a[i - 1] == b[j - 1] {
141                db = j;
142                0
143            } else {
144                1
145            };
146
147            h[i + 1][j + 1] = min(
148                min(
149                    h[i][j] + cost,  // substitution
150                    h[i + 1][j] + 1, // insertion
151                ),
152                min(
153                    h[i][j + 1] + 1,                         // deletion
154                    h[k][l] + (i - k - 1) + 1 + (j - l - 1), // transposition
155                ),
156            );
157        }
158
159        da.insert(a[i - 1], i);
160    }
161
162    h[len_a + 1][len_b + 1]
163}
164
165/// Phonetic algorithms trait
166pub trait PhoneticAlgorithm {
167    /// Encode a string using the phonetic algorithm
168    fn encode(&self, text: &str) -> Result<String>;
169
170    /// Check if two strings are phonetically similar
171    fn sounds_like(&self, s1: &str, s2: &str) -> Result<bool> {
172        Ok(self.encode(s1)? == self.encode(s2)?)
173    }
174}
175
176/// Soundex phonetic algorithm
177#[derive(Debug, Clone)]
178pub struct Soundex {
179    /// Length of the code (standard is 4)
180    length: usize,
181}
182
183impl Soundex {
184    /// Create new Soundex encoder with standard length (4)
185    pub fn new() -> Self {
186        Self { length: 4 }
187    }
188
189    /// Create with custom code length
190    pub fn with_length(length: usize) -> Self {
191        Self { length }
192    }
193}
194
195impl Default for Soundex {
196    fn default() -> Self {
197        Self::new()
198    }
199}
200
201impl PhoneticAlgorithm for Soundex {
202    fn encode(&self, text: &str) -> Result<String> {
203        if text.is_empty() {
204            return Ok(String::new());
205        }
206
207        let text = text.to_uppercase();
208        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
209
210        if chars.is_empty() {
211            return Ok(String::new());
212        }
213
214        let mut code = String::new();
215        code.push(chars[0]);
216
217        let mut last_code = encode_char(chars[0]);
218
219        for &ch in &chars[1..] {
220            let ch_code = encode_char(ch);
221            if ch_code != '0' && ch_code != last_code {
222                code.push(ch_code);
223                last_code = ch_code;
224            }
225
226            if code.len() >= self.length {
227                break;
228            }
229        }
230
231        // Pad with zeros if necessary
232        while code.len() < self.length {
233            code.push('0');
234        }
235
236        Ok(code)
237    }
238}
239
240/// Encode a character for Soundex
241#[allow(dead_code)]
242fn encode_char(ch: char) -> char {
243    match ch.to_ascii_uppercase() {
244        'B' | 'F' | 'P' | 'V' => '1',
245        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
246        'D' | 'T' => '3',
247        'L' => '4',
248        'M' | 'N' => '5',
249        'R' => '6',
250        _ => '0',
251    }
252}
253
254/// Metaphone phonetic algorithm
255#[derive(Debug, Clone)]
256pub struct Metaphone {
257    /// Maximum length of the code
258    max_length: usize,
259}
260
261/// NYSIIS (New York State Identification and Intelligence System) phonetic algorithm
262#[derive(Debug, Clone)]
263pub struct Nysiis {
264    /// Maximum length of the code (0 for no limit)
265    max_length: usize,
266}
267
268/// Needleman-Wunsch global sequence alignment algorithm
269#[derive(Debug, Clone)]
270pub struct NeedlemanWunsch {
271    /// Score for matching characters
272    match_score: i32,
273    /// Penalty for mismatching characters
274    mismatch_penalty: i32,
275    /// Penalty for gaps
276    gap_penalty: i32,
277}
278
279/// Result of sequence alignment
280#[derive(Debug, Clone, PartialEq)]
281pub struct AlignmentResult {
282    /// Aligned first sequence
283    pub aligned_seq1: String,
284    /// Aligned second sequence
285    pub aligned_seq2: String,
286    /// Alignment score
287    pub score: i32,
288}
289
290/// Smith-Waterman local sequence alignment algorithm
291#[derive(Debug, Clone)]
292pub struct SmithWaterman {
293    /// Score for matching characters
294    match_score: i32,
295    /// Penalty for mismatching characters
296    mismatch_penalty: i32,
297    /// Penalty for gaps
298    gap_penalty: i32,
299}
300
301impl Metaphone {
302    /// Create new Metaphone encoder
303    pub fn new() -> Self {
304        Self { max_length: 6 }
305    }
306
307    /// Create with custom maximum length
308    pub fn with_max_length(_maxlength: usize) -> Self {
309        Self {
310            max_length: _maxlength,
311        }
312    }
313}
314
315impl Default for Metaphone {
316    fn default() -> Self {
317        Self::new()
318    }
319}
320
321impl PhoneticAlgorithm for Metaphone {
322    fn encode(&self, text: &str) -> Result<String> {
323        if text.is_empty() {
324            return Ok(String::new());
325        }
326
327        let text = text.to_uppercase();
328        let chars: Vec<char> = text.chars().filter(|c| c.is_alphabetic()).collect();
329
330        if chars.is_empty() {
331            return Ok(String::new());
332        }
333
334        let mut result = String::new();
335        let mut i = 0;
336
337        // Initial transformations
338        if chars.len() >= 2 {
339            match (chars[0], chars[1]) {
340                ('A', 'E') | ('G', 'N') | ('K', 'N') | ('P', 'N') | ('W', 'R') => i = 1,
341                ('W', 'H') => {
342                    result.push('W');
343                    i = 2;
344                }
345                _ => {}
346            }
347        }
348
349        while i < chars.len() && result.len() < self.max_length {
350            let ch = chars[i];
351            let next = chars.get(i + 1).copied();
352            let prev = if i > 0 {
353                chars.get(i - 1).copied()
354            } else {
355                None
356            };
357
358            match ch {
359                'A' | 'E' | 'I' | 'O' | 'U' => {
360                    if i == 0 {
361                        result.push(ch);
362                    }
363                }
364                'B' => {
365                    if !result.ends_with('B') {
366                        result.push('B');
367                    }
368                }
369                'C' => {
370                    if next == Some('H') {
371                        result.push('X');
372                        i += 1;
373                    } else if next == Some('I') || next == Some('E') || next == Some('Y') {
374                        result.push('S');
375                    } else {
376                        result.push('K');
377                    }
378                }
379                'D' => {
380                    if next == Some('G') && chars.get(i + 2) == Some(&'E') {
381                        result.push('J');
382                        i += 2;
383                    } else {
384                        result.push('T');
385                    }
386                }
387                'F' | 'J' | 'L' | 'M' | 'N' | 'R' => {
388                    if !result.ends_with(ch) {
389                        result.push(ch);
390                    }
391                }
392                'G' => {
393                    if next == Some('H') {
394                        i += 1;
395                    } else if next == Some('N') {
396                        result.push('N');
397                        i += 1;
398                    } else {
399                        result.push('K');
400                    }
401                }
402                'H' => {
403                    if prev != Some('C')
404                        && prev != Some('S')
405                        && prev != Some('P')
406                        && prev != Some('T')
407                        && prev != Some('G')
408                    {
409                        result.push('H');
410                    }
411                }
412                'K' => {
413                    if prev != Some('C') {
414                        result.push('K');
415                    }
416                }
417                'P' => {
418                    if next == Some('H') {
419                        result.push('F');
420                        i += 1;
421                    } else {
422                        result.push('P');
423                    }
424                }
425                'Q' => result.push('K'),
426                'S' => {
427                    if next == Some('H') {
428                        result.push('X');
429                        i += 1;
430                    } else if next == Some('I')
431                        && (chars.get(i + 2) == Some(&'A') || chars.get(i + 2) == Some(&'O'))
432                    {
433                        result.push('X');
434                    } else {
435                        result.push('S');
436                    }
437                }
438                'T' => {
439                    if next == Some('H') {
440                        result.push('0');
441                        i += 1;
442                    } else if next != Some('C') || chars.get(i + 2) != Some(&'H') {
443                        result.push('T');
444                    }
445                }
446                'V' => result.push('F'),
447                'W' | 'Y' => {
448                    if next.map(|c| "AEIOU".contains(c)).unwrap_or(false) {
449                        result.push(ch);
450                    }
451                }
452                'X' => {
453                    result.push('K');
454                    result.push('S');
455                }
456                'Z' => result.push('S'),
457                _ => {}
458            }
459
460            i += 1;
461        }
462
463        Ok(result)
464    }
465}
466
467impl Nysiis {
468    /// Create new NYSIIS encoder with no length limit
469    pub fn new() -> Self {
470        Self { max_length: 0 }
471    }
472
473    /// Create with custom maximum length
474    pub fn with_max_length(_maxlength: usize) -> Self {
475        Self {
476            max_length: _maxlength,
477        }
478    }
479}
480
481impl Default for Nysiis {
482    fn default() -> Self {
483        Self::new()
484    }
485}
486
487impl PhoneticAlgorithm for Nysiis {
488    fn encode(&self, text: &str) -> Result<String> {
489        if text.is_empty() {
490            return Ok(String::new());
491        }
492
493        // Convert to uppercase and keep only alphabetic characters
494        let mut word: Vec<char> = text
495            .to_uppercase()
496            .chars()
497            .filter(|c| c.is_alphabetic())
498            .collect();
499
500        if word.is_empty() {
501            return Ok(String::new());
502        }
503
504        // Step 1: Handle initial letter combinations
505        if word.len() >= 3 {
506            let start3 = &word[0..3].iter().collect::<String>();
507            let start2 = &word[0..2].iter().collect::<String>();
508
509            if start3 == "MAC" {
510                word[1] = 'C';
511            } else if start2 == "KN" {
512                word.remove(0);
513            }
514        } else if word.len() >= 2 {
515            let start = &word[0..2].iter().collect::<String>();
516            if start == "KN" {
517                word.remove(0);
518            }
519        }
520
521        // Handle PH, PF at start
522        if word.len() >= 2 && (word[0] == 'P' && (word[1] == 'H' || word[1] == 'F')) {
523            word[0] = 'F';
524            word[1] = 'F';
525        }
526
527        // Handle SCH at start
528        if word.len() >= 3 && word[0] == 'S' && word[1] == 'C' && word[2] == 'H' {
529            word[0] = 'S';
530            word[1] = 'S';
531            word.remove(2); // Remove the third character to get SS instead of SSS
532        }
533
534        // Step 2: Handle last letter combinations
535        let len = word.len();
536        if len >= 2 {
537            let end = &word[len - 2..].iter().collect::<String>();
538            match end.as_str() {
539                "EE" | "IE" => {
540                    word.truncate(len - 2);
541                    word.push('Y');
542                }
543                "DT" | "RT" | "RD" | "NT" | "ND" => {
544                    word.truncate(len - 1);
545                    word.push('D');
546                }
547                _ => {}
548            }
549        }
550
551        // Step 3: First character of key is first character of name
552        let mut result = vec![word[0]];
553
554        // Step 4: Translate remaining characters
555        for i in 1..word.len() {
556            let ch = word[i];
557            let prev = word[i - 1];
558            let next = word.get(i + 1).copied();
559
560            match ch {
561                'A' => {
562                    // Add 'A' only if previous character is different and result doesn't end with 'A'
563                    if prev != 'A' && result.last() != Some(&'A') {
564                        result.push('A');
565                    }
566                }
567                'E' | 'I' | 'O' | 'U' => {
568                    if prev == ch {
569                        continue; // Skip repeated vowels
570                    }
571                    // Convert vowels to 'A', but avoid consecutive 'A's in the result
572                    if result.last() != Some(&'A') {
573                        result.push('A');
574                    }
575                }
576                'Q' => result.push('G'),
577                'Z' => result.push('S'),
578                'M' => result.push('N'),
579                'K' => {
580                    if next == Some('N') {
581                        result.push('N');
582                    } else {
583                        result.push('C');
584                    }
585                }
586                'S' => {
587                    if next == Some('C') && word.get(i + 2) == Some(&'H') {
588                        result.push('S');
589                        result.push('S');
590                        result.push('S');
591                    } else {
592                        result.push('S');
593                    }
594                }
595                'P' => {
596                    if next == Some('H') {
597                        result.push('F');
598                        result.push('F');
599                    } else {
600                        result.push('P');
601                    }
602                }
603                'H' => {
604                    // Skip 'H' if it follows 'G' (for GH combinations)
605                    if prev == 'G' {
606                        // Skip this 'H'
607                    } else if !matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U')
608                        && !matches!(
609                            next,
610                            Some('A') | Some('E') | Some('I') | Some('O') | Some('U')
611                        )
612                        && prev != ch
613                    {
614                        result.push('H');
615                    }
616                }
617                'W' => {
618                    if matches!(prev, 'A' | 'E' | 'I' | 'O' | 'U') && prev != ch {
619                        result.push('W');
620                    }
621                }
622                _ => {
623                    if prev != ch {
624                        // Skip repeated consonants
625                        result.push(ch);
626                    } else if i == 1 && ch == 'F' && result.len() == 1 && result[0] == 'F' {
627                        // Special case: allow FF from PH conversion at start only
628                        result.push(ch);
629                    } else if i == 1 && ch == 'S' && result.len() == 1 && result[0] == 'S' {
630                        // Special case: allow SS from SCH conversion at start only
631                        result.push(ch);
632                    }
633                }
634            }
635        }
636
637        // Step 5: Remove trailing 'S', 'A', and 'H'
638        while result.len() > 1
639            && (result.last() == Some(&'S')
640                || result.last() == Some(&'A')
641                || result.last() == Some(&'H'))
642        {
643            result.pop();
644        }
645
646        // Step 6: Replace 'AY' at end with 'Y'
647        if result.len() >= 2 && result[result.len() - 2] == 'A' && result[result.len() - 1] == 'Y' {
648            result.pop();
649            result.pop();
650            result.push('Y');
651        }
652
653        // Apply max length if specified
654        let mut encoded: String = result.into_iter().collect();
655        if self.max_length > 0 && encoded.len() > self.max_length {
656            encoded.truncate(self.max_length);
657        }
658
659        Ok(encoded)
660    }
661}
662
663impl NeedlemanWunsch {
664    /// Create a new Needleman-Wunsch aligner with default parameters
665    pub fn new() -> Self {
666        Self {
667            match_score: 1,
668            mismatch_penalty: -1,
669            gap_penalty: -1,
670        }
671    }
672
673    /// Create with custom scoring parameters
674    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
675        Self {
676            match_score: _match_score,
677            mismatch_penalty,
678            gap_penalty: gappenalty,
679        }
680    }
681
682    /// Align two sequences using the Needleman-Wunsch algorithm
683    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
684        let seq1_chars: Vec<char> = seq1.chars().collect();
685        let seq2_chars: Vec<char> = seq2.chars().collect();
686        let m = seq1_chars.len();
687        let n = seq2_chars.len();
688
689        // Initialize scoring matrix
690        let mut matrix = vec![vec![0; n + 1]; m + 1];
691
692        // Initialize first row and column with gap penalties
693        for (i, item) in matrix.iter_mut().enumerate().take(m + 1) {
694            item[0] = i as i32 * self.gap_penalty;
695        }
696        for j in 0..=n {
697            matrix[0][j] = j as i32 * self.gap_penalty;
698        }
699
700        // Fill the matrix
701        for i in 1..=m {
702            for j in 1..=n {
703                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
704                    matrix[i - 1][j - 1] + self.match_score
705                } else {
706                    matrix[i - 1][j - 1] + self.mismatch_penalty
707                };
708
709                let delete = matrix[i - 1][j] + self.gap_penalty;
710                let insert = matrix[i][j - 1] + self.gap_penalty;
711
712                matrix[i][j] = *[match_mismatch, delete, insert]
713                    .iter()
714                    .max()
715                    .expect("Operation failed");
716            }
717        }
718
719        // Traceback to find the alignment
720        let mut aligned_seq1 = String::new();
721        let mut aligned_seq2 = String::new();
722        let mut i = m;
723        let mut j = n;
724
725        while i > 0 || j > 0 {
726            if i > 0 && j > 0 {
727                let current_score = matrix[i][j];
728                let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
729                    matrix[i - 1][j - 1] + self.match_score
730                } else {
731                    matrix[i - 1][j - 1] + self.mismatch_penalty
732                };
733                let up_score = matrix[i - 1][j] + self.gap_penalty;
734                let left_score = matrix[i][j - 1] + self.gap_penalty;
735
736                // Prioritize diagonal (match/mismatch), then left (insertion), then up (deletion)
737                // This ensures consistent behavior when multiple paths have the same score
738                if current_score == diagonal_score {
739                    aligned_seq1.insert(0, seq1_chars[i - 1]);
740                    aligned_seq2.insert(0, seq2_chars[j - 1]);
741                    i -= 1;
742                    j -= 1;
743                } else if current_score == left_score {
744                    aligned_seq1.insert(0, '-');
745                    aligned_seq2.insert(0, seq2_chars[j - 1]);
746                    j -= 1;
747                } else if current_score == up_score {
748                    aligned_seq1.insert(0, seq1_chars[i - 1]);
749                    aligned_seq2.insert(0, '-');
750                    i -= 1;
751                } else {
752                    // Fallback case - should not happen with correct implementation
753                    aligned_seq1.insert(0, seq1_chars[i - 1]);
754                    aligned_seq2.insert(0, seq2_chars[j - 1]);
755                    i -= 1;
756                    j -= 1;
757                }
758            } else if i > 0 {
759                aligned_seq1.insert(0, seq1_chars[i - 1]);
760                aligned_seq2.insert(0, '-');
761                i -= 1;
762            } else {
763                aligned_seq1.insert(0, '-');
764                aligned_seq2.insert(0, seq2_chars[j - 1]);
765                j -= 1;
766            }
767        }
768
769        AlignmentResult {
770            aligned_seq1,
771            aligned_seq2,
772            score: matrix[m][n],
773        }
774    }
775}
776
777impl Default for NeedlemanWunsch {
778    fn default() -> Self {
779        Self::new()
780    }
781}
782
783impl SmithWaterman {
784    /// Create a new Smith-Waterman aligner with default parameters
785    pub fn new() -> Self {
786        Self {
787            match_score: 2,
788            mismatch_penalty: -1,
789            gap_penalty: -1,
790        }
791    }
792
793    /// Create with custom scoring parameters
794    pub fn with_scores(_match_score: i32, mismatch_penalty: i32, gappenalty: i32) -> Self {
795        Self {
796            match_score: _match_score,
797            mismatch_penalty,
798            gap_penalty: gappenalty,
799        }
800    }
801
802    /// Align two sequences using the Smith-Waterman algorithm
803    pub fn align(&self, seq1: &str, seq2: &str) -> AlignmentResult {
804        let seq1_chars: Vec<char> = seq1.chars().collect();
805        let seq2_chars: Vec<char> = seq2.chars().collect();
806        let m = seq1_chars.len();
807        let n = seq2_chars.len();
808
809        // Initialize scoring matrix
810        let mut matrix = vec![vec![0; n + 1]; m + 1];
811        let mut max_score = 0;
812        let mut max_i = 0;
813        let mut max_j = 0;
814
815        // Fill the matrix
816        for i in 1..=m {
817            for j in 1..=n {
818                let match_mismatch = if seq1_chars[i - 1] == seq2_chars[j - 1] {
819                    matrix[i - 1][j - 1] + self.match_score
820                } else {
821                    matrix[i - 1][j - 1] + self.mismatch_penalty
822                };
823
824                let delete = matrix[i - 1][j] + self.gap_penalty;
825                let insert = matrix[i][j - 1] + self.gap_penalty;
826
827                matrix[i][j] = *[0, match_mismatch, delete, insert]
828                    .iter()
829                    .max()
830                    .expect("Operation failed");
831
832                // Track maximum score position
833                if matrix[i][j] > max_score {
834                    max_score = matrix[i][j];
835                    max_i = i;
836                    max_j = j;
837                }
838            }
839        }
840
841        // Traceback from the maximum score position
842        let mut aligned_seq1 = String::new();
843        let mut aligned_seq2 = String::new();
844        let mut i = max_i;
845        let mut j = max_j;
846
847        while i > 0 && j > 0 && matrix[i][j] > 0 {
848            let current_score = matrix[i][j];
849            let diagonal_score = if seq1_chars[i - 1] == seq2_chars[j - 1] {
850                matrix[i - 1][j - 1] + self.match_score
851            } else {
852                matrix[i - 1][j - 1] + self.mismatch_penalty
853            };
854
855            if current_score == diagonal_score {
856                aligned_seq1.insert(0, seq1_chars[i - 1]);
857                aligned_seq2.insert(0, seq2_chars[j - 1]);
858                i -= 1;
859                j -= 1;
860            } else if current_score == matrix[i - 1][j] + self.gap_penalty {
861                aligned_seq1.insert(0, seq1_chars[i - 1]);
862                aligned_seq2.insert(0, '-');
863                i -= 1;
864            } else if current_score == matrix[i][j - 1] + self.gap_penalty {
865                aligned_seq1.insert(0, '-');
866                aligned_seq2.insert(0, seq2_chars[j - 1]);
867                j -= 1;
868            } else {
869                // This shouldn't happen in correct implementation
870                break;
871            }
872        }
873
874        AlignmentResult {
875            aligned_seq1,
876            aligned_seq2,
877            score: max_score,
878        }
879    }
880}
881
882impl Default for SmithWaterman {
883    fn default() -> Self {
884        Self::new()
885    }
886}
887
888#[cfg(test)]
889mod tests {
890    use super::*;
891
892    #[test]
893    fn test_damerau_levenshtein() {
894        let metric = DamerauLevenshteinMetric::new();
895
896        // Basic operations
897        assert_eq!(metric.distance("", "").expect("Operation failed"), 0.0);
898        assert_eq!(metric.distance("abc", "").expect("Operation failed"), 3.0);
899        assert_eq!(metric.distance("", "abc").expect("Operation failed"), 3.0);
900        assert_eq!(
901            metric.distance("abc", "abc").expect("Operation failed"),
902            0.0
903        );
904
905        // Single operations
906        assert_eq!(
907            metric.distance("abc", "aXc").expect("Operation failed"),
908            1.0
909        ); // substitution
910        assert_eq!(metric.distance("abc", "ac").expect("Operation failed"), 1.0); // deletion
911        assert_eq!(metric.distance("ac", "abc").expect("Operation failed"), 1.0); // insertion
912        assert_eq!(
913            metric.distance("abc", "acb").expect("Operation failed"),
914            1.0
915        ); // transposition
916
917        // Multiple operations
918        assert_eq!(
919            metric
920                .distance("kitten", "sitting")
921                .expect("Operation failed"),
922            3.0
923        );
924
925        // Test normalized distance
926        assert!(
927            (metric
928                .normalized_distance("abc", "aXc")
929                .expect("Operation failed")
930                - 0.333)
931                .abs()
932                < 0.01
933        );
934        assert_eq!(
935            metric
936                .normalized_distance("", "")
937                .expect("Operation failed"),
938            0.0
939        );
940
941        // Test similarity
942        assert!((metric.similarity("abc", "aXc").expect("Operation failed") - 0.667).abs() < 0.01);
943    }
944
945    #[test]
946    fn test_restricted_damerau_levenshtein() {
947        let metric = DamerauLevenshteinMetric::restricted();
948
949        // OSA doesn't allow substring transpositions
950        assert_eq!(metric.distance("ca", "abc").expect("Operation failed"), 3.0);
951        // Not 2.0 as in full DL
952    }
953
954    #[test]
955    fn test_soundex() {
956        let soundex = Soundex::new();
957
958        assert_eq!(soundex.encode("Robert").expect("Operation failed"), "R163");
959        assert_eq!(soundex.encode("Rupert").expect("Operation failed"), "R163");
960        assert_eq!(soundex.encode("SOUNDEX").expect("Operation failed"), "S532");
961        assert_eq!(soundex.encode("Smith").expect("Operation failed"), "S530");
962        assert_eq!(soundex.encode("Smythe").expect("Operation failed"), "S530");
963        assert_eq!(soundex.encode("").expect("Operation failed"), "");
964        assert_eq!(soundex.encode("123").expect("Operation failed"), "");
965
966        // Test sounds_like
967        assert!(soundex
968            .sounds_like("Robert", "Rupert")
969            .expect("Operation failed"));
970        assert!(soundex
971            .sounds_like("Smith", "Smythe")
972            .expect("Operation failed"));
973        assert!(!soundex
974            .sounds_like("Smith", "Jones")
975            .expect("Operation failed"));
976
977        // Test custom length
978        let soundex_5 = Soundex::with_length(5);
979        assert_eq!(
980            soundex_5.encode("SOUNDEX").expect("Operation failed"),
981            "S5320"
982        );
983    }
984
985    #[test]
986    fn test_metaphone() {
987        let metaphone = Metaphone::new();
988
989        assert_eq!(
990            metaphone.encode("programming").expect("Operation failed"),
991            "PRKRMN"
992        );
993        assert_eq!(
994            metaphone.encode("programmer").expect("Operation failed"),
995            "PRKRMR"
996        );
997        assert_eq!(metaphone.encode("Wright").expect("Operation failed"), "RT");
998        assert_eq!(metaphone.encode("White").expect("Operation failed"), "WT");
999        assert_eq!(metaphone.encode("Knight").expect("Operation failed"), "NT");
1000        assert_eq!(metaphone.encode("").expect("Operation failed"), "");
1001        assert_eq!(metaphone.encode("123").expect("Operation failed"), "");
1002
1003        // Test sounds_like - "Wright" and "Write" actually should sound similar in Metaphone
1004        assert!(metaphone
1005            .sounds_like("Wright", "Write")
1006            .expect("Operation failed"));
1007        assert!(!metaphone
1008            .sounds_like("White", "Wright")
1009            .expect("Operation failed"));
1010
1011        // Test custom max length
1012        let metaphone_3 = Metaphone::with_max_length(3);
1013        assert_eq!(
1014            metaphone_3.encode("programming").expect("Operation failed"),
1015            "PRK"
1016        );
1017    }
1018
1019    #[test]
1020    fn test_phonetic_edge_cases() {
1021        let soundex = Soundex::new();
1022        let metaphone = Metaphone::new();
1023
1024        // Test with non-alphabetic characters
1025        assert_eq!(soundex.encode("O'Brien").expect("Operation failed"), "O165");
1026        assert_eq!(
1027            metaphone.encode("O'Brien").expect("Operation failed"),
1028            "OBRN"
1029        );
1030
1031        // Test with mixed case
1032        assert_eq!(
1033            soundex.encode("McDonald").expect("Operation failed"),
1034            "M235"
1035        );
1036        assert_eq!(
1037            metaphone.encode("McDonald").expect("Operation failed"),
1038            "MKTNLT"
1039        );
1040    }
1041
1042    #[test]
1043    fn test_nysiis() {
1044        let nysiis = Nysiis::new();
1045
1046        // Basic tests
1047        assert_eq!(
1048            nysiis.encode("Johnson").expect("Operation failed"),
1049            "JANSAN"
1050        );
1051        assert_eq!(
1052            nysiis.encode("Williams").expect("Operation failed"),
1053            "WALAN"
1054        ); // Standard NYSIIS code
1055        assert_eq!(nysiis.encode("Jones").expect("Operation failed"), "JAN");
1056        assert_eq!(nysiis.encode("Smith").expect("Operation failed"), "SNAT");
1057        assert_eq!(
1058            nysiis.encode("MacDonald").expect("Operation failed"),
1059            "MCDANALD"
1060        );
1061        assert_eq!(nysiis.encode("Knight").expect("Operation failed"), "NAGT");
1062        assert_eq!(nysiis.encode("").expect("Operation failed"), "");
1063        assert_eq!(nysiis.encode("123").expect("Operation failed"), "");
1064
1065        // Test sounds_like
1066        assert!(nysiis
1067            .sounds_like("Johnson", "Jonson")
1068            .expect("Operation failed"));
1069        assert!(!nysiis
1070            .sounds_like("Smith", "Jones")
1071            .expect("Operation failed"));
1072
1073        // Test edge cases
1074        assert_eq!(
1075            nysiis.encode("Philips").expect("Operation failed"),
1076            "FFALAP"
1077        );
1078        assert_eq!(nysiis.encode("Schmidt").expect("Operation failed"), "SSNAD");
1079        assert_eq!(
1080            nysiis.encode("Schneider").expect("Operation failed"),
1081            "SSNADAR"
1082        );
1083
1084        // Test with max length
1085        let nysiis_6 = Nysiis::with_max_length(6);
1086        assert_eq!(
1087            nysiis_6.encode("Williams").expect("Operation failed"),
1088            "WALAN"
1089        ); // 5 chars, so not truncated
1090        assert_eq!(
1091            nysiis_6.encode("MacDonald").expect("Operation failed"),
1092            "MCDANA"
1093        ); // 6 chars, truncated from longer
1094    }
1095
1096    #[test]
1097    fn test_needleman_wunsch() {
1098        let aligner = NeedlemanWunsch::new();
1099
1100        // Test simple alignment
1101        let result = aligner.align("GATTACA", "GCATGCU");
1102
1103        // Check that we get a valid optimal alignment with correct score
1104        assert_eq!(result.aligned_seq1, "G-ATTACA");
1105        assert_eq!(result.score, 0);
1106
1107        // Verify the alignment is valid by checking that all characters from seq2 are present
1108        let seq2_chars = result
1109            .aligned_seq2
1110            .chars()
1111            .filter(|&c| c != '-')
1112            .collect::<String>();
1113        assert_eq!(seq2_chars, "GCATGCU");
1114
1115        // Verify alignment length consistency
1116        assert_eq!(result.aligned_seq1.len(), result.aligned_seq2.len());
1117
1118        // Test identical sequences
1119        let result = aligner.align("HELLO", "HELLO");
1120        assert_eq!(result.aligned_seq1, "HELLO");
1121        assert_eq!(result.aligned_seq2, "HELLO");
1122        assert_eq!(result.score, 5);
1123
1124        // Test empty sequences
1125        let result = aligner.align("", "ABC");
1126        assert_eq!(result.aligned_seq1, "---");
1127        assert_eq!(result.aligned_seq2, "ABC");
1128        assert_eq!(result.score, -3);
1129
1130        let result = aligner.align("ABC", "");
1131        assert_eq!(result.aligned_seq1, "ABC");
1132        assert_eq!(result.aligned_seq2, "---");
1133        assert_eq!(result.score, -3);
1134
1135        // Test with custom scores
1136        let custom_aligner = NeedlemanWunsch::with_scores(2, -2, -1);
1137        let result = custom_aligner.align("CAT", "CART");
1138        assert!(result.score > 0);
1139    }
1140
1141    #[test]
1142    fn test_smith_waterman() {
1143        let aligner = SmithWaterman::new();
1144
1145        // Test local alignment
1146        let result = aligner.align("GGTTGACTA", "TGTTACGG");
1147        assert!(result.score > 0);
1148        assert!(result.aligned_seq1.contains("GTT"));
1149        assert!(result.aligned_seq2.contains("GTT"));
1150
1151        // Test with sequences having common substring
1152        let result = aligner.align("ABCDEFG", "XYZCDEFPQ");
1153        assert_eq!(result.aligned_seq1, "CDEF");
1154        assert_eq!(result.aligned_seq2, "CDEF");
1155        assert_eq!(result.score, 8); // 4 matches * 2
1156
1157        // Test empty sequences
1158        let result = aligner.align("", "ABC");
1159        assert_eq!(result.aligned_seq1, "");
1160        assert_eq!(result.aligned_seq2, "");
1161        assert_eq!(result.score, 0);
1162
1163        // Test no common subsequence
1164        let result = aligner.align("AAA", "BBB");
1165        assert_eq!(result.score, 0);
1166
1167        // Test with custom scores
1168        let custom_aligner = SmithWaterman::with_scores(3, -3, -2);
1169        let result = custom_aligner.align("ACACACTA", "AGCACACA");
1170        assert!(result.score > 0);
1171    }
1172}